Merge pull request #2603 from wmbrownIntel/gpu-updateFeb2021
GPU Package Update February 2021
@@ -1,7 +1,9 @@
 set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
 set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
                 ${GPU_SOURCES_DIR}/fix_gpu.h
-                ${GPU_SOURCES_DIR}/fix_gpu.cpp)
+                ${GPU_SOURCES_DIR}/fix_gpu.cpp
+                ${GPU_SOURCES_DIR}/fix_nh_gpu.h
+                ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp)
 target_compile_definitions(lammps PRIVATE -DLMP_GPU)

 set(GPU_API "opencl" CACHE STRING "API used by GPU package")
@@ -155,11 +157,6 @@ elseif(GPU_API STREQUAL "OPENCL")
   else()
     find_package(OpenCL REQUIRED)
   endif()
-  set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
-  set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
-  set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
-  validate_option(OCL_TUNE OCL_TUNE_VALUES)
-  string(TOUPPER ${OCL_TUNE} OCL_TUNE)

   include(OpenCLUtils)
   set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
@@ -203,7 +200,7 @@ elseif(GPU_API STREQUAL "OPENCL")
   add_library(gpu STATIC ${GPU_LIB_SOURCES})
   target_link_libraries(gpu PRIVATE OpenCL::OpenCL)
   target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu)
-  target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
+  target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT)
   target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)

 target_link_libraries(lammps PRIVATE gpu)

@@ -120,8 +120,6 @@ CMake build
 -D GPU_API=value      # value = opencl (default) or cuda or hip
 -D GPU_PREC=value     # precision setting
                       # value = double or mixed (default) or single
--D OCL_TUNE=value     # hardware choice for GPU_API=opencl
-                      # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
 -D GPU_ARCH=value     # primary GPU hardware choice for GPU_API=cuda
                       # value = sm_XX, see below
                       # default is sm_50

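As an illustration, a CMake configuration using the updated options might
look like this (a sketch; the build directory layout and the PKG_GPU
package switch follow the standard LAMMPS CMake build procedure):

.. code-block:: bash

   mkdir build && cd build
   cmake ../cmake -D PKG_GPU=on -D GPU_API=opencl -D GPU_PREC=mixed
   cmake --build . --parallel
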
@@ -114,7 +114,7 @@ OPT.
 * :doc:`nph/eff <fix_nh_eff>`
 * :doc:`nph/sphere (o) <fix_nph_sphere>`
 * :doc:`nphug <fix_nphug>`
-* :doc:`npt (iko) <fix_nh>`
+* :doc:`npt (giko) <fix_nh>`
 * :doc:`npt/asphere (o) <fix_npt_asphere>`
 * :doc:`npt/body <fix_npt_body>`
 * :doc:`npt/cauchy <fix_npt_cauchy>`
@@ -122,8 +122,8 @@ OPT.
 * :doc:`npt/sphere (o) <fix_npt_sphere>`
 * :doc:`npt/uef <fix_nh_uef>`
 * :doc:`numdiff <fix_numdiff>`
-* :doc:`nve (iko) <fix_nve>`
-* :doc:`nve/asphere (i) <fix_nve_asphere>`
+* :doc:`nve (giko) <fix_nve>`
+* :doc:`nve/asphere (gi) <fix_nve_asphere>`
 * :doc:`nve/asphere/noforce <fix_nve_asphere_noforce>`
 * :doc:`nve/awpmd <fix_nve_awpmd>`
 * :doc:`nve/body <fix_nve_body>`
@@ -138,7 +138,7 @@ OPT.
 * :doc:`nve/spin <fix_nve_spin>`
 * :doc:`nve/tri <fix_nve_tri>`
 * :doc:`nvk <fix_nvk>`
-* :doc:`nvt (iko) <fix_nh>`
+* :doc:`nvt (giko) <fix_nh>`
 * :doc:`nvt/asphere (o) <fix_nvt_asphere>`
 * :doc:`nvt/body <fix_nvt_body>`
 * :doc:`nvt/eff <fix_nh_eff>`

@@ -122,7 +122,7 @@ OPT.
 * :doc:`lebedeva/z <pair_lebedeva_z>`
 * :doc:`lennard/mdf <pair_mdf>`
 * :doc:`line/lj <pair_line_lj>`
-* :doc:`lj/charmm/coul/charmm (iko) <pair_charmm>`
+* :doc:`lj/charmm/coul/charmm (giko) <pair_charmm>`
 * :doc:`lj/charmm/coul/charmm/implicit (ko) <pair_charmm>`
 * :doc:`lj/charmm/coul/long (gikot) <pair_charmm>`
 * :doc:`lj/charmm/coul/long/soft (o) <pair_fep_soft>`

@@ -1,11 +1,14 @@
 GPU package
 ===========

-The GPU package was developed by Mike Brown while at SNL and ORNL
-and his collaborators, particularly Trung Nguyen (now at Northwestern).
-It provides GPU versions of many pair styles and for parts of the
-:doc:`kspace_style pppm <kspace_style>` for long-range Coulombics.
-It has the following general features:
+The GPU package was developed by Mike Brown while at SNL and ORNL (now
+at Intel Corp.) and his collaborators, particularly Trung Nguyen (now at
+Northwestern). Support for AMD GPUs via HIP was added by Vsevolod Nikolskiy
+and coworkers at HSE University.
+
+The GPU package provides GPU versions of many pair styles and for
+parts of the :doc:`kspace_style pppm <kspace_style>` for long-range
+Coulombics. It has the following general features:

 * It is designed to exploit common GPU hardware configurations where one
   or more GPUs are coupled to many cores of one or more multi-core CPUs,
@@ -24,8 +27,9 @@ It has the following general features:
   force vectors.
 * LAMMPS-specific code is in the GPU package. It makes calls to a
   generic GPU library in the lib/gpu directory. This library provides
-  NVIDIA support as well as more general OpenCL support, so that the
-  same functionality is supported on a variety of hardware.
+  either Nvidia support, AMD support, or more general OpenCL support
+  (for Nvidia GPUs, AMD GPUs, Intel GPUs, and multi-core CPUs),
+  so that the same functionality is supported on a variety of hardware.

 **Required hardware/software:**

@@ -45,12 +49,23 @@ to have the OpenCL headers and the (vendor neutral) OpenCL library installed.
 In OpenCL mode, the acceleration depends on having an `OpenCL Installable Client Driver (ICD) <https://www.khronos.org/news/permalink/opencl-installable-client-driver-icd-loader>`_
 installed. There can be multiple of them for the same or different hardware
 (GPUs, CPUs, Accelerators) installed at the same time. OpenCL refers to those
-as 'platforms'. The GPU library will select the **first** suitable platform,
-but this can be overridden using the device option of the :doc:`package <package>`
+as 'platforms'. The GPU library will try to auto-select the best suited platform,
+but this can be overridden using the platform option of the :doc:`package <package>`
 command. Run lammps/lib/gpu/ocl_get_devices to get a list of available
 platforms and devices with a suitable ICD available.

-To compute and use this package in HIP mode, you have to have the AMD ROCm
+To compile and use this package for Intel GPUs, OpenCL or the Intel oneAPI
+HPC Toolkit can be installed using Linux package managers. The latter also
+provides optimized C++, MPI, and many other libraries and tools. See:
+
+* https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit/download.html
+
+If you do not have a discrete GPU card installed, this package can still provide
+significant speedups on some CPUs that include integrated GPUs. Additionally, for
+many Macs, OpenCL is already included with the OS and Makefiles are available
+in the lib/gpu directory.
+
+To compile and use this package in HIP mode, you have to have the AMD ROCm
 software installed. Versions of ROCm older than 3.5 are currently deprecated
 by AMD.

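The platform and device listing mentioned above can be produced from the
shell after the GPU library has been built:

.. code-block:: bash

   ./lib/gpu/ocl_get_devices    # run from the top-level LAMMPS directory
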
@@ -75,10 +90,20 @@ automatically if you create more MPI tasks/node than there are
 GPUs/node. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be
 shared by 4 MPI tasks.

+The GPU package also has limited support for OpenMP for both
+multi-threading and vectorization of routines that are run on the CPUs.
+This requires that the GPU library and LAMMPS are built with flags to
+enable OpenMP support (e.g. -fopenmp). Some styles for time integration
+are also available in the GPU package. These run completely on the CPUs
+in full double precision, but exploit multi-threading and vectorization
+for faster performance.
+
 Use the "-sf gpu" :doc:`command-line switch <Run_options>`, which will
 automatically append "gpu" to styles that support it. Use the "-pk
 gpu Ng" :doc:`command-line switch <Run_options>` to set Ng = # of
-GPUs/node to use.
+GPUs/node to use. If Ng is 0, the number is selected automatically as
+the number of matching GPUs that have the highest number of compute
+cores.

 .. code-block:: bash

@@ -87,8 +112,8 @@ GPUs/node to use.
 mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes

 Note that if the "-sf gpu" switch is used, it also issues a default
-:doc:`package gpu 1 <package>` command, which sets the number of
-GPUs/node to 1.
+:doc:`package gpu 0 <package>` command, which will result in
+automatic selection of the number of GPUs to use.

 Using the "-pk" switch explicitly allows for setting of the number of
 GPUs/node to use and additional options. Its syntax is the same as
@@ -138,6 +163,13 @@ Likewise, you should experiment with the precision setting for the GPU
 library to see if single or mixed precision will give accurate
 results, since they will typically be faster.

+MPI parallelism typically outperforms OpenMP parallelism, but in some
+cases using fewer MPI tasks and multiple OpenMP threads with the GPU
+package can give better performance. 3-body potentials can often perform
+better with multiple OMP threads because the inter-process communication
+is higher for these styles with the GPU package in order to allow
+deterministic results.
+
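For example (a sketch; task and thread counts are illustrative):

.. code-block:: bash

   export OMP_NUM_THREADS=1
   mpirun -np 16 lmp_machine -sf gpu -pk gpu 2 -in in.script   # MPI tasks only
   export OMP_NUM_THREADS=4
   mpirun -np 4 lmp_machine -sf gpu -pk gpu 2 -in in.script    # fewer tasks, more threads
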
 **Guidelines for best performance:**

 * Using multiple MPI tasks per GPU will often give the best performance,
@@ -161,6 +193,12 @@ results, since they will typically be faster.
   :doc:`angle <angle_style>`, :doc:`dihedral <dihedral_style>`,
   :doc:`improper <improper_style>`, and :doc:`long-range <kspace_style>`
   calculations will not be included in the "Pair" time.
+* Since only part of the pppm kspace style is GPU accelerated, it
+  may be faster to only use GPU acceleration for Pair styles with
+  long-range electrostatics. See the "pair/only" keyword of the
+  package command for a shortcut to do that. The work between kspace
+  on the CPU and non-bonded interactions on the GPU can be balanced
+  through adjusting the coulomb cutoff without loss of accuracy.
 * When the *mode* setting for the package gpu command is force/neigh,
   the time for neighbor list calculations on the GPU will be added into
   the "Pair" time, not the "Neigh" time. An additional breakdown of the

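The "pair/only" shortcut from the new guideline above can be enabled from
the command line, for example (a sketch):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 pair/only on -in in.script
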
@@ -16,7 +16,7 @@ These are the accelerator packages currently in LAMMPS, either as
 standard or user packages:

 +-----------------------------------------+-------------------------------------------------------+
-| :doc:`GPU Package <Speed_gpu>`          | for NVIDIA GPUs as well as OpenCL support             |
+| :doc:`GPU Package <Speed_gpu>`          | for GPUs via CUDA, OpenCL, or ROCm HIP                |
 +-----------------------------------------+-------------------------------------------------------+
 | :doc:`USER-INTEL Package <Speed_intel>` | for Intel CPUs and Intel Xeon Phi                     |
 +-----------------------------------------+-------------------------------------------------------+
@@ -43,7 +43,7 @@ three kinds of hardware, via the listed packages:
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
 | Many-core CPUs  | :doc:`USER-INTEL <Speed_intel>`, :doc:`KOKKOS <Speed_kokkos>`, :doc:`USER-OMP <Speed_omp>`, :doc:`OPT <Speed_opt>` packages |
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
-| NVIDIA/AMD GPUs | :doc:`GPU <Speed_gpu>`, :doc:`KOKKOS <Speed_kokkos>` packages |
+| GPUs            | :doc:`GPU <Speed_gpu>`, :doc:`KOKKOS <Speed_kokkos>` packages |
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
 | Intel Phi/AVX   | :doc:`USER-INTEL <Speed_intel>`, :doc:`KOKKOS <Speed_kokkos>` packages |
 +-----------------+-----------------------------------------------------------------------------------------------------------------------------+
@@ -154,8 +154,8 @@ Here is a brief summary of what the various packages provide. Details
 are in the individual accelerator sections.

 * Styles with a "gpu" suffix are part of the GPU package and can be run
-  on NVIDIA or AMD GPUs. The speed-up on a GPU depends on a variety of
-  factors, discussed in the accelerator sections.
+  on Intel, NVIDIA, or AMD GPUs. The speed-up on a GPU depends on a
+  variety of factors, discussed in the accelerator sections.
 * Styles with an "intel" suffix are part of the USER-INTEL
   package. These styles support vectorized single and mixed precision
   calculations, in addition to full double precision. In extreme cases,

@@ -1,8 +1,10 @@
 .. index:: fix nvt
+.. index:: fix nvt/gpu
 .. index:: fix nvt/intel
 .. index:: fix nvt/kk
 .. index:: fix nvt/omp
 .. index:: fix npt
+.. index:: fix npt/gpu
 .. index:: fix npt/intel
 .. index:: fix npt/kk
 .. index:: fix npt/omp
@@ -13,12 +15,12 @@
 fix nvt command
 ===============

-Accelerator Variants: *nvt/intel*, *nvt/kk*, *nvt/omp*
+Accelerator Variants: *nvt/gpu*, *nvt/intel*, *nvt/kk*, *nvt/omp*

 fix npt command
 ===============

-Accelerator Variants: *npt/intel*, *npt/kk*, *npt/omp*
+Accelerator Variants: *npt/gpu*, *npt/intel*, *npt/kk*, *npt/omp*

 fix nph command
 ===============

@@ -1,4 +1,5 @@
 .. index:: fix nve
+.. index:: fix nve/gpu
 .. index:: fix nve/intel
 .. index:: fix nve/kk
 .. index:: fix nve/omp
@@ -6,7 +7,7 @@
 fix nve command
 ===============

-Accelerator Variants: *nve/intel*, *nve/kk*, *nve/omp*
+Accelerator Variants: *nve/gpu*, *nve/intel*, *nve/kk*, *nve/omp*

 Syntax
 """"""

@@ -1,10 +1,11 @@
 .. index:: fix nve/asphere
+.. index:: fix nve/asphere/gpu
 .. index:: fix nve/asphere/intel

 fix nve/asphere command
 =======================

-Accelerator Variants: *nve/asphere/intel*
+Accelerator Variants: *nve/asphere/gpu*, *nve/asphere/intel*

 Syntax
 """"""

@@ -18,7 +18,7 @@ Syntax
 *gpu* args = Ngpu keyword value ...
   Ngpu = # of GPUs per node
   zero or more keyword/value pairs may be appended
-  keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *device* or *blocksize*
+  keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *blocksize* or *platform* or *device_type* or *ocl_args*
   *neigh* value = *yes* or *no*
     yes = neighbor list build on GPU (default)
     no = neighbor list build on CPU
@@ -32,17 +32,18 @@ Syntax
     size = bin size for neighbor list construction (distance units)
   *split* = fraction
     fraction = fraction of atoms assigned to GPU (default = 1.0)
-  *gpuID* values = first last
-    first = ID of first GPU to be used on each node
-    last = ID of last GPU to be used on each node
   *tpa* value = Nthreads
-    Nthreads = # of GPU threads used per atom
-  *device* value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
-    platform_id = numerical OpenCL platform id (default: -1)
-    device_type = *kepler* or *fermi* or *cypress* or *intel* or *phi* or *generic* or *custom*
-    val1,val2,... = custom OpenCL tune parameters (see below for details)
+    Nthreads = # of GPU vector lanes used per atom
   *blocksize* value = size
     size = thread block size for pair force computation
+  *platform* value = id
+    id = For OpenCL, platform ID for the GPU or accelerator
+  *gpuID* values = id
+    id = ID of first GPU to be used on each node
+  *device_type* value = *intelgpu* or *nvidiagpu* or *amdgpu* or *applegpu* or *generic* or *custom,val1,val2,...*
+    val1,val2,... = custom OpenCL accelerator configuration parameters (see below for details)
+  *ocl_args* value = args
+    args = List of additional OpenCL compiler arguments delimited by colons
 *intel* args = NPhi keyword value ...
   Nphi = # of co-processors per node
   zero or more keyword/value pairs may be appended
@@ -112,12 +113,10 @@ Examples

 .. code-block:: LAMMPS

-   package gpu 1
+   package gpu 0
    package gpu 1 split 0.75
    package gpu 2 split -1.0
-   package gpu 1 device kepler
-   package gpu 1 device 2:generic
-   package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
+   package gpu 0 device_type intelgpu
    package kokkos neigh half comm device
    package omp 0 neigh no
    package omp 4
@@ -174,10 +173,18 @@ simulations.
 The *gpu* style invokes settings associated with the use of the GPU
 package.

-The *Ngpu* argument sets the number of GPUs per node. There must be
-at least as many MPI tasks per node as GPUs, as set by the mpirun or
-mpiexec command. If there are more MPI tasks (per node)
-than GPUs, multiple MPI tasks will share each GPU.
+The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0
+and no other keywords are specified, GPU or accelerator devices are
+auto-selected. In this process, all platforms are searched for
+accelerator devices and GPUs are chosen if available. The device with
+the highest number of compute cores is selected. The number of devices
+is increased to be the number of matching accelerators with the same
+number of compute cores. If there are more devices than MPI tasks,
+the additional devices will be unused. The auto-selection of GPUs/
+accelerator devices and platforms can be restricted by specifying
+a non-zero value for *Ngpu* and / or using the *gpuID*, *platform*,
+and *device_type* keywords as described below. If there are more MPI
+tasks (per node) than GPUs, multiple MPI tasks will share each GPU.

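For example, to rely on the auto-selection described above (a sketch):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 -in in.script   # Ngpu = 0 auto-selects
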
 Optional keyword/value pairs can also be specified. Each has a
 default value as listed below.
@@ -212,18 +219,8 @@ overlapped with all other computations on the CPU.

 The *binsize* keyword sets the size of bins used to bin atoms in
 neighbor list builds performed on the GPU, if *neigh* = *yes* is set.
-If *binsize* is set to 0.0 (the default), then bins = the size of the
-pairwise cutoff + neighbor skin distance. This is 2x larger than the
-LAMMPS default used for neighbor list building on the CPU. This will
-be close to optimal for the GPU, so you do not normally need to use
-this keyword. Note that if you use a longer-than-usual pairwise
-cutoff, e.g. to allow for a smaller fraction of KSpace work with a
-:doc:`long-range Coulombic solver <kspace_style>` because the GPU is
-faster at performing pairwise interactions, then it may be optimal to
-make the *binsize* smaller than the default. For example, with a
-cutoff of 20\*sigma in LJ :doc:`units <units>` and a neighbor skin
-distance of sigma, a *binsize* = 5.25\*sigma can be more efficient than
-the default.
+If *binsize* is set to 0.0 (the default), then the binsize is set
+automatically using heuristics in the GPU package.

 The *split* keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < *split* <
@@ -257,63 +254,71 @@ cores would perform force calculations for some fraction of the
 particles at the same time the GPUs performed force calculation for
 the other particles.

-The *gpuID* keyword allows selection of which GPUs on each node will
-be used for a simulation. The *first* and *last* values specify the
-GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
-Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
-of physical GPUs. If you only wish to use a subset, set Ngpu to a
-smaller number and first/last to a sub-range of the available GPUs.
+The *gpuID* keyword is used to specify the first ID for the GPU or
+other accelerator that LAMMPS will use. For example, if the ID is
+1 and *Ngpu* is 3, GPUs 1-3 will be used. Device IDs should be
+determined from the output of nvc_get_devices, ocl_get_devices,
+or hip_get_devices, as provided in the lib/gpu directory. When using
+OpenCL with accelerators that have main memory NUMA, the accelerators
+can be split into smaller virtual accelerators for more efficient use
+with MPI.

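For example, to use GPUs 1-3 on each node, per the description above
(a sketch):

.. code-block:: bash

   mpirun -np 12 lmp_machine -sf gpu -pk gpu 3 gpuID 1 -in in.script
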
-The *tpa* keyword sets the number of GPU thread per atom used to
+The *tpa* keyword sets the number of GPU vector lanes per atom used to
 perform force calculations. With a default value of 1, the number of
 threads will be chosen based on the pair style, however, the value can
 be set explicitly with this keyword to fine-tune performance. For
 large cutoffs or with a small number of particles per GPU, increasing
 the value can improve performance. The number of threads per atom must
-be a power of 2 and currently cannot be greater than 32.
-
-The *device* keyword can be used to tune parameters optimized for a
-specific accelerator and platform when using OpenCL. OpenCL supports
-the concept of a **platform**\ , which represents one or more devices that
-share the same driver (e.g. there would be a different platform for
-GPUs from different vendors or for CPU based accelerator support).
-In LAMMPS only one platform can be active at a time and by default
-the first platform with an accelerator is selected. This is equivalent
-to using a platform ID of -1. The platform ID is a number corresponding
-to the output of the ocl_get_devices tool. The platform ID is passed
-to the GPU library, by prefixing the *device* keyword with that number
-separated by a colon. For CUDA, the *device* keyword is ignored.
-Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
-Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
-More devices may be added later. The default device type can be
-specified when building LAMMPS with the GPU library, via setting a
-variable in the lib/gpu/Makefile that is used.
-
-In addition, a device type *custom* is available, which is followed by
-13 comma separated numbers, which allows to set those tweakable parameters
-from the package command. It can be combined with the (colon separated)
-platform id. The individual settings are:
-
-* MEM_THREADS
-* THREADS_PER_ATOM
-* THREADS_PER_CHARGE
-* BLOCK_PAIR
-* MAX_SHARED_TYPES
-* BLOCK_NBOR_BUILD
-* BLOCK_BIO_PAIR
-* BLOCK_ELLIPSE
-* WARP_SIZE
-* PPPM_BLOCK_1D
-* BLOCK_CELL_2D
-* BLOCK_CELL_ID
-* MAX_BIO_SHARED_TYPES
+be a power of 2 and currently cannot be greater than the SIMD width
+for the GPU / accelerator. In the case it exceeds the SIMD width, it
+will automatically be decreased to meet the restriction.

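For example, to request 4 vector lanes per atom (an illustrative value;
it must be a power of 2):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 tpa 4 -in in.script
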
 The *blocksize* keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
 and its maximum depends on the specific GPU hardware. Typical choices
 are 64, 128, or 256. A larger block size increases occupancy of
 individual GPU cores, but reduces the total number of thread blocks,
-thus may lead to load imbalance.
+thus may lead to load imbalance. On modern hardware, the sensitivity
+to the blocksize is typically low.

+The *platform* keyword is only used with OpenCL to specify the ID for
+an OpenCL platform. See the output from ocl_get_devices in the lib/gpu
+directory. In LAMMPS only one platform can be active at a time and by
+default (id=-1) the platform is auto-selected to find the GPU with the
+most compute cores. When *Ngpu* or other keywords are specified, the
+auto-selection is appropriately restricted. For example, if *Ngpu* is
+3, only platforms with at least 3 accelerators are considered. Similar
+restrictions can be enforced by the *gpuID* and *device_type* keywords.

+The *device_type* keyword can be used for OpenCL to specify the type of
+GPU to use or specify a custom configuration for an accelerator. In most
+cases this selection will be automatic and there is no need to use the
+keyword. The *applegpu* type is not specific to a particular GPU vendor,
+but is separate due to the more restrictive Apple OpenCL implementation.
+For expert users, a custom configuration can be specified with the
+*custom* keyword followed by these parameters:

+CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
+THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
+BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
+BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
+PPPM_MAX_SPLINE.

+CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
+(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
+vector operations. FAST_MATH in {0,1} indicates that OpenCL fast math
+optimizations are used during the build and hardware-accelerated
+transcendental functions are used when available. THREADS_PER_* give the
+default *tpa* values for ellipsoidal models, styles using charge, and
+any other styles. The BLOCK_* parameters specify the block sizes for
+various kernel calls and the MAX_*SHARED* parameters are used to
+determine the amount of local shared memory to use for storing model
+parameters.

+For OpenCL, the routines are compiled at runtime for the specified GPU
+or accelerator architecture. The *ocl_args* keyword can be used to
+specify additional flags for the runtime build.

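For example (sketches; the extra OpenCL compiler flag shown is
illustrative, not taken from this PR):

.. code-block:: bash

   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 device_type intelgpu -in in.script
   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 ocl_args -cl-std=CL2.0 -in in.script
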
----------

@@ -658,9 +663,9 @@ Related commands
 Default
 """""""

-For the GPU package, the default is Ngpu = 1 and the option defaults
+For the GPU package, the default is Ngpu = 0 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and device = not used. These settings are made
+to Ngpu-1, tpa = 1, and platform = -1. These settings are made
 automatically if the "-sf gpu" :doc:`command-line switch <Run_options>`
 is used. If it is not used, you must invoke the package gpu command
 in your input script or via the "-pk gpu" :doc:`command-line switch <Run_options>`.

@@ -1,4 +1,5 @@
 .. index:: pair_style lj/charmm/coul/charmm
+.. index:: pair_style lj/charmm/coul/charmm/gpu
 .. index:: pair_style lj/charmm/coul/charmm/intel
 .. index:: pair_style lj/charmm/coul/charmm/kk
 .. index:: pair_style lj/charmm/coul/charmm/omp
@@ -19,7 +20,7 @@
 pair_style lj/charmm/coul/charmm command
 ========================================

-Accelerator Variants: *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp*
+Accelerator Variants: *lj/charmm/coul/charmm/gpu*, *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp*

 pair_style lj/charmm/coul/charmm/implicit command
 =================================================
@@ -2297,6 +2297,7 @@ omegaz
 Omelyan
 omp
 OMP
+oneAPI
 onelevel
 oneway
 onn
@@ -2528,6 +2529,7 @@ ptm
 PTM
 ptol
 ptr
+PTX
 pu
 purdue
 Purohit

@@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c

 # host code compiler and settings

-CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
+CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
 CUDR_OPTS = -O2 $(LMP_INC)
 CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
        $(CUDPP_OPT)

@@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG
 HIP_PRECISION = -D_SINGLE_DOUBLE

 HIP_OPTS = -O3
-HIP_HOST_OPTS = -Wno-deprecated-declarations
+HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp
 HIP_HOST_INCLUDE =

 # use device sort

@@ -1,5 +1,5 @@
 # Settings that the LAMMPS build will import when this package library is used

-gpu_SYSINC =
+gpu_SYSINC = -DFFT_SINGLE
 gpu_SYSLIB = -framework OpenCL
 gpu_SYSPATH =

@@ -1,25 +1,21 @@
 # /* ----------------------------------------------------------------------
-#                  Generic Linux Makefile for OpenCL
+#          Generic Linux Makefile for OpenCL - Mixed precision
 # ------------------------------------------------------------------------- */

 # which file will be copied to Makefile.lammps

 EXTRAMAKE = Makefile.lammps.opencl

-# OCL_TUNE = -DFERMI_OCL    # -- Uncomment for NVIDIA Fermi
-# OCL_TUNE = -DKEPLER_OCL   # -- Uncomment for NVIDIA Kepler
-# OCL_TUNE = -DCYPRESS_OCL  # -- Uncomment for AMD Cypress
-OCL_TUNE = -DGENERIC_OCL    # -- Uncomment for generic device
-
 # this setting should match LAMMPS Makefile
 # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL

 LMP_INC = -DLAMMPS_SMALLBIG

-OCL_INC = -I/usr/local/cuda/include # Path to CL directory
-OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -std=c++11
-OCL_LINK = -L/usr/local/cuda/lib64 -lOpenCL
+OCL_INC =
+OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
+OCL_LINK = -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -fopenmp -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT

 BIN_DIR = ./
 OBJ_DIR = ./
@@ -28,4 +24,3 @@ AR = ar
 BSH = /bin/sh

 include Opencl.makefile
-

@@ -1,19 +1,17 @@
 # /* ----------------------------------------------------------------------
-#                   Generic Mac Makefile for OpenCL
+#     Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE
 # ------------------------------------------------------------------------- */

 # which file will be copied to Makefile.lammps

 EXTRAMAKE = Makefile.lammps.mac_ocl

-OCL_TUNE = -DFERMI_OCL      # -- Uncomment for NVIDIA Fermi
-# OCL_TUNE = -DKEPLER_OCL   # -- Uncomment for NVIDIA Kepler
-# OCL_TUNE = -DCYPRESS_OCL  # -- Uncomment for AMD Cypress
-# OCL_TUNE = -DGENERIC_OCL  # -- Uncomment for generic device
+LMP_INC = -DLAMMPS_SMALLBIG

-OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT
+OCL_CPP = clang++ -std=c++11 -O3 -I../../src/STUBS
 OCL_LINK = -framework OpenCL
+OCL_PREC = -D_SINGLE_SINGLE
+OCL_TUNE = -DUCL_NO_EXIT

 BIN_DIR = ./
 OBJ_DIR = ./

lib/gpu/Makefile.mac_opencl_mpi (new file, 23 lines)
@@ -0,0 +1,23 @@
+# /* ----------------------------------------------------------------------
+#     Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.mac_ocl
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_CPP = mpicxx -std=c++11 -O3 -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
+OCL_LINK = -framework OpenCL
+OCL_PREC = -D_SINGLE_SINGLE
+OCL_TUNE = -DUCL_NO_EXIT -DMPI_GERYON
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile

lib/gpu/Makefile.oneapi (new file, 26 lines)
@@ -0,0 +1,26 @@
+# /* ----------------------------------------------------------------------
+#                  Generic Linux Makefile for OpenCL
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.opencl
+
+# this setting should match LAMMPS Makefile
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+OCL_INC =
+OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
+OCL_LINK = -lOpenCL
+OCL_PREC = -D_SINGLE_DOUBLE
+OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+include Opencl.makefile
@@ -1,92 +0,0 @@
-# /* ----------------------------------------------------------------------
-#                  Generic Linux Makefile for OpenCL
-# ------------------------------------------------------------------------- */
-
-# which file will be copied to Makefile.lammps
-
-EXTRAMAKE = Makefile.lammps.opencl
-
-# this setting should match LAMMPS Makefile
-# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL
-
-LMP_INC = -DLAMMPS_SMALLBIG
-
-# precision for GPU calculations
-# -D_SINGLE_SINGLE  # Single precision for all calculations
-# -D_DOUBLE_DOUBLE  # Double precision for all calculations
-# -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double
-
-OCL_PREC = -D_SINGLE_DOUBLE
-
-BIN_DIR = ./
-OBJ_DIR = ./
-LIB_DIR = ./
-AR = ar
-BSH = /bin/sh
-
-# Compiler and linker settings
-
-# OCL_TUNE = -DFERMI_OCL    # -- Uncomment for NVIDIA Fermi
-# OCL_TUNE = -DKEPLER_OCL   # -- Uncomment for NVIDIA Kepler
-# OCL_TUNE = -DCYPRESS_OCL  # -- Uncomment for AMD Cypress
-OCL_TUNE = -DGENERIC_OCL    # -- Uncomment for generic device
-
-OCL_INC = -I/usr/local/cuda/include # Path to CL directory
-OCL_CPP = mpic++ $(DEFAULT_DEVICE) -g -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC)
-OCL_LINK = -lOpenCL
-OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL
-
-# Headers for Geryon
-UCL_H = $(wildcard ./geryon/ucl*.h)
-OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h
-PRE1_H = lal_preprocessor.h lal_aux_fun1.h
-ALL_H = $(OCL_H) $(wildcard ./lal_*.h)
-
-# Source files
-SRCS := $(wildcard ./lal_*.cpp)
-OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o))
-CUS := $(wildcard lal_*.cu)
-KERS := $(subst ./,$(OBJ_DIR)/,$(CUS:lal_%.cu=%_cl.h))
-KERS := $(addprefix $(OBJ_DIR)/, $(KERS))
-
-# targets
-
-GPU_LIB = $(LIB_DIR)/libgpu.a
-
-EXECS = $(BIN_DIR)/ocl_get_devices
-
-all: $(OBJ_DIR) $(KERS) $(GPU_LIB) $(EXECS)
-
-$(OBJ_DIR):
-	mkdir -p $@
-
-# device code compilation
-
-$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
-	$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;
-
-# host code compilation
-
-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS)
-	$(OCL) -o $@ -c $< -I$(OBJ_DIR)
-
-# build libgpu.a
-
-$(GPU_LIB): $(OBJS)
-	$(AR) -crusv $(GPU_LIB) $(OBJS)
-	@cp $(EXTRAMAKE) Makefile.lammps
-
-# test app for querying device info
-
-$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H)
-	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK)
-
-clean:
-	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo
-
-veryclean: clean
-	-rm -rf *~ *.linkinfo
-
-cleanlib:
-	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo

@@ -1,6 +1,7 @@
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
-NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h
+NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \
+        lal_pre_cuda_hip.h
 ALL_H = $(NVD_H) $(wildcard ./lal_*.h)

 # Source files
@@ -39,17 +40,21 @@ BIN2C = $(CUDA_HOME)/bin/bin2c

 # device code compilation

-$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
+                         lal_pre_cuda_hip.h
 	$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu

 $(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
 	$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h
+	rm $(OBJ_DIR)/pppm_f.cubin

-$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \
+                         lal_pre_cuda_hip.h
 	$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu

 $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
 	$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
+	rm $(OBJ_DIR)/pppm_d.cubin

 $(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H)
 	$(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu
@@ -93,7 +98,7 @@ $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda

 clean:
-	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.linkinfo
+	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.cubin *.linkinfo

 veryclean: clean
 	-rm -rf *~ *.linkinfo

@@ -1,8 +1,15 @@
+# Common headers for kernels
+PRE1_H = lal_preprocessor.h lal_aux_fun1.h
+
 # Headers for Geryon
 UCL_H = $(wildcard ./geryon/ucl*.h)
-OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h
-PRE1_H = lal_preprocessor.h lal_aux_fun1.h
 ALL_H = $(OCL_H) $(wildcard ./lal_*.h)
+OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h
+
+# Headers for Host files
+HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \
+         lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \
+         lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \
+         lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H)

 # Source files
 SRCS := $(wildcard ./lal_*.cpp)
@@ -28,12 +35,75 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL

 # device code compilation

+$(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h
+
+$(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h
+
+$(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h
+
+$(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h
+
+$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h;
+
+$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h
+	$(BSH) ./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h
+
+$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h;
+
+$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h;
+
+$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h;
+
+$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu $(PRE1_H) lal_ellipsoid_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h;
+
+$(OBJ_DIR)/tersoff_cl.h: lal_tersoff.cu $(PRE1_H) lal_tersoff_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh tersoff $(PRE1_H) lal_tersoff_extra.h lal_tersoff.cu $(OBJ_DIR)/tersoff_cl.h;
+
+$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h;
+
+$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h
+	$(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h;
+
 $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H)
 	$(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@;

 # host code compilation

-$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS)
+$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H)
+	$(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H)
+	$(OCL) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H)
+	$(OCL) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H)
+	$(OCL) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H)
+	$(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H)
+	$(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H)
+	$(OCL) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H)
+	$(OCL) -o $@ -c $< -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cl.h $(HOST_H)
+	$(OCL) -o $@ -c $< -I$(OBJ_DIR)

 $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H)

lib/gpu/README (359 lines)
@@ -4,18 +4,109 @@

 W. Michael Brown (ORNL)
 Trung Dac Nguyen (ORNL/Northwestern)
-Peng Wang (NVIDIA)
+Nitin Dhamankar (Intel)
 Axel Kohlmeyer (Temple)
+Peng Wang (NVIDIA)
 Anders Hafreager (UiO)
+V. Nikolskiy (HSE)
 Maurice de Koning (Unicamp/Brazil)
 Rodolfo Paula Leite (Unicamp/Brazil)
 Steve Plimpton (SNL)
 Inderaj Bains (NVIDIA)

--------------------------------------------------------------------
+------------------------------------------------------------------------------

-This directory has source files to build a library that LAMMPS
-links against when using the GPU package.
+This directory has source files to build a library that LAMMPS links against
+when using the GPU package.

-This library must be built with a C++ compiler, before LAMMPS is
-built, so LAMMPS can link against it.
+This library must be built with a C++ compiler along with CUDA, HIP, or OpenCL
+before LAMMPS is built, so LAMMPS can link against it.
+
+This library, libgpu.a, provides routines for acceleration of certain
+LAMMPS styles and neighbor list builds using CUDA, OpenCL, or ROCm HIP.
+
+Pair styles supported by this library are marked in the list of Pair style
+potentials with a "g". See the online version at:
+
+https://lammps.sandia.gov/doc/Commands_pair.html
+
+In addition the (plain) pppm kspace style is supported as well.
+
+------------------------------------------------------------------------------
+DEVICE QUERY
+------------------------------------------------------------------------------
+The gpu library includes binaries to check for available GPUs and their
+properties. It is a good idea to run this on first use to make sure the
+system and build are set up properly. Additionally, the GPU numbering for
+specific selection of devices should be taken from this output. The GPU
+library may split some accelerators into separate virtual accelerators for
+efficient use with MPI.
+
+After building the GPU library, for OpenCL:
+  ./ocl_get_devices
+for CUDA:
+  ./nvc_get_devices
+and for ROCm HIP:
+  ./hip_get_devices

+------------------------------------------------------------------------------
+QUICK START
+------------------------------------------------------------------------------
+OpenCL: Mac without MPI:
+  make -f Makefile.mac_opencl -j; cd ../../src/; make mpi-stubs
+  make g++_serial -j
+  ./lmp_g++_serial -in ../bench/in.lj -log none -sf gpu
+
+OpenCL: Mac with MPI:
+  make -f Makefile.mac_opencl_mpi -j; cd ../../src/; make g++_openmpi -j
+  mpirun -np $NUM_MPI ./lmp_g++_openmpi -in ../bench/in.lj -log none -sf gpu
+
+OpenCL: Linux with Intel oneAPI:
+  make -f Makefile.oneapi -j; cd ../../src; make oneapi -j
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_oneapi -in ../bench/in.lj -log none -sf gpu
+
+OpenCL: Linux with MPI:
+  make -f Makefile.linux_opencl -j; cd ../../src; make omp -j
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
+
+NVIDIA CUDA:
+  make -f Makefile.cuda_mps -j; cd ../../src; make omp -j
+  export CUDA_MPS_LOG_DIRECTORY=/tmp; export CUDA_MPS_PIPE_DIRECTORY=/tmp
+  nvidia-smi -i 0 -c EXCLUSIVE_PROCESS
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu
+  echo quit | /usr/bin/nvidia-cuda-mps-control
+
+AMD HIP:
+  make -f Makefile.hip -j; cd ../../src; make omp -j
+  export OMP_NUM_THREADS=$NUM_THREADS
+  mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu

+------------------------------------------------------------------------------
+Installing oneAPI, OpenCL, CUDA, or ROCm
+------------------------------------------------------------------------------
+The easiest approach is to use the Linux package manager to perform the
+installation from Intel, NVIDIA, etc. repositories. All are available for
+free. The oneAPI installation includes Intel optimized MPI and C++ compilers,
+along with many libraries. Alternatively, Intel OpenCL can also be installed
+separately from the Intel repository.
+
+NOTE: Installation of the CUDA SDK is not required, only the CUDA toolkit.
+
+See:
+
+https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit.html
+
+https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html
+
+https://github.com/RadeonOpenCompute/ROCm

+------------------------------------------------------------------------------
+Build Intro
+------------------------------------------------------------------------------

 You can type "make lib-gpu" from the src directory to see help on how
 to build this library via make commands, or you can do the same thing
@@ -25,7 +116,7 @@ do it manually by following the instructions below.
 Build the library using one of the provided Makefile.* files or create
 your own, specific to your compiler and system. For example:

-make -f Makefile.linux
+make -f Makefile.linux_opencl

 When you are done building this library, two files should
 exist in this directory:
@@ -45,33 +136,132 @@ IMPORTANT: If you re-build the library, e.g. for a different precision
 Makefile.linux clean, to insure all previous derived files are removed
 before the new build is done.

-Makefile.lammps has settings for 3 variables:
+NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
+or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in
+src/MAKE/Makefile.foo) should be consistent with that specified
+when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).

-user-gpu_SYSINC = leave blank for this package
-user-gpu_SYSLIB = CUDA libraries needed by this package
-user-gpu_SYSPATH = path(s) to where those libraries are
-
-Because you have the CUDA compilers on your system, you should have
-the needed libraries. If the CUDA development tools were installed
-in the standard manner, the settings in the Makefile.lammps.standard
-file should work.
+------------------------------------------------------------------------------
+PRECISION MODES
+------------------------------------------------------------------------------
+The GPU library supports 3 precision modes: single, double, and mixed, with
+the latter being the default for most Makefiles aside from Mac specific
+Makefiles due to the more restrictive nature of the Apple OpenCL for some
+devices.

--------------------------------------------------------------------
+To specify the precision mode (output to the screen before LAMMPS runs for
+verification), set either CUDA_PRECISION, OCL_PREC, or HIP_PRECISION to one
+of -D_SINGLE_SINGLE, -D_DOUBLE_DOUBLE, or -D_SINGLE_DOUBLE.

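For example, one way to switch the Linux OpenCL build to full double
precision is to override the flag on the make command line (a sketch,
assuming GNU make command-line overrides; otherwise edit the Makefile):

  make -f Makefile.linux_opencl clean
  make -f Makefile.linux_opencl OCL_PREC=-D_DOUBLE_DOUBLE
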
-GENERAL NOTES
---------------------------------
+Some accelerators or OpenCL implementations only support single precision.
+This mode should be used with care and appropriate validation as the errors
+can scale with system size in this implementation. This can be useful for
+accelerating test runs when setting up a simulation for production runs on
+another machine. In the case where only single precision is supported, either
+LAMMPS must be compiled with -DFFT_SINGLE to use PPPM with GPU acceleration
+or GPU acceleration should be disabled for PPPM (e.g. suffix off or pair/only
+as described in the LAMMPS documentation).

-This library, libgpu.a, provides routines for GPU acceleration
-of certain LAMMPS styles and neighbor list builds. Compilation of this
-library requires installing the CUDA GPU driver and CUDA toolkit for
-your operating system. Installation of the CUDA SDK is not necessary.
-In addition to the LAMMPS library, the binary nvc_get_devices will also
-be built. This can be used to query the names and properties of GPU
-devices on your system. A Makefile for OpenCL and ROCm HIP compilation
-is provided, but support for it is not currently provided by the developers.
-Details of the implementation are provided in:

-----

+------------------------------------------------------------------------------
+CUDA BUILD NOTES
+------------------------------------------------------------------------------
+NOTE: when compiling with CMake, all of the considerations listed below
+are handled within the CMake configuration process, so no separate
+compilation of the gpu library is required. Also this will build in support
+for all compute architectures that are supported by the CUDA toolkit version
+used to build the gpu library.
+
+If you do not want to use a fat binary, that supports multiple CUDA
+architectures, the CUDA_ARCH must be set to match the GPU architecture. This
+is reported by the nvc_get_devices executable created by the build process and
+a detailed list of GPU architectures and CUDA compatible GPUs can be found
+e.g. here: https://en.wikipedia.org/wiki/CUDA#GPUs_supported
+
+The CUDA_HOME variable should be set to the location of the CUDA toolkit.
+
+To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of
+the Makefiles. CUDA_ARCH should be set based on the compute capability of
+your GPU. This can be verified by running the nvc_get_devices executable after
+the build is complete. Additionally, the GPU package must be installed and
+compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the
+LAMMPS makefile.

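For example (an illustrative override for an sm_80 device; verify your
GPU's compute capability with nvc_get_devices first):

  make -f Makefile.linux CUDA_ARCH="-arch=sm_80" CUDA_HOME=/usr/local/cuda
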
+Please note that the GPU library accesses the CUDA driver library directly,
+so it needs to be linked with the CUDA driver library (libcuda.so) that ships
+with the Nvidia driver. If you are compiling LAMMPS on the head node of a GPU
+cluster, this library may not be installed, so you may need to copy it over
+from one of the compute nodes (best into this directory). Recent CUDA toolkits
+starting from CUDA 9 provide a dummy libcuda.so library (typically under
+$(CUDA_HOME)/lib64/stubs), that can be used for linking.
+
+Best performance with the GPU library is typically with multiple MPI processes
+sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA
+MPS enabled. To prevent runtime errors for GPUs configured in exclusive process
+mode with MPS, the GPU library should be built with either of the equivalent
+-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags.

------------------------------------------------------------------------------
|
||||
HIP BUILD NOTES
|
||||
------------------------------------------------------------------------------
|
||||
|
||||
1. GPU sorting requires installing hipcub
|
||||
(https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
|
||||
additionally requires cub (https://nvlabs.github.io/cub). Download and
|
||||
extract the cub directory to lammps/lib/gpu/ or specify an appropriate
|
||||
path in lammps/lib/gpu/Makefile.hip.
|
||||
2. In Makefile.hip it is possible to specify the target platform via
|
||||
export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target
|
||||
architecture (gfx803, gfx900, gfx906 etc.)
|
||||
3. If your MPI implementation does not support `mpicxx --showme` command,
|
||||
it is required to specify the corresponding MPI compiler and linker flags
|
||||
in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.

------------------------------------------------------------------------------
OPENCL BUILD NOTES
------------------------------------------------------------------------------
If GERYON_NUMA_FISSION is defined at build time, LAMMPS will treat separate
NUMA nodes on GPUs or accelerators as separate devices. For example, a
2-socket CPU would appear as two separate devices for OpenCL (and LAMMPS
would require two MPI processes to use both sockets with the GPU library -
each with its own device ID as output by ocl_get_devices).
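For that 2-socket example, a run might look like this (executable and input
file names are placeholders for your build):

./ocl_get_devices        # lists the two device IDs, e.g. 0 and 1
mpirun -np 2 ./lmp_linux -sf gpu -pk gpu 2 -in in.lj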

For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove
"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options.

------------------------------------------------------------------------------
ALL PREPROCESSOR OPTIONS (For Advanced Users)
------------------------------------------------------------------------------
_SINGLE_SINGLE        Build library for single precision mode
_SINGLE_DOUBLE        Build library for mixed precision mode
_DOUBLE_DOUBLE        Build library for double precision mode
CUDA_MPS_SUPPORT      Do not generate errors for exclusive mode for CUDA
CUDA_PROXY            Same as above
MPI_GERYON            Library should use MPI_Abort for unhandled errors
GERYON_NUMA_FISSION   Accelerators with main memory NUMA are split into
                      multiple virtual accelerators, one for each NUMA node
LAL_USE_OMP=0         Disable OpenMP in lib, regardless of compiler setting
LAL_USE_OMP_SIMD=0    Disable OpenMP SIMD in lib, regardless of compiler setting
GERYON_OCL_FLUSH      For OpenCL, flush queue after every enqueue
LAL_NO_OCL_EV_JIT     Turn off JIT specialization for kernels in OpenCL
LAL_USE_OLD_NEIGHBOR  Use old neighbor list algorithm
USE_CUDPP             Enable GPU binning in neighbor builds (not recommended)
USE_HIP_DEVICE_SORT   Enable GPU binning for HIP builds
                      (only w/ LAL_USE_OLD_NEIGHBOR)
LAL_NO_BLOCK_REDUCE   Use host for energy/virial accumulation
LAL_OCL_EXTRA_ARGS    Supply extra args for OpenCL compiler, delimited with :
UCL_NO_EXIT           LAMMPS should handle errors instead of the Geryon lib
UCL_DEBUG             Debug build for Geryon
GERYON_KERNEL_DUMP    Dump all compiled OpenCL programs with compiler
                      flags and build logs
GPU_CAST              Casting performed on GPU, untested recently
THREE_CONCURRENT      Concurrent 3-body calcs in separate queues, untested
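As an illustration, several of these can be appended to the build defines in
a Makefile (whether LMP_INC or another variable carries them depends on the
particular Makefile used):

LMP_INC = -DLAMMPS_SMALLBIG -DGERYON_OCL_FLUSH -DLAL_NO_BLOCK_REDUCE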


------------------------------------------------------------------------------
References for Details
------------------------------------------------------------------------------

Brown, W.M., Wang, P., Plimpton, S.J., Tharrington, A.N. Implementing
Molecular Dynamics on Hybrid High Performance Computers - Short Range

@@ -89,116 +279,3 @@ Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High
Performance Computers - Three-Body Potentials. Computer Physics
Communications. 2013. 184: p. 2785–2793.

----

NOTE: Installation of the CUDA SDK is not required, only the CUDA
toolkit itself or an OpenCL 1.2 compatible header and library.

Pair styles supporting GPU acceleration with this library
are marked in the list of Pair style potentials with a "g".
See the online version at: https://lammps.sandia.gov/doc/Commands_pair.html

In addition, the (plain) pppm kspace style is supported as well.


MULTIPLE LAMMPS PROCESSES
--------------------------------

Multiple LAMMPS MPI processes can share GPUs on the system, but multiple
GPUs cannot be utilized by a single MPI process. In many cases, the best
performance will be obtained by running as many MPI processes as there are
CPU cores available, with the condition that the number of MPI processes is
an integer multiple of the number of GPUs being used. See the LAMMPS user
manual for details on running with GPU acceleration.
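For instance, on a node with 16 cores and 2 GPUs (counts hypothetical), one
might launch:

mpirun -np 16 ./lmp_linux -sf gpu -pk gpu 2 -in in.script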


BUILDING AND PRECISION MODES
--------------------------------

To build, edit the CUDA_ARCH, CUDA_PRECISION, and CUDA_HOME variables in one
of the Makefiles. CUDA_ARCH should be set based on the compute capability of
your GPU, which can be verified by running the nvc_get_devices executable
after the build is complete. Additionally, the GPU package must be installed
and compiled for LAMMPS. This may require editing the gpu_SYSPATH variable
in the LAMMPS makefile.

Please note that the GPU library accesses the CUDA driver library directly,
so it needs to be linked not only to the CUDA runtime library (libcudart.so)
that ships with the CUDA toolkit, but also with the CUDA driver library
(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS
on the head node of a GPU cluster, this library may not be installed, so you
may need to copy it over from one of the compute nodes (best into this
directory). Recent CUDA toolkits, starting with CUDA 9, provide a dummy
libcuda.so library (typically under $(CUDA_HOME)/lib64/stubs) that can be
used for linking.

The gpu library supports 3 precision modes as determined by
the CUDA_PRECISION variable:

CUDA_PRECISION = -D_SINGLE_SINGLE  # Single precision for all calculations
CUDA_PRECISION = -D_DOUBLE_DOUBLE  # Double precision for all calculations
CUDA_PRECISION = -D_SINGLE_DOUBLE  # Accumulation of forces, etc. in double

As of CUDA 7.5, only GPUs with compute capability 2.0 (Fermi) or newer are
supported, and as of CUDA 9.0, only compute capability 3.0 (Kepler) or
newer. This library retains some support for GPUs older than that, which
requires an additional preprocessor flag and limits features, but it is kept
for historical reasons only. There is no value in trying to use those GPUs
for production calculations.

You have to make sure that you set a CUDA_ARCH line suitable for your
hardware and CUDA toolkit version: e.g. -arch=sm_35 for a Tesla K20 or K40,
or -arch=sm_52 for a GeForce GTX Titan X. A detailed list of GPU
architectures and CUDA compatible GPUs can be found e.g. here:
https://en.wikipedia.org/wiki/CUDA#GPUs_supported

NOTE: when compiling with CMake, all of the considerations listed below
are handled within the CMake configuration process, so no separate
compilation of the gpu library is required. This will also build in support
for all compute architectures that are supported by the CUDA toolkit version
used to build the gpu library.

Please note the CUDA_CODE settings in Makefile.linux_multi, which allow
this library to be compiled with support for multiple GPU architectures.
The list can be extended for newer GPUs with newer CUDA toolkits, and it
should allow building a single GPU library compatible with all GPUs that
are worth using for GPU acceleration and supported by the current CUDA
toolkits and drivers.

NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG,
or LAMMPS_SMALLSMALL, if specified when building LAMMPS (i.e. in
src/MAKE/Makefile.foo), should be consistent with that specified
when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar).
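As a sketch (Makefile names as in the note above), the two settings simply
need to agree:

# in src/MAKE/Makefile.foo and in lib/gpu/Makefile.bar
LMP_INC = -DLAMMPS_BIGBIG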

BUILDING FOR HIP FRAMEWORK
--------------------------------
1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm).
2. GPU sorting requires installing hipcub
   (https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend
   additionally requires cub (https://nvlabs.github.io/cub). Download and
   extract the cub directory to lammps/lib/gpu/ or specify an appropriate
   path in lammps/lib/gpu/Makefile.hip.
3. In Makefile.hip it is possible to specify the target platform via
   export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc, as well as the target
   architecture (gfx803, gfx900, gfx906, etc.)
4. If your MPI implementation does not support the `mpicxx --showme`
   command, you must specify the corresponding MPI compiler and linker flags
   in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip.
5. Build the GPU library (libgpu.a):
   cd lammps/lib/gpu; make -f Makefile.hip -j
6. Build the LAMMPS executable (lmp_hip):
   cd ../../src; make hip -j

EXAMPLE CONVENTIONAL BUILD PROCESS
--------------------------------

cd ~/lammps/lib/gpu
emacs Makefile.linux
make -f Makefile.linux
./nvc_get_devices
cd ../../src
emacs ./MAKE/Makefile.linux
make yes-asphere
make yes-kspace
make yes-gpu
make linux

@@ -24,6 +24,8 @@ namespace ucl_hip {
// --------------------------------------------------------------------------
typedef hipStream_t command_queue;

inline void ucl_flush(command_queue &cq) {}

inline void ucl_sync(hipStream_t &stream) {
  CU_SAFE_CALL(hipStreamSynchronize(stream));
}
@@ -143,15 +145,26 @@ class UCL_Device {
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }
  inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns preferred vector width
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp32_width(const int i)
    {return _properties[i].SIMDWidth;}
  /// Returns preferred vector width
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp64_width(const int i)
    {return _properties[i].SIMDWidth;}

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
@@ -215,6 +228,18 @@ class UCL_Device {
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].maxThreadsDim[dim];}

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].sharedMemPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
@@ -255,11 +280,20 @@ class UCL_Device {
  inline int max_sub_devices(const int i)
    { return 0; }

  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return arch(i)>=3.0; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Select the platform that has accelerators (for compatibility with OpenCL)
  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
  /// For compatability with OCL API
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="")
    { return set_platform(0); }

  inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){
    auto it = _loaded_modules.emplace(program, hipModule_t());

@@ -14,6 +14,7 @@
#include <fstream>
#include <string>
#include <iostream>
#include <cstdio>

namespace ucl_hip {

@@ -64,7 +65,7 @@ class UCL_Program {
  }

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="", std::string *log=nullptr) {
  inline int load_string(const void *program, const char *flags="", std::string *log=nullptr, FILE* foutput=nullptr) {
    return _device_ptr->load_module(program, _module, log);
  }

@@ -73,6 +74,7 @@ class UCL_Program {
  hipModule_t _module;
  hipStream_t _cq;
  friend class UCL_Texture;
  friend class UCL_Const;
};

/// Class for dealing with CUDA Driver kernels

@@ -107,6 +107,37 @@ class UCL_Texture {
  }
};

/// Class storing a const global memory reference
class UCL_Const {
 public:
  UCL_Const() {}
  ~UCL_Const() {}
  /// Construct with a specified global reference
  inline UCL_Const(UCL_Program &prog, const char *global_name)
    { get_global(prog,global_name); }
  /// Set the global reference for this object
  inline void get_global(UCL_Program &prog, const char *global_name) {
    _cq=prog.cq();
    CU_SAFE_CALL(hipModuleGetGlobal(&_global, &_global_bytes, prog._module,
                                    global_name));
  }
  /// Copy from array on host to const memory
  template <class numtyp>
  inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
    CU_SAFE_CALL(hipMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp),
                                    _cq));
  }
  /// Get device ptr associated with object
  inline const void* begin() const { return &_global; }
  inline void clear() {}

 private:
  hipStream_t _cq;
  void* _global;
  size_t _global_bytes;
  friend class UCL_Kernel;
};

} // namespace

#endif

@@ -37,6 +37,8 @@ namespace ucl_cudadr {
// --------------------------------------------------------------------------
typedef CUstream command_queue;

inline void ucl_flush(command_queue &cq) {}

inline void ucl_sync(CUstream &stream) {
  CU_SAFE_CALL(cuStreamSynchronize(stream));
}
@@ -156,15 +158,26 @@ class UCL_Device {
  inline std::string device_type_name(const int i) { return "GPU"; }

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i) { return UCL_GPU; }
  inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; }

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; }

  /// Returns preferred vector width
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp32_width(const int i)
    {return _properties[i].SIMDWidth;}
  /// Returns preferred vector width
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp64_width(const int i)
    {return _properties[i].SIMDWidth;}

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is support for the device
@@ -228,6 +241,18 @@ class UCL_Device {
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].maxThreadsPerBlock; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].maxThreadsDim[dim]; }

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].sharedMemPerBlock; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
@@ -268,11 +293,22 @@ class UCL_Device {
  inline int max_sub_devices(const int i)
    { return 0; }

  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return arch(i)>=3.0; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Select the platform that has accelerators (for compatibility with OpenCL)
  inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; }
  /// For compatability with OCL API
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="",
                               const int ndevices=-1,
                               const int first_device=-1)
    { return set_platform(0); }

 private:
  int _device, _num_devices;

@@ -26,6 +26,7 @@

#include "nvd_device.h"
#include <fstream>
#include <cstdio>

namespace ucl_cudadr {

@@ -77,7 +78,7 @@ class UCL_Program {

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=nullptr) {
                         std::string *log=nullptr, FILE* foutput=nullptr) {
    if (std::string(flags)=="BINARY")
      return load_binary((const char *)program);
    const unsigned int num_opts=2;
@@ -100,12 +101,25 @@ class UCL_Program {

    if (err != CUDA_SUCCESS) {
      #ifndef UCL_NO_EXIT
      std::cerr << std::endl
      std::cerr << std::endl << std::endl
                << "----------------------------------------------------------\n"
                << " UCL Error: Error compiling PTX Program...\n"
                << "----------------------------------------------------------\n";
      std::cerr << log << std::endl;
      std::cerr << log << std::endl
                << "----------------------------------------------------------\n\n";
      #endif
      if (foutput != NULL) {
        fprintf(foutput,"\n\n");
        fprintf(foutput,
                "----------------------------------------------------------\n");
        fprintf(foutput," UCL Error: Error compiling PTX Program...\n");
        fprintf(foutput,
                "----------------------------------------------------------\n");
        fprintf(foutput,"%s\n",log);
        fprintf(foutput,
                "----------------------------------------------------------\n");
        fprintf(foutput,"\n\n");
      }
      return UCL_COMPILE_ERROR;
    }

@@ -139,11 +153,15 @@ class UCL_Program {
    return UCL_SUCCESS;
  }

  /// Return the default command queue/stream associated with this data
  inline command_queue & cq() { return _cq; }

  friend class UCL_Kernel;
 private:
  CUmodule _module;
  CUstream _cq;
  friend class UCL_Texture;
  friend class UCL_Const;
};

/// Class for dealing with CUDA Driver kernels

@@ -38,8 +38,11 @@ class UCL_Texture {
  inline UCL_Texture(UCL_Program &prog, const char *texture_name)
    { get_texture(prog,texture_name); }
  /// Set the texture reference for this object
  inline void get_texture(UCL_Program &prog, const char *texture_name)
    { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); }
  inline void get_texture(UCL_Program &prog, const char *texture_name) {
    #if (CUDA_VERSION < 11000)
    CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name));
    #endif
  }

  /// Bind a float array where each fetch grabs a vector of length numel
  template<class numtyp>
@@ -72,11 +75,14 @@ class UCL_Texture {
  }

 private:
  #if (CUDA_VERSION < 11000)
  CUtexref _tex;
  #endif
  friend class UCL_Kernel;

  template<class mat_typ>
  inline void _bind_float(mat_typ &vec, const unsigned numel) {
    #if (CUDA_VERSION < 11000)
    #ifdef UCL_DEBUG
    assert(numel!=0 && numel<5);
    #endif
@@ -90,10 +96,42 @@ class UCL_Texture {
    else
      CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2));
    }
    #endif
  }

};

/// Class storing a const global memory reference
class UCL_Const {
 public:
  UCL_Const() {}
  ~UCL_Const() {}
  /// Construct with a specified global reference
  inline UCL_Const(UCL_Program &prog, const char *global_name)
    { get_global(prog,global_name); }
  /// Set the global reference for this object
  inline void get_global(UCL_Program &prog, const char *global_name) {
    _cq=prog.cq();
    CU_SAFE_CALL(cuModuleGetGlobal(&_global, &_global_bytes, prog._module,
                                   global_name));
  }
  /// Copy from array on host to const memory
  template <class numtyp>
  inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
    CU_SAFE_CALL(cuMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp),
                                   _cq));
  }
  /// Get device ptr associated with object
  inline const CUdeviceptr * begin() const { return &_global; }
  inline void clear() {}

 private:
  CUstream _cq;
  CUdeviceptr _global;
  size_t _global_bytes;
  friend class UCL_Kernel;
};

} // namespace

#endif

@@ -28,12 +28,8 @@
#include <vector>
#include <iostream>

/* We default to OpenCL 1.2 as target version for now as
 * there are known issues with OpenCL 2.0 and later.
 * This is also to silence warnings from generic OpenCL headers */

#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 120
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 210
#endif

#ifdef __APPLE__

@@ -55,17 +51,36 @@ namespace ucl_opencl {
typedef cl_command_queue command_queue;
typedef cl_context context_type;

inline void ucl_flush(command_queue &cq) { CL_SAFE_CALL(clFlush(cq)); }

inline void ucl_sync(cl_command_queue &cq) {
  CL_SAFE_CALL(clFinish(cq));
}

inline bool _shared_mem_device(cl_device_type &device_type) {
#if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
inline bool _shared_mem_device(cl_device_id &device) { return true; }
#elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
inline bool _shared_mem_device(cl_device_id &device) { return false; }
#else
inline bool _shared_mem_device(cl_device_id &device) {
  #ifdef CL_VERSION_1_2
  cl_bool br;
  CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY,
                               sizeof(cl_bool), &br,NULL));
  return (br == CL_TRUE);
  #else
  cl_device_type device_type;
  CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
                               sizeof(device_type),&device_type,NULL));
  return (device_type==CL_DEVICE_TYPE_CPU);
  #endif
}
#endif

struct OCLProperties {
  std::string name;
  cl_device_type device_type;
  bool is_subdevice;
  cl_ulong global_mem;
  cl_ulong shared_mem;
  cl_ulong const_mem;
@@ -74,12 +89,16 @@ struct OCLProperties {
  size_t work_group_size;
  size_t work_item_size[3];
  bool double_precision;
  int preferred_vector_width32, preferred_vector_width64;
  int alignment;
  size_t timer_resolution;
  bool ecc_support;
  std::string c_version;
  bool partition_equal, partition_counts, partition_affinity;
  cl_uint max_sub_devices;
  int cl_device_version;
  bool has_subgroup_support;
  bool has_shuffle_support;
};

/// Class for looking at data parallel device properties
@@ -182,15 +201,26 @@ class UCL_Device {
  inline std::string device_type_name(const int i);

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type() { return device_type(_device); }
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline int device_type(const int i);
  inline enum UCL_DEVICE_TYPE device_type(const int i);

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i)
    { return _shared_mem_device(_properties[i].device_type); }
    { return _shared_mem_device(_cl_devices[i]); }

  /// Returns preferred vector width
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp32_width(const int i)
    {return _properties[i].preferred_vector_width32;}
  /// Returns preferred vector width
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width
  inline int preferred_fp64_width(const int i)
    {return _properties[i].preferred_vector_width64;}

  /// Returns true if double precision is support for the current device
  inline bool double_precision() { return double_precision(_device); }
@@ -242,6 +272,18 @@ class UCL_Device {
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].work_group_size; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].work_item_size[dim]; }

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].shared_mem; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
@@ -256,6 +298,12 @@ class UCL_Device {
  inline bool sharing_supported(const int i)
    { return true; }

  /// True if the device is a sub-device
  inline bool is_subdevice()
    { return is_subdevice(_device); }
  /// True if the device is a sub-device
  inline bool is_subdevice(const int i)
    { return _properties[i].is_subdevice; }
  /// True if splitting device into equal subdevices supported
  inline bool fission_equal()
    { return fission_equal(_device); }
@@ -274,6 +322,18 @@ class UCL_Device {
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity(const int i)
    { return _properties[i].partition_affinity; }
  /// True if the device has subgroup support
  inline bool has_subgroup_support()
    { return has_subgroup_support(_device); }
  /// True if the device has subgroup support
  inline bool has_subgroup_support(const int i)
    { return _properties[i].has_subgroup_support; }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return _properties[i].has_shuffle_support; }

  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices()
@@ -281,6 +341,12 @@ class UCL_Device {
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices(const int i)
    { return _properties[i].max_sub_devices; }
  /// OpenCL version supported by the device
  inline int cl_device_version()
    { return cl_device_version(_device); }
  /// OpenCL version supported by the device
  inline int cl_device_version(const int i)
    { return _properties[i].cl_device_version; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);
@@ -288,8 +354,14 @@ class UCL_Device {
  /// Return the OpenCL type for the device
  inline cl_device_id & cl_device() { return _cl_device; }

  /// Select the platform that has accelerators
  inline int set_platform_accelerator(int pid=-1);
  /// Automatically set the platform by type, vendor, and/or CU count
  /** If first_device is positive, search restricted to platforms containing
    * this device IDs. If ndevices is positive, search is restricted
    * to platforms with at least that many devices **/
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="",
                               const int ndevices=-1,
                               const int first_device=-1);

 private:
  int _num_platforms;          // Number of platforms

@@ -322,8 +394,7 @@ UCL_Device::UCL_Device() {
    return;
  } else
    _num_platforms=static_cast<int>(nplatforms);
  // note that platform 0 may not necessarily be associated with accelerators
  set_platform_accelerator();
  set_platform(0);
}

UCL_Device::~UCL_Device() {
@@ -332,6 +403,14 @@ UCL_Device::~UCL_Device() {

void UCL_Device::clear() {
  _properties.clear();

  #ifdef GERYON_NUMA_FISSION
  #ifdef CL_VERSION_1_2
  for (int i=0; i<_cl_devices.size(); i++)
    CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i]));
  #endif
  #endif

  _cl_devices.clear();
  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
@@ -341,6 +420,7 @@ void UCL_Device::clear() {
    CL_DESTRUCT_CALL(clReleaseContext(_context));
  }
  _device=-1;
  _num_devices=0;
}

int UCL_Device::set_platform(int pid) {
@@ -370,11 +450,51 @@ int UCL_Device::set_platform(int pid) {
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
                              &n));

  #ifndef GERYON_NUMA_FISSION
  // --- Store properties for each device
  for (int i=0; i<_num_devices; i++) {
    _cl_devices.push_back(device_list[i]);
    add_properties(device_list[i]);
  }
  #else
  // --- Create sub-devices for anything partitionable by NUMA and store props
  int num_unpart = _num_devices;
  _num_devices = 0;
  for (int i=0; i<num_unpart; i++) {
    cl_uint num_subdevices = 1;
    cl_device_id *subdevice_list = device_list + i;

    #ifdef CL_VERSION_1_2
    cl_device_affinity_domain adomain;
    CL_SAFE_CALL(clGetDeviceInfo(device_list[i],
                                 CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                                 sizeof(cl_device_affinity_domain),
                                 &adomain,NULL));

    cl_device_partition_property props[3];
    props[0]=CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
    props[1]=CL_DEVICE_AFFINITY_DOMAIN_NUMA;
    props[2]=0;
    if (adomain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)
      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
                                      &num_subdevices));
    if (num_subdevices > 1) {
      subdevice_list = new cl_device_id[num_subdevices];
      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
                                      subdevice_list, &num_subdevices));
    }
    #endif

    for (int j=0; j<num_subdevices; j++) {
      _num_devices++;
      _cl_devices.push_back(subdevice_list[j]);
      add_properties(subdevice_list[j]);
    }

    if (num_subdevices > 1) delete[] subdevice_list;
  } // for i
  #endif

  delete[] device_list;
  return UCL_SUCCESS;
}
@@ -429,11 +549,18 @@ void UCL_Device::add_properties(cl_device_id device_list) {
                               sizeof(cl_uint),&op.alignment,nullptr));
  op.alignment/=8;

  cl_uint float_width;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                               sizeof(float_width),&float_width,nullptr));
  op.preferred_vector_width32=float_width;

  // Determine if double precision is supported
  cl_uint double_width;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                               sizeof(double_width),&double_width,nullptr));
  op.preferred_vector_width64=double_width;
  if (double_width==0)
    op.double_precision=false;
  else
@@ -452,9 +579,14 @@ void UCL_Device::add_properties(cl_device_id device_list) {
    op.ecc_support=true;

  op.c_version="";
  op.is_subdevice=false;
  op.partition_equal=false;
  op.partition_counts=false;
  op.partition_affinity=false;
  op.max_sub_devices=1;
  op.cl_device_version=0;
  op.has_subgroup_support=false;
  op.has_shuffle_support=false;

  #ifdef CL_VERSION_1_2
  size_t return_bytes;
@@ -463,6 +595,13 @@ void UCL_Device::add_properties(cl_device_id device_list) {
  op.c_version=buffer;

  cl_device_partition_property pinfo[4];
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE,
                               4*sizeof(cl_device_partition_property),
                               &pinfo, &return_bytes));
  if (return_bytes == 0) op.is_subdevice=false;
  else if (pinfo[0]) op.is_subdevice=true;
  else op.is_subdevice=false;

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PARTITION_PROPERTIES,
                               4*sizeof(cl_device_partition_property),
@@ -480,6 +619,46 @@ void UCL_Device::add_properties(cl_device_id device_list) {
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
                               sizeof(cl_uint),&op.max_sub_devices,nullptr));

  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr));
  int cl_version_maj = buffer[7] - '0';
  int cl_version_min = buffer[9] - '0';
  op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10;

  size_t ext_str_size_ret;
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr,
                               &ext_str_size_ret));
  char buffer2[ext_str_size_ret];
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS,
                               ext_str_size_ret, buffer2, nullptr));
  #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
  if (op.cl_device_version >= 210) {
    if ((std::string(buffer2).find("cl_khr_subgroups") != std::string::npos) ||
        (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos))
      op.has_subgroup_support=true;
    if (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos)
      op.has_shuffle_support=true;
  }
  #endif
  if (std::string(buffer2).find("cl_nv_device_attribute_query") !=
      std::string::npos) {
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
    #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
    #endif
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
    #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
    #endif
    cl_uint major, minor;
    CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                 CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
                                 sizeof(cl_uint), &major, nullptr));
    CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                 CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
                                 sizeof(cl_uint), &minor, nullptr));
    double arch = static_cast<double>(minor)/10+major;
    if (arch >= 3.0)
      op.has_shuffle_support=true;
  }
  #endif

  _properties.push_back(op);
@@ -516,7 +695,7 @@ std::string UCL_Device::device_type_name(const int i) {
}

// Get a string telling the type of the device
int UCL_Device::device_type(const int i) {
enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) {
  if (_properties[i].device_type==CL_DEVICE_TYPE_CPU)
    return UCL_CPU;
  else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU)
@@ -529,14 +708,8 @@ int UCL_Device::device_type(const int i) {

// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
  cl_device_id *device_list = new cl_device_id[_num_devices];
  cl_uint n;
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices,
                              device_list,&n));

  _device=num;
  _cl_device=device_list[_device];
  delete[] device_list;
  _cl_device=_cl_devices[_device];
  return create_context();
}

@@ -555,6 +728,11 @@ void UCL_Device::print_all(std::ostream &out) {
    out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
    out << "  Type of device: "
        << device_type_name(i).c_str() << std::endl;
    out << "  Is a subdevice: ";
    if (is_subdevice(i))
      out << "Yes\n";
    else
      out << "No\n";
    out << "  Double precision support: ";
    if (double_precision(i))
      out << "Yes\n";
@@ -613,31 +791,91 @@ void UCL_Device::print_all(std::ostream &out) {
      out << "No\n";
    out << "  Maximum subdevices from fission: "
        << max_sub_devices(i) << std::endl;
    out << "  Shared memory system: ";
    if (shared_memory(i))
      out << "Yes\n";
    else
      out << "No\n";
  }
}
}

// Select the platform that is associated with accelerators
// if pid < 0, select the first platform
int UCL_Device::set_platform_accelerator(int pid) {
  if (pid < 0) {
    int found = 0;
int UCL_Device::auto_set_platform(const enum UCL_DEVICE_TYPE type,
                                  const std::string vendor,
                                  const int ndevices,
                                  const int first_device) {
  if (_num_platforms < 2) return set_platform(0);

  int last_device = -1;
  if (first_device > -1) {
    if (ndevices)
      last_device = first_device + ndevices - 1;
    else
      last_device = first_device;
  }

  bool vendor_match=false;
  bool type_match=false;
  int max_cus=0;
  int best_platform=0;

  std::string vendor_upper=vendor;
  for (int i=0; i<vendor.length(); i++)
    if (vendor_upper[i]<='z' && vendor_upper[i]>='a')
      vendor_upper[i]=toupper(vendor_upper[i]);

  for (int n=0; n<_num_platforms; n++) {
    set_platform(n);
    for (int i=0; i<num_devices(); i++) {
      if ((_properties[i].device_type & CL_DEVICE_TYPE_CPU) ||
          (_properties[i].device_type & CL_DEVICE_TYPE_GPU) ||
          (_properties[i].device_type & CL_DEVICE_TYPE_ACCELERATOR)) {
        found = 1;
        break;
    if (last_device > -1 && last_device >= num_devices()) continue;
    if (ndevices > num_devices()) continue;

    int first_id=0;
    int last_id=num_devices()-1;
    if (last_device > -1) {
      first_id=first_device;
      last_id=last_device;
    }

    if (vendor_upper!="") {
      std::string pname = platform_name();
      for (int i=0; i<pname.length(); i++)
        if (pname[i]<='z' && pname[i]>='a')
          pname[i]=toupper(pname[i]);

      if (pname.find(vendor_upper)!=std::string::npos) {
        if (vendor_match == false) {
          best_platform=n;
          max_cus=0;
          vendor_match=true;
        }
      } else if (vendor_match)
        continue;
    }

    if (type != UCL_DEFAULT) {
      bool ptype_matched=false;
      for (int d=first_id; d<=last_id; d++) {
        if (type==device_type(d)) {
          if (type_match == false) {
            best_platform=n;
            max_cus=0;
            type_match=true;
            ptype_matched=true;
          }
        }
      if (found) return UCL_SUCCESS;
      }
      return UCL_ERROR;
  } else {
    return set_platform(pid);
      if (type_match==true && ptype_matched==false)
        continue;
    }

    for (int d=first_id; d<=last_id; d++) {
      if (cus(d) > max_cus) {
        best_platform=n;
        max_cus=cus(d);
      }
    }
  }
  return set_platform(best_platform);
}

} // namespace ucl_opencl

@@ -2,6 +2,7 @@
ocl_kernel.h
-------------------
W. Michael Brown
Nitin Dhamankar (Intel)

Utilities for dealing with OpenCL kernels

@@ -26,6 +27,7 @@

#include "ocl_device.h"
#include <fstream>
#include <cstdio>

namespace ucl_opencl {

@@ -93,7 +95,7 @@ class UCL_Program {

  /// Load a program from a string and compile with flags
  inline int load_string(const void *program, const char *flags="",
                         std::string *log=nullptr) {
                         std::string *log=nullptr, FILE* foutput=nullptr) {
    cl_int error_flag;
    const char *prog=(const char *)program;
    _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag);
@@ -107,26 +109,65 @@ class UCL_Program {
                                       sizeof(cl_build_status),&build_status,
                                       nullptr));

    if (build_status != CL_SUCCESS || log!=nullptr) {
    #ifdef GERYON_KERNEL_DUMP
    {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0,
                                         nullptr, &ms));
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         0,NULL,&ms));
      char *build_log = new char[ms];
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms,
                                         build_log, nullptr));
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         ms,build_log, NULL));
      std::cout << std::endl << std::endl
                << "--------------------------------------------------------\n"
                << "  UCL PROGRAM DUMP\n"
                << "--------------------------------------------------------\n"
                << flags << std::endl
                << "--------------------------------------------------------\n"
                << prog << std::endl
                << "--------------------------------------------------------\n"
                << build_log
                << "--------------------------------------------------------\n"
                << std::endl << std::endl;
    }
    #endif

    if (build_status != CL_SUCCESS || log!=NULL) {
      size_t ms;
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         0,NULL,&ms));
      char *build_log = new char[ms];
      CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,
                                         ms,build_log, NULL));

      if (log!=nullptr)
        *log=std::string(build_log);

      if (build_status != CL_SUCCESS) {
        #ifndef UCL_NO_EXIT
        std::cerr << std::endl
        std::cerr << std::endl << std::endl
                  << "----------------------------------------------------------\n"
                  << " UCL Error: Error compiling OpenCL Program ("
                  << build_status << ") ...\n"
                  << "----------------------------------------------------------\n";
        std::cerr << build_log << std::endl;
        std::cerr <<
          "----------------------------------------------------------\n"
                  << std::endl << std::endl;
        #endif
        if (foutput != NULL) {
          fprintf(foutput,"\n\n");
          fprintf(foutput,
                  "----------------------------------------------------------\n");
          fprintf(foutput,
                  " UCL Error: Error compiling OpenCL Program (%d) ...\n",
                  build_status);
          fprintf(foutput,
                  "----------------------------------------------------------\n");
          fprintf(foutput,"%s\n",build_log);
          fprintf(foutput,
                  "----------------------------------------------------------\n");
          fprintf(foutput,"\n\n");
        }
        delete[] build_log;
        return UCL_COMPILE_ERROR;
      } else delete[] build_log;
@@ -141,6 +182,7 @@ class UCL_Program {
  inline void cq(command_queue &cq_in) { _cq=cq_in; }

  friend class UCL_Kernel;
  friend class UCL_Const;
 private:
  bool _init_done;
  cl_program _program;
@@ -322,9 +364,45 @@ class UCL_Kernel {
  inline void cq(command_queue &cq_in) { _cq=cq_in; }
  #include "ucl_arg_kludge.h"

  #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
  inline size_t max_subgroup_size(const size_t block_size_x) {
    size_t block_size = block_size_x;
    CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
                   CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                   sizeof(block_size), (void *) &block_size,
                   sizeof(size_t), (void *) &_mx_subgroup_sz,
                   NULL));
    return _mx_subgroup_sz;
  }

  inline size_t max_subgroup_size(const size_t block_size_x,
                                  const size_t block_size_y) {
    size_t block_size[2] { block_size_x, block_size_y };
    CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
                   CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                   sizeof(block_size), (void *) &block_size,
                   sizeof(size_t), (void *) &_mx_subgroup_sz,
                   NULL));
    return _mx_subgroup_sz;
  }

  inline size_t max_subgroup_size(const size_t block_size_x,
                                  const size_t block_size_y,
                                  const size_t block_size_z) {
    size_t block_size[3] { block_size_x, block_size_y, block_size_z };
    CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device,
                   CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE,
                   sizeof(block_size), (void *) &block_size,
                   sizeof(size_t), (void *) &_mx_subgroup_sz,
                   NULL));
    return _mx_subgroup_sz;
  }
  #endif

 private:
  cl_kernel _kernel;
  cl_program _program;
  cl_device_id _device;
  cl_uint _dimensions;
  size_t _block_size[3];
  size_t _num_blocks[3];
@@ -338,6 +416,11 @@ class UCL_Kernel {
  unsigned _kernel_info_nargs;
  //std::string _kernel_info_args[256];
  #endif

  #ifdef CL_VERSION_2_1
  size_t _mx_subgroup_sz;  // Maximum sub-group size for this kernel
  #endif

};

inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) {
@@ -347,6 +430,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
  CL_SAFE_CALL(clRetainCommandQueue(_cq));
  _program=program._program;
  CL_SAFE_CALL(clRetainProgram(_program));
  _device=program._device;
  cl_int error_flag;
  _kernel=clCreateKernel(program._program,function,&error_flag);

@@ -380,8 +464,11 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function)
}

void UCL_Kernel::run() {
  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,nullptr,
                                      _num_blocks,_block_size,0,nullptr,nullptr));
  CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL,
                                      _num_blocks,_block_size,0,NULL,NULL));
  #ifdef GERYON_OCL_FLUSH
  ucl_flush(_cq);
  #endif
}

} // namespace

@@ -4,12 +4,8 @@
#include <cstdio>
#include <cassert>

/* We default to OpenCL 1.2 as target version for now as
 * there are known issues with OpenCL 2.0 and later.
 * This is also to silence warnings from generic OpenCL headers */

#if !defined(CL_TARGET_OPENCL_VERSION)
#define CL_TARGET_OPENCL_VERSION 120
#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 210
#endif

#ifdef __APPLE__

@@ -108,7 +108,7 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
    return UCL_MEMORY_ERROR;
  *mat.host_ptr() = (typename mat_type::data_type*)
                    clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE,
                                       map_perm,0,n,0,nullptr,nullptr,nullptr);
                                       map_perm,0,n,0,NULL,NULL,NULL);

  mat.cq()=cm.cq();
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@@ -116,18 +116,15 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n,
}

template <class mat_type, class copy_type>
inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) {
inline int _host_view(mat_type &mat, copy_type &cm, const size_t o,
                      const size_t n) {
  cl_int error_flag;
  cl_context context;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context),
                                  &context,nullptr));
  cl_mem_flags orig_flags;
  CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags),
                                  &orig_flags,nullptr));
  orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR;

  mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n,
                              *mat.host_ptr(), &error_flag);
  cl_buffer_region subbuffer;
  subbuffer.origin = o;
  subbuffer.size = n;
  mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0,
                                 CL_BUFFER_CREATE_TYPE_REGION, &subbuffer,
                                 &error_flag);

  CL_CHECK_ERR(error_flag);
  CL_SAFE_CALL(clRetainCommandQueue(mat.cq()));
@@ -470,6 +467,9 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) {
  size_t kn=n/sizeof(typename mat_type::data_type);
  CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0));
  #endif
  #ifdef GERYON_OCL_FLUSH
  ucl_flush(cq);
  #endif
}

// --------------------------------------------------------------------------
@@ -585,7 +585,10 @@ template <> struct _ucl_memcpy<1,0> {
    std::cerr << "UCL_COPY 1NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n,
                                     dst.begin(),0,nullptr,nullptr));
                                     dst.begin(),0,NULL,NULL));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@@ -617,6 +620,9 @@ template <> struct _ucl_memcpy<1,0> {
      src_offset+=spitch;
      dst_offset+=dpitch;
    }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

@@ -637,7 +643,10 @@ template <> struct _ucl_memcpy<0,1> {
    std::cerr << "UCL_COPY 3NS\n";
    #endif
    CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n,
                                      src.begin(),0,nullptr,nullptr));
                                      src.begin(),0,NULL,NULL));
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@@ -669,6 +678,9 @@ template <> struct _ucl_memcpy<0,1> {
      src_offset+=spitch;
      dst_offset+=dpitch;
    }
    #ifdef GERYON_OCL_FLUSH
    if (block==CL_FALSE) ucl_flush(cq);
    #endif
  }
};

@@ -690,6 +702,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
  template <class p1, class p2>
  static inline void mc(p1 &dst, const size_t dpitch, const p2 &src,
@@ -720,6 +735,9 @@ template <int mem1, int mem2> struct _ucl_memcpy {
    #endif

    if (block==CL_TRUE) ucl_sync(cq);
    #ifdef GERYON_OCL_FLUSH
    else ucl_flush(cq);
    #endif
  }
};


@@ -53,6 +53,59 @@ class UCL_Texture {
  friend class UCL_Kernel;
};

/// Class storing a const global memory reference
class UCL_Const {
 public:
  UCL_Const() : _global_bytes(0), _active(false) {}
  ~UCL_Const() { clear(); }
  /// Construct with a specified global reference
  inline UCL_Const(UCL_Program &prog, const char *global_name)
    { get_global(prog,global_name); }
  /// Set the global reference for this object
  inline void get_global(UCL_Program &prog, const char *global_name) {
    if (_active) {
      CL_DESTRUCT_CALL(clReleaseContext(_context));
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
    }
    _active = true;
    _context = prog._context;
    _cq = prog._cq;
    CL_SAFE_CALL(clRetainContext(_context));
    CL_SAFE_CALL(clRetainCommandQueue(_cq));
  }
  /// Copy from array on host to const memory
  template <class numtyp>
  inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) {
    const int bytes=numel*sizeof(numtyp);
    if (_global_bytes < bytes) {
      if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global));
      cl_int e;
      _global = clCreateBuffer(_context, CL_MEM_READ_ONLY, bytes, NULL, &e);
      CL_SAFE_CALL(e);
    }
    CL_SAFE_CALL(clEnqueueWriteBuffer(_cq, _global, CL_FALSE, 0, bytes,
                                      (void *)src.begin(), 0, NULL, NULL));
  }
  /// Get device ptr associated with object
  inline const cl_mem * begin() const { return &_global; }
  inline void clear() {
    if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global));
    if (_active) {
      CL_DESTRUCT_CALL(clReleaseContext(_context));
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq));
    }
    _global_bytes=0;
    _active=false;
  }

 private:
  cl_mem _global;
  size_t _global_bytes;
  cl_context _context;
  cl_command_queue _cq;
  bool _active;
};

} // namespace

#endif
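
For orientation, a minimal usage sketch of the new UCL_Const class follows;
the program handle, global name, and host-vector setup are hypothetical
placeholders, not part of this commit:

// Sketch only: assumes 'prog' is a compiled UCL_Program whose source
// declares a constant/global buffer named "coeff" (hypothetical), and
// 'host_coeff' is an allocated UCL_H_Vec<float> with 8 elements.
for (int i=0; i<8; i++) host_coeff[i]=0.5f*i;  // fill host staging data

UCL_Const coeff(prog, "coeff");      // bind/create the device-side storage
coeff.update_device(host_coeff, 8);  // asynchronous copy of 8 elements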

@@ -61,7 +61,6 @@ class UCL_Timer {
  /// Initialize command queue for timing
  inline void init(UCL_Device &dev, command_queue &cq) {
    clear();
    t_factor=dev.timer_resolution()/1000000000.0;
    _cq=cq;
    clRetainCommandQueue(_cq);
    _initialized=true;
@@ -124,17 +123,17 @@ class UCL_Timer {
    clReleaseEvent(start_event);
    clReleaseEvent(stop_event);
    has_measured_time = false;
    return (tend-tstart)*t_factor;
    return (tend-tstart)*1e-6;
  }

  /// Return the time (s) of last start to stop - Forces synchronization
  inline double seconds() { return time()/1000.0; }
  inline double seconds() { return time()*1e-3; }

  /// Return the total time in ms
  inline double total_time() { return _total_time; }

  /// Return the total time in seconds
  inline double total_seconds() { return _total_time/1000.0; }
  inline double total_seconds() { return _total_time*1e-3; }

 private:
  cl_event start_event, stop_event;

@@ -69,17 +69,17 @@ class UCL_BaseMat {
  /// Return the type/permissions of memory allocation
  /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED
    * or UCL_VIEW **/
  /// Assert that any ops in associate command queue have been issued to device
  inline void flush() { ucl_flush(_cq); }

  inline enum UCL_MEMOPT kind() const { return _kind; }

  inline bool shared_mem_device() {
    #ifdef _OCL_MAT
    cl_device_id device;
    CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE,
                                       sizeof(cl_device_id),&device,nullptr));
    cl_device_type device_type;
    CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE,
                                 sizeof(device_type),&device_type,nullptr));
    return _shared_mem_device(device_type);
                                       sizeof(cl_device_id),&device,NULL));
    return _shared_mem_device(device);
    #else
    return false;
    #endif

@@ -39,7 +39,7 @@ class UCL_D_Vec : public UCL_BaseMat {
  };
  typedef numtyp data_type;

  UCL_D_Vec() : _cols(0) {}
  UCL_D_Vec() : _cols(0), _row_bytes(0) {}
  ~UCL_D_Vec() { _device_free(*this); }

  /// Construct with n columns

@@ -44,10 +44,8 @@ using namespace ucl_hip;
int main(int argc, char** argv) {
  UCL_Device cop;
  std::cout << "Found " << cop.num_platforms() << " platform(s).\n";
  if (cop.num_platforms()>0) {
    std::cout << "Using platform: " << cop.platform_name() << std::endl;
  if (cop.num_platforms()>0)
    cop.print_all(std::cout);
  }
  return 0;
}


@@ -241,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat {
    _array=input.begin()+offset;
    _end=_array+_cols;
    #ifdef _OCL_MAT
    _host_view(*this,input,_row_bytes*_rows);
    _host_view(*this,input,offset*sizeof(numtyp),_row_bytes*_rows);
    #endif
  }


@@ -39,7 +39,7 @@ class UCL_H_Vec : public UCL_BaseMat {
  };
  typedef numtyp data_type;

  UCL_H_Vec() : _cols(0) {
  UCL_H_Vec() : _cols(0), _row_bytes(0) {
    #ifdef _OCL_MAT
    _carray=(cl_mem)(0);
    #endif
@@ -135,7 +135,7 @@ class UCL_H_Vec : public UCL_BaseMat {
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin();
    _array=(numtyp *)input.begin();
    _end=_array+_cols;
    #ifdef _OCL_MAT
    _carray=input.cbegin();
@@ -240,10 +240,10 @@ class UCL_H_Vec : public UCL_BaseMat {
    _cols=cols;
    _row_bytes=_cols*sizeof(numtyp);
    this->_cq=input.cq();
    _array=input.begin()+offset;
    _array=(numtyp *)input.begin()+offset;
    _end=_array+_cols;
    #ifdef _OCL_MAT
    _host_view(*this,input,_row_bytes);
    _host_view(*this,input,offset*sizeof(numtyp),_row_bytes);
    #endif
  }


@@ -162,6 +162,8 @@ class UCL_Vector {
  inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); }
|
||||
/// Block until command_queue associated with matrix is complete
|
||||
inline void sync() { host.sync(); }
|
||||
/// Assert that any ops in associate command queue have been issued to device
|
||||
inline void flush() { ucl_flush(host.cq()); }
|
||||
|
||||
///Get the size of a row on the host (including any padding) in elements
|
||||
inline size_t row_size() const { return host.row_size(); }
|
||||
|
||||
@ -14,6 +14,9 @@
|
||||
***************************************************************************/
|
||||
|
||||
#include "lal_answer.h"
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#include <omp.h>
|
||||
#endif
|
||||
|
||||
namespace LAMMPS_AL {
|
||||
#define AnswerT Answer<numtyp,acctyp>
|
||||
@ -81,6 +84,10 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot,
|
||||
_time_cast=0.0;
|
||||
_time_cpu_idle=0.0;
|
||||
|
||||
success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE,
|
||||
UCL_WRITE_ONLY)==UCL_SUCCESS);
|
||||
if (success) error_flag.zero();
|
||||
|
||||
return success && alloc(ef_inum);
|
||||
}
|
||||
|
||||
@ -111,6 +118,7 @@ bool AnswerT::add_fields(const bool charge, const bool rot) {
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::clear() {
|
||||
_gpu_bytes=0;
|
||||
error_flag.clear();
|
||||
if (!_allocated)
|
||||
return;
|
||||
_allocated=false;
|
||||
@ -138,12 +146,21 @@ double AnswerT::host_memory_usage() const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom) {
|
||||
const bool ef_atom, const bool vf_atom,
|
||||
const int red_blocks) {
|
||||
time_answer.start();
|
||||
_eflag=eflag;
|
||||
_vflag=vflag;
|
||||
_ef_atom=ef_atom;
|
||||
_vf_atom=vf_atom;
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
_ev_stride=_inum;
|
||||
#else
|
||||
if (ef_atom || vf_atom)
|
||||
_ev_stride=_inum;
|
||||
else
|
||||
_ev_stride=red_blocks;
|
||||
#endif
|
||||
|
||||
int csize=_ev_fields;
|
||||
if (!eflag)
|
||||
@ -152,20 +169,24 @@ void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
csize-=6;
|
||||
|
||||
if (csize>0)
|
||||
engv.update_host(_inum*csize,true);
|
||||
engv.update_host(_ev_stride*csize,true);
|
||||
if (_rot)
|
||||
force.update_host(_inum*4*2,true);
|
||||
else
|
||||
force.update_host(_inum*4,true);
|
||||
time_answer.stop();
|
||||
|
||||
#ifndef GERYON_OCL_FLUSH
|
||||
force.flush();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom,
|
||||
int *ilist) {
|
||||
int *ilist, const int red_blocks) {
|
||||
_ilist=ilist;
|
||||
copy_answers(eflag,vflag,ef_atom,vf_atom);
|
||||
copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -177,21 +198,24 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
double evdwl=0.0;
|
||||
int vstart=0;
|
||||
if (_eflag) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd reduction(+:evdwl)
|
||||
#endif
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
evdwl+=engv[i];
|
||||
if (_ef_atom) {
|
||||
if (_ilist==nullptr) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
eatom[i]+=engv[i];
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++)
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
eatom[_ilist[i]]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart=_inum;
|
||||
vstart=_ev_stride;
|
||||
}
|
||||
if (_vflag) {
|
||||
int iend=vstart+_inum;
|
||||
int iend=vstart+_ev_stride;
|
||||
for (int j=0; j<6; j++) {
|
||||
for (int i=vstart; i<iend; i++)
|
||||
virial[j]+=engv[i];
|
||||
@ -206,8 +230,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
vstart+=_ev_stride;
|
||||
iend+=_ev_stride;
|
||||
}
|
||||
}
|
||||
|
||||
@ -224,28 +248,36 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
return energy_virial(eatom,vatom,virial);
|
||||
|
||||
double evdwl=0.0;
|
||||
int ii, vstart=0, iend=_inum;
|
||||
int ii, vstart=0, iend=_ev_stride;
|
||||
if (_eflag) {
|
||||
iend=_inum*2;
|
||||
for (int i=0; i<_inum; i++)
|
||||
iend=_ev_stride*2;
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd reduction(+:evdwl)
|
||||
#endif
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
evdwl+=engv[i];
|
||||
for (int i=_inum; i<iend; i++)
|
||||
ecoul+=engv[i];
|
||||
double ecv=0.0;
|
||||
#if (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd reduction(+:ecv)
|
||||
#endif
|
||||
for (int i=_ev_stride; i<iend; i++)
|
||||
ecv+=engv[i];
|
||||
ecoul+=ecv;
|
||||
if (_ef_atom) {
|
||||
if (_ilist==nullptr) {
|
||||
for (int i=0; i<_inum; i++)
|
||||
for (int i=0; i<_ev_stride; i++)
|
||||
eatom[i]+=engv[i];
|
||||
for (int i=_inum; i<iend; i++)
|
||||
for (int i=_ev_stride; i<iend; i++)
|
||||
eatom[i]+=engv[i];
|
||||
} else {
|
||||
for (int i=0, ii=0; i<_inum; i++)
|
||||
for (int i=0, ii=0; i<_ev_stride; i++)
|
||||
eatom[_ilist[ii++]]+=engv[i];
|
||||
for (int i=_inum, ii=0; i<iend; i++)
|
||||
for (int i=_ev_stride, ii=0; i<iend; i++)
|
||||
eatom[_ilist[ii++]]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart=iend;
|
||||
iend+=_inum;
|
||||
iend+=_ev_stride;
|
||||
}
|
||||
if (_vflag) {
|
||||
for (int j=0; j<6; j++) {
|
||||
@ -260,8 +292,8 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
vatom[_ilist[ii++]][j]+=engv[i];
|
||||
}
|
||||
}
|
||||
vstart+=_inum;
|
||||
iend+=_inum;
|
||||
vstart+=_ev_stride;
|
||||
iend+=_ev_stride;
|
||||
}
|
||||
}
|
||||
|
||||
@ -270,24 +302,63 @@ double AnswerT::energy_virial(double *eatom, double **vatom,
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AnswerT::get_answers(double **f, double **tor) {
|
||||
int fl=0;
|
||||
if (_ilist==nullptr) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
f[i][0]+=force[fl];
|
||||
f[i][1]+=force[fl+1];
|
||||
f[i][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { acctyp x,y,z,w; } vec4d_t;
|
||||
vec3d *fp=reinterpret_cast<vec3d*>(&(f[0][0]));
|
||||
vec4d_t *forcep=reinterpret_cast<vec4d_t*>(&(force[0]));
|
||||
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
#if (LAL_USE_OMP == 1)
|
||||
const int nthreads = omp_get_num_threads();
|
||||
const int tid = omp_get_thread_num();
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
#else
|
||||
const int tid = 0;
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
#endif
|
||||
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
fp[i].x+=forcep[i].x;
|
||||
fp[i].y+=forcep[i].y;
|
||||
fp[i].z+=forcep[i].z;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
tor[i][0]+=force[fl];
|
||||
tor[i][1]+=force[fl+1];
|
||||
tor[i][2]+=force[fl+2];
|
||||
fl+=4;
|
||||
vec3d *torp=reinterpret_cast<vec3d*>(&(tor[0][0]));
|
||||
forcep=reinterpret_cast<vec4d_t*>(&(force[_inum*4]));
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
torp[i].x+=forcep[i].x;
|
||||
torp[i].y+=forcep[i].y;
|
||||
torp[i].z+=forcep[i].z;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel
|
||||
#endif
|
||||
{
|
||||
#if (LAL_USE_OMP == 1)
|
||||
const int nthreads = omp_get_num_threads();
|
||||
const int tid = omp_get_thread_num();
|
||||
const int idelta = _inum / nthreads + 1;
|
||||
const int ifrom = tid * idelta;
|
||||
const int ito = std::min(ifrom + idelta, _inum);
|
||||
int fl=ifrom*4;
|
||||
#else
|
||||
const int tid = 0;
|
||||
const int ifrom = 0;
|
||||
const int ito = _inum;
|
||||
int fl=0;
|
||||
#endif
|
||||
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
int ii=_ilist[i];
|
||||
f[ii][0]+=force[fl];
|
||||
f[ii][1]+=force[fl+1];
|
||||
@ -295,7 +366,8 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
fl+=4;
|
||||
}
|
||||
if (_rot) {
|
||||
for (int i=0; i<_inum; i++) {
|
||||
fl=_inum*4 + ifrom*4;
|
||||
for (int i=ifrom; i<ito; i++) {
|
||||
int ii=_ilist[i];
|
||||
tor[ii][0]+=force[fl];
|
||||
tor[ii][1]+=force[fl+1];
|
||||
@ -304,6 +376,7 @@ void AnswerT::get_answers(double **f, double **tor) {
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
|
||||
@ -110,12 +110,12 @@ class Answer {
|
||||
// -------------------------COPY FROM GPU -------------------------------
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom);
|
||||
void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
|
||||
const bool vf_atom, const int red_blocks);
|
||||
|
||||
/// Copy answers from device into read buffer asynchronously
|
||||
void copy_answers(const bool eflag, const bool vflag,
|
||||
const bool ef_atom, const bool vf_atom, int *ilist);
|
||||
void copy_answers(const bool eflag, const bool vflag, const bool ef_atom,
|
||||
const bool vf_atom, int *ilist, const int red_blocks);
|
||||
|
||||
/// Copy energy and virial data into LAMMPS memory
|
||||
double energy_virial(double *eatom, double **vatom, double *virial);
|
||||
@ -128,11 +128,13 @@ class Answer {
|
||||
void get_answers(double **f, double **tor);
|
||||
|
||||
inline double get_answers(double **f, double **tor, double *eatom,
|
||||
double **vatom, double *virial, double &ecoul) {
|
||||
double **vatom, double *virial, double &ecoul,
|
||||
int &error_flag_in) {
|
||||
double ta=MPI_Wtime();
|
||||
time_answer.sync_stop();
|
||||
_time_cpu_idle+=MPI_Wtime()-ta;
|
||||
double ts=MPI_Wtime();
|
||||
if (error_flag[0]) error_flag_in=error_flag[0];
|
||||
double evdw=energy_virial(eatom,vatom,virial,ecoul);
|
||||
get_answers(f,tor);
|
||||
_time_cast+=MPI_Wtime()-ts;
|
||||
@ -151,6 +153,8 @@ class Answer {
|
||||
UCL_Vector<acctyp,acctyp> force;
|
||||
/// Energy and virial per-atom storage
|
||||
UCL_Vector<acctyp,acctyp> engv;
|
||||
/// Error flag
|
||||
UCL_Vector<int,int> error_flag;
|
||||
|
||||
/// Device timers
|
||||
UCL_Timer time_answer;
|
||||
@ -162,7 +166,7 @@ class Answer {
|
||||
bool alloc(const int inum);
|
||||
|
||||
bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other;
|
||||
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields;
|
||||
int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride;
|
||||
int *_ilist;
|
||||
double _time_cast, _time_cpu_idle;
|
||||
|
||||
|
||||
@ -414,9 +414,9 @@ const char *atom=0;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void AtomT::compile_kernels(UCL_Device &dev) {
|
||||
std::string flags = "-D"+std::string(OCL_VENDOR);
|
||||
std::string flags = "";
|
||||
atom_program=new UCL_Program(dev);
|
||||
atom_program->load_string(atom,flags);
|
||||
atom_program->load_string(atom,flags,nullptr,screen);
|
||||
k_cast_x.set_function(*atom_program,"kernel_cast_x");
|
||||
_compiled=true;
|
||||
}
|
||||
|
||||
@ -24,6 +24,9 @@
|
||||
#include "geryon/ocl_mat.h"
|
||||
#include "geryon/ocl_kernel.h"
|
||||
using namespace ucl_opencl;
|
||||
#ifndef LAL_NO_OCL_EV_JIT
|
||||
#define LAL_OCL_EV_JIT
|
||||
#endif
|
||||
#elif defined(USE_CUDART)
|
||||
#include "geryon/nvc_timer.h"
|
||||
#include "geryon/nvc_mat.h"
|
||||
@ -178,7 +181,7 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -197,7 +200,26 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
/// Pack LAMMPS atom type constants into 2 vectors and copy to device
|
||||
template <class dev_typ, class t1, class t2>
|
||||
inline void type_pack2(const int n, UCL_D_Vec<dev_typ> &dev_v,
|
||||
UCL_H_Vec<numtyp> &buffer, t1 ***one, t2 ***two) {
|
||||
int ii=0;
|
||||
for (int i=0; i<n; i++) {
|
||||
for (int j=0; j<n; j++) {
|
||||
for (int k=0; k<n; k++) {
|
||||
buffer[ii*2]=static_cast<numtyp>(one[i][j][k]);
|
||||
buffer[ii*2+1]=static_cast<numtyp>(two[i][j][k]);
|
||||
ii++;
|
||||
}
|
||||
}
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view_offset(0,buffer,n*n*n);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -217,7 +239,7 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -238,7 +260,7 @@ class Atom {
|
||||
ii+=m_size-n;
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev);
|
||||
view.view_offset(0,buffer,m_size*m_size);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -251,7 +273,7 @@ class Atom {
|
||||
buffer[i*2+1]=static_cast<numtyp>(two[i][i]);
|
||||
}
|
||||
UCL_H_Vec<dev_typ> view;
|
||||
view.view((dev_typ*)buffer.begin(),n,*dev);
|
||||
view.view_offset(0,buffer,n);
|
||||
ucl_copy(dev_v,view,false);
|
||||
}
|
||||
|
||||
@ -261,6 +283,9 @@ class Atom {
|
||||
inline void data_unavail()
|
||||
{ _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; }
|
||||
|
||||
typedef struct { double x,y,z; } vec3d;
|
||||
typedef struct { numtyp x,y,z,w; } vec4d_t;
|
||||
|
||||
/// Cast positions and types to write buffer
|
||||
inline void cast_x_data(double **host_ptr, const int *host_type) {
|
||||
if (_x_avail==false) {
|
||||
@ -269,13 +294,16 @@ class Atom {
|
||||
memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int));
|
||||
#else
|
||||
int wl=0;
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
|
||||
vec4d_t *xp=reinterpret_cast<vec4d_t*>(&(x[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (int i=0; i<_nall; i++) {
|
||||
x[wl]=host_ptr[i][0];
|
||||
x[wl+1]=host_ptr[i][1];
|
||||
x[wl+2]=host_ptr[i][2];
|
||||
x[wl+3]=host_type[i];
|
||||
wl+=4;
|
||||
xp[i].x=host_p[i].x;
|
||||
xp[i].y=host_p[i].y;
|
||||
xp[i].z=host_p[i].z;
|
||||
xp[i].w=host_type[i];
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
@ -320,6 +348,11 @@ class Atom {
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp));
|
||||
else
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall; i++) q[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
@ -346,6 +379,11 @@ class Atom {
|
||||
} else if (sizeof(numtyp)==sizeof(double))
|
||||
memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp));
|
||||
else
|
||||
#if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp parallel for simd schedule(static)
|
||||
#elif (LAL_USE_OMP_SIMD == 1)
|
||||
#pragma omp simd
|
||||
#endif
|
||||
for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i];
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
}
|
||||
@ -370,13 +408,16 @@ class Atom {
|
||||
memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double));
|
||||
memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int));
|
||||
#else
|
||||
int wl=0;
|
||||
vec3d *host_p=reinterpret_cast<vec3d*>(&(host_ptr[0][0]));
|
||||
vec4d_t *vp=reinterpret_cast<vec4d_t*>(&(v[0]));
|
||||
#if (LAL_USE_OMP == 1)
|
||||
#pragma omp parallel for schedule(static)
|
||||
#endif
|
||||
for (int i=0; i<_nall; i++) {
|
||||
v[wl]=host_ptr[i][0];
|
||||
v[wl+1]=host_ptr[i][1];
|
||||
v[wl+2]=host_ptr[i][2];
|
||||
v[wl+3]=host_tag[i];
|
||||
wl+=4;
|
||||
vp[i].x=host_p[i].x;
|
||||
vp[i].y=host_p[i].y;
|
||||
vp[i].z=host_p[i].z;
|
||||
vp[i].w=host_tag[i];
|
||||
}
|
||||
#endif
|
||||
_time_cast+=MPI_Wtime()-t;
|
||||
|
||||
@ -40,170 +40,521 @@
|
||||
nbor_begin+=offset; \
|
||||
}
|
||||
|
||||
#if (ARCH < 300)
|
||||
#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \
|
||||
i, numj, stride, nbor_end, nbor_begin) \
|
||||
i=nbor_mem[ii]; \
|
||||
nbor_begin=ii+nbor_stride; \
|
||||
numj=nbor_mem[nbor_begin]; \
|
||||
nbor_begin+=nbor_stride+ii*(t_per_atom-1); \
|
||||
stride=fast_mul(t_per_atom,nbor_stride); \
|
||||
nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \
|
||||
(t_per_atom-1)); \
|
||||
nbor_begin+=offset;
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR]; \
|
||||
red_acc[0][tid]=f.x; \
|
||||
red_acc[1][tid]=f.y; \
|
||||
red_acc[2][tid]=f.z; \
|
||||
red_acc[3][tid]=energy; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
#if (SHUFFLE_AVAIL == 0)
|
||||
|
||||
#define simd_reduce_add1(width, local, offset, tid, one) \
|
||||
local[0][tid]=one; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (offset==0) one=local[0][tid];
|
||||
|
||||
#define simd_reduce_add2(width, local, offset, tid, one, two) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<4; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
f.x=red_acc[0][tid]; \
|
||||
f.y=red_acc[1][tid]; \
|
||||
f.z=red_acc[2][tid]; \
|
||||
energy=red_acc[3][tid]; \
|
||||
if (vflag>0) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid]=virial[r]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r]=red_acc[r][tid]; \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_add3(width, local, offset, tid, one, two, three) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
local[2][tid]=three; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
local[2][tid] += local[2][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
three=local[2][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_add6(width, local, offset, tid, one, two, three, \
|
||||
four, five, six) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
local[2][tid]=three; \
|
||||
local[3][tid]=four; \
|
||||
local[4][tid]=five; \
|
||||
local[5][tid]=six; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
local[2][tid] += local[2][tid+s]; \
|
||||
local[3][tid] += local[3][tid+s]; \
|
||||
local[4][tid] += local[4][tid+s]; \
|
||||
local[5][tid] += local[5][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
three=local[2][tid]; \
|
||||
four=local[3][tid]; \
|
||||
five=local[4][tid]; \
|
||||
six=local[5][tid]; \
|
||||
}
|
||||
|
||||
#define simd_reduce_arr(trip, width, local, offset, tid, arr) \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid]=arr[r]; \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r]=local[r][tid]; \
|
||||
}
|
||||
|
||||
#define block_reduce_add1(width, local, tid, one) \
|
||||
local[0][tid]=one; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) local[0][tid] += local[0][tid+s]; \
|
||||
} \
|
||||
if (tid==0) one=local[0][tid]; \
|
||||
}
|
||||
|
||||
#define block_reduce_add2(width, local, tid, one, two) \
|
||||
local[0][tid]=one; \
|
||||
local[1][tid]=two; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) { \
|
||||
local[0][tid] += local[0][tid+s]; \
|
||||
local[1][tid] += local[1][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid==0) { \
|
||||
one=local[0][tid]; \
|
||||
two=local[1][tid]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define block_reduce_arr(trip, width, local, tid, arr) \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid]=arr[r]; \
|
||||
for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \
|
||||
__syncthreads(); \
|
||||
if (tid < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid<width) { \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
simdsync(); \
|
||||
if (tid < s) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
local[r][tid] += local[r][tid+s]; \
|
||||
} \
|
||||
} \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r]=local[r][tid]; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define local_allocate_store_pair() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
#define local_allocate_store_charge() \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR];
|
||||
#define local_allocate_store_bio() \
|
||||
__local acctyp red_acc[6][BLOCK_BIO_PAIR];
|
||||
#define local_allocate_store_ellipse() \
|
||||
__local acctyp red_acc[6][BLOCK_ELLIPSE];
|
||||
#define local_allocate_store_three() \
|
||||
__local acctyp red_acc[6][BLOCK_ELLIPSE];
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
|
||||
if (EVFLAG && (vflag==2 || eflag==2)) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (EVFLAG && (eflag || vflag)) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_add1(simd_size(), red_acc, tid, energy); \
|
||||
if (vflag) __syncthreads(); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (EVFLAG && eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
__local acctyp red_acc[6][BLOCK_PAIR]; \
|
||||
red_acc[0][tid]=f.x; \
|
||||
red_acc[1][tid]=f.y; \
|
||||
red_acc[2][tid]=f.z; \
|
||||
red_acc[3][tid]=energy; \
|
||||
red_acc[4][tid]=e_coul; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<5; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \
|
||||
if (EVFLAG && (vflag==2 || eflag==2)) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \
|
||||
} \
|
||||
} \
|
||||
f.x=red_acc[0][tid]; \
|
||||
f.y=red_acc[1][tid]; \
|
||||
f.z=red_acc[2][tid]; \
|
||||
energy=red_acc[3][tid]; \
|
||||
e_coul=red_acc[4][tid]; \
|
||||
if (vflag>0) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid]=virial[r]; \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
if (offset < s) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
red_acc[r][tid] += red_acc[r][tid+s]; \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (EVFLAG && (eflag || vflag)) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
if (eflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_add2(simd_size(), red_acc, tid, energy, e_coul); \
|
||||
if (vflag) __syncthreads(); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r]=red_acc[r][tid]; \
|
||||
if (vflag) { \
|
||||
simdsync(); \
|
||||
block_reduce_arr(6, simd_size(), red_acc, tid, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (EVFLAG && eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (EVFLAG && vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \
|
||||
eflag, vflag, ans, engv) \
|
||||
#define simd_reduce_add1(width, one) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) one += shfl_down(one, s, width);
|
||||
|
||||
#define simd_reduce_add2(width, one, two) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_add3(width, one, two, three) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
three += shfl_down(three, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_add6(width, one, two, three, four, five, six) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
one += shfl_down(one, s, width); \
|
||||
two += shfl_down(two, s, width); \
|
||||
three += shfl_down(three, s, width); \
|
||||
four += shfl_down(four, s, width); \
|
||||
five += shfl_down(five, s, width); \
|
||||
six += shfl_down(six, s, width); \
|
||||
}
|
||||
|
||||
#define simd_reduce_arr(trip, width, arr) \
|
||||
for (unsigned int s=width/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<trip; r++) \
|
||||
arr[r] += shfl_down(arr[r], s, width); \
|
||||
}
|
||||
|
||||
#if (EVFLAG == 1)
|
||||
|
||||
#define local_allocate_store_pair() \
|
||||
__local acctyp red_acc[7][BLOCK_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_charge() \
|
||||
__local acctyp red_acc[8][BLOCK_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_bio() \
|
||||
__local acctyp red_acc[8][BLOCK_BIO_PAIR / SIMD_SIZE];
|
||||
#define local_allocate_store_ellipse()
|
||||
#define local_allocate_store_three() \
|
||||
__local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE];
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_xor(f.x, s, t_per_atom); \
|
||||
f.y += shfl_xor(f.y, s, t_per_atom); \
|
||||
f.z += shfl_xor(f.z, s, t_per_atom); \
|
||||
energy += shfl_xor(energy, s, t_per_atom); \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (vflag==2 || eflag==2) { \
|
||||
if (eflag) \
|
||||
simd_reduce_add1(t_per_atom,energy); \
|
||||
if (vflag) \
|
||||
simd_reduce_arr(6, t_per_atom,virial); \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (eflag || vflag) { \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int vwidth = simd_size(); \
|
||||
const int voffset = tid & (simd_size() - 1); \
|
||||
const int bnum = tid/simd_size(); \
|
||||
int active_subgs = BLOCK_SIZE_X/simd_size(); \
|
||||
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
|
||||
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
|
||||
if (bnum < active_subgs) { \
|
||||
if (eflag) { \
|
||||
simd_reduce_add1(vwidth, energy); \
|
||||
if (voffset==0) red_acc[6][bnum] = energy; \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (voffset==0) \
|
||||
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
__syncthreads(); \
|
||||
if (tid < active_subgs) { \
|
||||
if (eflag) energy = red_acc[6][tid]; \
|
||||
if (vflag) \
|
||||
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
|
||||
} else { \
|
||||
if (eflag) energy = (acctyp)0; \
|
||||
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (bnum == 0) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simd_reduce_add1(vwidth, energy); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
f.x += shfl_xor(f.x, s, t_per_atom); \
|
||||
f.y += shfl_xor(f.y, s, t_per_atom); \
|
||||
f.z += shfl_xor(f.z, s, t_per_atom); \
|
||||
energy += shfl_xor(energy, s, t_per_atom); \
|
||||
e_coul += shfl_xor(e_coul, s, t_per_atom); \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (vflag==2 || eflag==2) { \
|
||||
if (eflag) \
|
||||
simd_reduce_add2(t_per_atom,energy,e_coul); \
|
||||
if (vflag) \
|
||||
simd_reduce_arr(6, t_per_atom,virial); \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
|
||||
for (int r=0; r<6; r++) \
|
||||
virial[r] += shfl_xor(virial[r], s, t_per_atom); \
|
||||
} \
|
||||
if (offset==0 && ii<inum) ans[ii]=f; \
|
||||
if (eflag || vflag) { \
|
||||
if (eflag!=2 && vflag!=2) { \
|
||||
const int vwidth = simd_size(); \
|
||||
const int voffset = tid & (simd_size() - 1); \
|
||||
const int bnum = tid/simd_size(); \
|
||||
int active_subgs = BLOCK_SIZE_X/simd_size(); \
|
||||
for ( ; active_subgs > 1; active_subgs /= vwidth) { \
|
||||
if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \
|
||||
if (bnum < active_subgs) { \
|
||||
if (eflag) { \
|
||||
simd_reduce_add2(vwidth, energy, e_coul); \
|
||||
if (voffset==0) { \
|
||||
red_acc[6][bnum] = energy; \
|
||||
red_acc[7][bnum] = e_coul; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (voffset==0) \
|
||||
for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
__syncthreads(); \
|
||||
if (tid < active_subgs) { \
|
||||
if (eflag) { \
|
||||
energy = red_acc[6][tid]; \
|
||||
e_coul = red_acc[7][tid]; \
|
||||
} \
|
||||
if (vflag) \
|
||||
for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \
|
||||
} else { \
|
||||
if (eflag) energy = e_coul = (acctyp)0; \
|
||||
if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
if (bnum == 0) { \
|
||||
int ei=BLOCK_ID_X; \
|
||||
const int ev_stride=NUM_BLOCKS_X; \
|
||||
if (eflag) { \
|
||||
simd_reduce_add2(vwidth, energy, e_coul); \
|
||||
if (tid==0) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
if (vflag) { \
|
||||
simd_reduce_arr(6, vwidth, virial); \
|
||||
if (tid==0) { \
|
||||
for (int r=0; r<6; r++) { \
|
||||
engv[ei]=virial[r]*(acctyp)0.5; \
|
||||
ei+=ev_stride; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
if (offset==0) { \
|
||||
} \
|
||||
} else if (offset==0 && ii<inum) { \
|
||||
int ei=ii; \
|
||||
if (eflag>0) { \
|
||||
if (eflag) { \
|
||||
engv[ei]=energy*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
engv[ei]=e_coul*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
if (vflag>0) { \
|
||||
if (vflag) { \
|
||||
for (int i=0; i<6; i++) { \
|
||||
engv[ei]=virial[i]*(acctyp)0.5; \
|
||||
ei+=inum; \
|
||||
} \
|
||||
} \
|
||||
ans[ii]=f; \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define local_allocate_store_pair()
|
||||
#define local_allocate_store_charge()
|
||||
#define local_allocate_store_bio()
|
||||
#define local_allocate_store_ellipse()
|
||||
#define local_allocate_store_three()
|
||||
|
||||
#define store_answers(f, energy, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) ans[ii]=f;
|
||||
|
||||
#define store_answers_q(f, energy, e_coul, virial, ii, inum, tid, \
|
||||
t_per_atom, offset, eflag, vflag, ans, engv) \
|
||||
if (t_per_atom>1) \
|
||||
simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \
|
||||
if (offset==0 && ii<inum) ans[ii]=f;
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -21,12 +21,15 @@ namespace LAMMPS_AL {
|
||||
extern Device<PRECISION,ACC_PRECISION> global_device;
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) {
|
||||
BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0), _onetype(0) {
|
||||
device=&global_device;
|
||||
ans=new Answer<numtyp,acctyp>();
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
pair_program_noev=nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -36,6 +39,10 @@ BaseAtomicT::~BaseAtomic() {
|
||||
k_pair_fast.clear();
|
||||
k_pair.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
k_pair_noev.clear();
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -49,7 +56,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
const int max_nbors, const int maxspecial,
|
||||
const double cell_size, const double gpu_split,
|
||||
FILE *_screen, const void *pair_program,
|
||||
const char *k_name) {
|
||||
const char *k_name, const int onetype) {
|
||||
screen=_screen;
|
||||
|
||||
int gpu_nbor=0;
|
||||
@ -64,28 +71,29 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_atom();
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
atom=&device->atom;
|
||||
|
||||
_block_size=device->pair_block_size();
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
compile_kernels(*ucl_device,pair_program,k_name,onetype);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
@ -102,8 +110,8 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAtomicT::estimate_gpu_overhead() {
|
||||
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
|
||||
void BaseAtomicT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -164,8 +172,8 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum,
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
||||
tag, nspecial, special, success, mn, ans->error_flag);
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
@ -179,11 +187,25 @@ template <class numtyp, class acctyp>
|
||||
void BaseAtomicT::compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
@ -207,8 +229,8 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full,
|
||||
hd_balancer.start_timer();
|
||||
atom->add_x_data(host_x,host_type);
|
||||
|
||||
loop(eflag,vflag);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
@ -220,12 +242,26 @@ template <class numtyp, class acctyp>
|
||||
int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
double *sublo, double *subhi, tagint *tag,
|
||||
int **nspecial, tagint **special, const bool eflag,
|
||||
const bool vflag, const bool eatom,
|
||||
const bool vatom, int &host_start,
|
||||
int **ilist, int **jnum,
|
||||
int **nspecial, tagint **special,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, int **ilist, int **jnum,
|
||||
const double cpu_time, bool &success) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
@ -254,8 +290,8 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full,
|
||||
*ilist=nbor->host_ilist.begin();
|
||||
*jnum=nbor->host_acc.begin();
|
||||
|
||||
loop(eflag,vflag);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom);
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
|
||||
@ -270,19 +306,46 @@ double BaseAtomicT::host_memory_usage_atomic() const {
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *kname) {
|
||||
if (_compiled)
|
||||
const char *kname, const int onetype) {
|
||||
if (_compiled && _onetype==onetype)
|
||||
return;
|
||||
_onetype=onetype;
|
||||
|
||||
std::string s_fast=std::string(kname)+"_fast";
|
||||
if (pair_program) delete pair_program;
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,device->compile_string().c_str());
|
||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
|
||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_pair_fast.set_function(*pair_program,s_fast.c_str());
|
||||
k_pair.set_function(*pair_program,kname);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
oclstring = device->compile_string()+" -DEVFLAG=0";
|
||||
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
pair_program_noev=new UCL_Program(dev);
|
||||
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
|
||||
#else
|
||||
k_pair_sel = &k_pair_fast;
|
||||
#endif
|
||||
|
||||
_compiled=true;
|
||||
|
||||
#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
|
||||
if (dev.cl_device_version() >= 210) {
|
||||
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
|
||||
#endif
|
||||
if (_threads_per_atom > mx_subgroup_sz)
|
||||
_threads_per_atom = mx_subgroup_sz;
|
||||
device->set_simd_size(mx_subgroup_sz);
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
template class BaseAtomic<PRECISION,ACC_PRECISION>;
|
||||
|
||||
@ -53,10 +53,11 @@ class BaseAtomic {
|
||||
int init_atomic(const int nlocal, const int nall, const int max_nbors,
|
||||
const int maxspecial, const double cell_size,
|
||||
const double gpu_split, FILE *screen,
|
||||
const void *pair_program, const char *k_name);
|
||||
const void *pair_program, const char *k_name,
|
||||
const int onetype=0);
|
||||
|
||||
/// Estimate the overhead for GPU context changes and CPU driver
|
||||
void estimate_gpu_overhead();
|
||||
void estimate_gpu_overhead(const int add_kernels=0);
|
||||
|
||||
/// Check if there is enough storage for atom arrays and realloc if not
|
||||
/** \param success set to false if insufficient memory **/
|
||||
@ -100,7 +101,7 @@ class BaseAtomic {
|
||||
/// Accumulate timers
|
||||
inline void acc_timers() {
|
||||
if (device->time_device()) {
|
||||
nbor->acc_timers();
|
||||
nbor->acc_timers(screen);
|
||||
time_pair.add_to_total();
|
||||
atom->acc_timers();
|
||||
ans->acc_timers();
|
||||
@ -179,23 +180,31 @@ class BaseAtomic {
|
||||
Neighbor *nbor;
|
||||
|
||||
// ------------------------- DEVICE KERNELS -------------------------
|
||||
UCL_Program *pair_program;
|
||||
UCL_Kernel k_pair_fast, k_pair;
|
||||
UCL_Program *pair_program, *pair_program_noev;
|
||||
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
|
||||
inline int block_size() { return _block_size; }
|
||||
inline void set_kernel(const int eflag, const int vflag) {
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
if (eflag || vflag) k_pair_sel = &k_pair_fast;
|
||||
else k_pair_sel = &k_pair_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
// --------------------------- TEXTURES -----------------------------
|
||||
UCL_Texture pos_tex;
|
||||
|
||||
protected:
|
||||
bool _compiled;
|
||||
int _block_size, _threads_per_atom;
|
||||
int _block_size, _threads_per_atom, _onetype;
|
||||
double _max_bytes, _max_an_bytes;
|
||||
double _gpu_overhead, _driver_overhead;
|
||||
UCL_D_Vec<int> *_nbor_data;
|
||||
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
|
||||
void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k,
|
||||
const int onetype);
|
||||
|
||||
virtual void loop(const bool _eflag, const bool _vflag) = 0;
|
||||
virtual int loop(const int eflag, const int vflag) = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -27,6 +27,9 @@ BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) {
|
||||
nbor=new Neighbor();
|
||||
pair_program=nullptr;
|
||||
ucl_device=nullptr;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
pair_program_noev=nullptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -36,6 +39,10 @@ BaseChargeT::~BaseCharge() {
|
||||
k_pair_fast.clear();
|
||||
k_pair.clear();
|
||||
if (pair_program) delete pair_program;
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
k_pair_noev.clear();
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -64,21 +71,11 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
_gpu_host=1;
|
||||
|
||||
_threads_per_atom=device->threads_per_charge();
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
if (ucl_device!=device->gpu) _compiled=false;
|
||||
|
||||
ucl_device=device->gpu;
|
||||
@ -88,6 +85,17 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
_block_bio_size=device->block_bio_pair();
|
||||
compile_kernels(*ucl_device,pair_program,k_name);
|
||||
|
||||
if (_threads_per_atom>1 && gpu_nbor==0) {
|
||||
nbor->packing(true);
|
||||
_nbor_data=&(nbor->dev_packed);
|
||||
} else
|
||||
_nbor_data=&(nbor->dev_nbor);
|
||||
|
||||
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
|
||||
max_nbors,cell_size,false,_threads_per_atom);
|
||||
if (success!=0)
|
||||
return success;
|
||||
|
||||
// Initialize host-device load balancer
|
||||
hd_balancer.init(device,gpu_nbor,gpu_split);
|
||||
|
||||
@ -104,8 +112,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseChargeT::estimate_gpu_overhead() {
|
||||
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
|
||||
void BaseChargeT::estimate_gpu_overhead(const int add_kernels) {
|
||||
device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead);
|
||||
}
|
||||
|
||||
template <class numtyp, class acctyp>
|
||||
@ -166,8 +174,8 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum,
|
||||
atom->cast_copy_x(host_x,host_type);
|
||||
|
||||
int mn;
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
|
||||
nspecial, special, success, mn);
|
||||
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
|
||||
tag, nspecial, special, success, mn, ans->error_flag);
|
||||
|
||||
double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
|
||||
if (bytes>_max_an_bytes)
|
||||
@ -181,12 +189,26 @@ template <class numtyp, class acctyp>
|
||||
void BaseChargeT::compute(const int f_ago, const int inum_full,
|
||||
const int nall, double **host_x, int *host_type,
|
||||
int *ilist, int *numj, int **firstneigh,
|
||||
const bool eflag, const bool vflag,
|
||||
const bool eflag_in, const bool vflag_in,
|
||||
const bool eatom, const bool vatom,
|
||||
int &host_start, const double cpu_time,
|
||||
bool &success, double *host_q,
|
||||
const int nlocal, double *boxlo, double *prd) {
|
||||
acc_timers();
|
||||
int eflag, vflag;
|
||||
if (eatom) eflag=2;
|
||||
else if (eflag_in) eflag=1;
|
||||
else eflag=0;
|
||||
if (vatom) vflag=2;
|
||||
else if (vflag_in) vflag=1;
|
||||
else vflag=0;
|
||||
|
||||
#ifdef LAL_NO_BLOCK_REDUCE
|
||||
if (eflag) eflag=2;
|
||||
if (vflag) vflag=2;
|
||||
#endif
|
||||
|
||||
set_kernel(eflag,vflag);
|
||||
if (inum_full==0) {
|
||||
host_start=0;
|
||||
// Make sure textures are correct if realloc by a different hybrid style
|
||||
@ -215,8 +237,8 @@ void BaseChargeT::compute(const int f_ago, const int inum_full,
|
||||
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
|
||||
boxlo, prd);
|
||||
|
||||
loop(eflag,vflag);
|
||||
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
|
||||
const int red_blocks=loop(eflag,vflag);
|
||||
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
|
||||
device->add_ans_object(ans);
|
||||
hd_balancer.stop_timer();
|
||||
}
|
||||
@ -228,13 +250,27 @@ template <class numtyp, class acctyp>
int** BaseChargeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -269,8 +305,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full,
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();

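Throughout the update, loop() now returns the number of reduction blocks the kernel launched, and copy_answers() receives that count (red_blocks) so the host knows how many partial energy/virial sums remain to fold when the device did not finish the reduction. A hedged sketch of such a host-side fold, under that assumption (names hypothetical, not the library's API):

    // Fold the per-block partial sums staged by the device kernel.
    double reduce_partials(const double *partials, const int red_blocks) {
      double total = 0.0;
      for (int i = 0; i < red_blocks; ++i)
        total += partials[i];
      return total;
    }
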
@ -292,13 +328,37 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseCharge<PRECISION,ACC_PRECISION>;
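
With LAL_OCL_EV_JIT defined, compile_kernels() above builds the same pair-kernel source twice: the primary program with -DEVFLAG=1 compiles the energy/virial accumulation paths in, while the *_noev program with -DEVFLAG=0 strips them for force-only timesteps. A minimal sketch of just the flag construction, assuming a base string like the one device->compile_string() supplies (the helper name is hypothetical):

    #include <string>
    #include <utility>

    // Returns {flags for the EV build, flags for the no-EV build}.
    std::pair<std::string, std::string> ev_jit_flags(const std::string &base) {
      return {base + " -DEVFLAG=1",   // accumulates energy/virial
              base + " -DEVFLAG=0"};  // leaner force-only variant
    }

The subgroup query that follows (clamping _threads_per_atom to max_subgroup_size) guards against requesting more lanes per atom than one OpenCL subgroup can supply on devices reporting OpenCL 2.1 or newer.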

@ -57,7 +57,7 @@ class BaseCharge {
const void *pair_program, const char *k_name);

/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
void estimate_gpu_overhead(const int add_kernels=0);

/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
@ -103,7 +103,7 @@ class BaseCharge {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -177,9 +177,15 @@ class BaseCharge {
Neighbor *nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}

// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
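
set_kernel() above is what makes the dual compilation pay off: it is called once per compute() before loop(), flipping k_pair_sel between the two compiled variants. A hedged standalone illustration of the pointer-select pattern (the Kernel type here is a simplified stand-in, not the Geryon class):

    struct Kernel { /* compiled kernel handle */ };

    struct PairStyle {
      Kernel k_pair_fast, k_pair_noev;
      Kernel *k_pair_sel = &k_pair_fast;
      // Pick the cheaper no-EV kernel whenever neither energy nor
      // virial output is needed on the current timestep.
      void set_kernel(const int eflag, const int vflag) {
        k_pair_sel = (eflag || vflag) ? &k_pair_fast : &k_pair_noev;
      }
    };
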
@ -194,7 +200,7 @@ class BaseCharge {

void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);

virtual void loop(const bool _eflag, const bool _vflag) = 0;
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -27,6 +27,9 @@ BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseDipoleT::~BaseDipole() {
k_pair_fast.clear();
k_pair.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -65,30 +72,30 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
_gpu_host=1;

_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);

int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
atom=&device->atom;

_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);

if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);

@ -168,8 +175,8 @@ inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type);

int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
@ -183,12 +190,26 @@ template <class numtyp, class acctyp>
void BaseDipoleT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double *host_q, double **host_mu,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -219,8 +240,8 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full,
device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q,
boxlo, prd);

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -232,14 +253,28 @@ template <class numtyp, class acctyp>
int** BaseDipoleT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double *host_q, double **host_mu,
double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -277,8 +312,8 @@ int** BaseDipoleT::compute(const int ago, const int inum_full,
device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q,
boxlo, prd);

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();

@ -300,14 +335,38 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str,
std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
q_tex.get_texture(*pair_program,"q_tex");
mu_tex.get_texture(*pair_program,"mu_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseDipole<PRECISION,ACC_PRECISION>;

@ -102,7 +102,7 @@ class BaseDipole {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -176,9 +176,16 @@ class BaseDipole {
Neighbor *nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
@ -187,14 +194,14 @@ class BaseDipole {

protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);

virtual void loop(const bool _eflag, const bool _vflag) = 0;
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -27,6 +27,9 @@ BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) {
nbor=new Neighbor();
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -36,6 +39,10 @@ BaseDPDT::~BaseDPD() {
k_pair_fast.clear();
k_pair.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -47,9 +54,9 @@ int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const {
template <class numtyp, class acctyp>
int BaseDPDT::init_atomic(const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size,
const double gpu_split, FILE *_screen,
const void *pair_program, const char *k_name) {
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *k_name, const int onetype) {
screen=_screen;

int gpu_nbor=0;
@ -63,31 +70,30 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;

_threads_per_atom=device->threads_per_charge();
if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);
_threads_per_atom=device->threads_per_atom();

int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);

if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
atom=&device->atom;

_block_size=device->pair_block_size();
_block_bio_size=device->block_bio_pair();
compile_kernels(*ucl_device,pair_program,k_name);
compile_kernels(*ucl_device,pair_program,k_name,onetype);

if (_threads_per_atom>1 && gpu_nbor==0) {
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else
_nbor_data=&(nbor->dev_nbor);

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
@ -167,8 +173,8 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type);

int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
if (bytes>_max_an_bytes)
@ -179,16 +185,30 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum,
// Copy nbor list from host if necessary and then calculate forces, virials,..
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BaseDPDT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, tagint *tag, double **host_v,
const double dtinvsqrt, const int seed, const int timestep,
void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start,
const double cpu_time, bool &success, tagint *tag,
double **host_v, const double dtinvsqrt,
const int seed, const int timestep,
const int nlocal, double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -218,8 +238,8 @@ void BaseDPDT::compute(const int f_ago, const int inum_full,
_seed = seed;
_timestep = timestep;

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();
}
@ -231,8 +251,8 @@ template <class numtyp, class acctyp>
int** BaseDPDT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
int **nspecial, tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success,
@ -240,6 +260,20 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
const int seed, const int timestep,
double *boxlo, double *prd) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -275,8 +309,8 @@ int** BaseDPDT::compute(const int ago, const int inum_full,
_seed = seed;
_timestep = timestep;

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
hd_balancer.stop_timer();

@ -291,20 +325,48 @@ double BaseDPDT::host_memory_usage_atomic() const {

template <class numtyp, class acctyp>
void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str,
const char *kname) {
if (_compiled)
const char *kname, const int onetype) {
if (_compiled && _onetype==onetype)
return;

_onetype=onetype;

std::string s_fast=std::string(kname)+"_fast";
if (pair_program) delete pair_program;
pair_program=new UCL_Program(dev);
pair_program->load_string(pair_str,device->compile_string().c_str());
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_fast.set_function(*pair_program,s_fast.c_str());
k_pair.set_function(*pair_program,kname);
pos_tex.get_texture(*pair_program,"pos_tex");
vel_tex.get_texture(*pair_program,"vel_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype);
if (pair_program_noev) delete pair_program_noev;
pair_program_noev=new UCL_Program(dev);
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
k_pair_noev.set_function(*pair_program_noev,s_fast.c_str());
#else
k_pair_sel = &k_pair_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size);
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseDPD<PRECISION,ACC_PRECISION>;
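
BaseDPD adds one more JIT axis on top of EVFLAG: when a pair style uses a single atom type, init_atomic() passes onetype through and compile_kernels() appends -DONETYPE=<type> so the kernel can hard-code the type and skip per-pair parameter lookups; _onetype is cached so a later init with a different value forces recompilation. A hedged sketch of that guard and flag construction (names hypothetical; the original uses device->toa() rather than std::to_string):

    #include <string>

    // Rebuild only when the specialization actually changes.
    bool needs_recompile(const bool compiled, const int cached, const int onetype) {
      return !compiled || cached != onetype;
    }

    std::string add_onetype(std::string flags, const int onetype) {
      if (onetype) flags += " -DONETYPE=" + std::to_string(onetype);
      return flags;
    }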

@ -52,7 +52,8 @@ class BaseDPD {
int init_atomic(const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_name);
const void *pair_program, const char *k_name,
const int onetype=0);

/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
@ -101,7 +102,7 @@ class BaseDPD {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -177,9 +178,16 @@ class BaseDPD {
Neighbor *nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Kernel k_pair_fast, k_pair;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) k_pair_sel = &k_pair_fast;
else k_pair_sel = &k_pair_noev;
#endif
}


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
@ -191,13 +199,14 @@ class BaseDPD {

protected:
bool _compiled;
int _block_size, _block_bio_size, _threads_per_atom;
int _block_size, _threads_per_atom, _onetype;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k);
virtual void loop(const bool _eflag, const bool _vflag) = 0;
void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *k, const int onetype);
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -29,7 +29,8 @@ const char *ellipsoid_nbor=0;
extern Device<PRECISION,ACC_PRECISION> global_device;

template <class numtyp, class acctyp>
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0),
host_olist_size(0) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -37,6 +38,10 @@ BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) {
ellipsoid_program=nullptr;
lj_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
ellipsoid_program_noev=nullptr;
lj_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -53,6 +58,14 @@ BaseEllipsoidT::~BaseEllipsoid() {
if (nbor_program) delete nbor_program;
if (ellipsoid_program) delete ellipsoid_program;
if (lj_program) delete lj_program;
#if defined(LAL_OCL_EV_JIT)
k_ellipsoid_noev.clear();
k_ellipsoid_sphere_noev.clear();
k_sphere_ellipsoid_noev.clear();
k_lj_fast.clear();
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
if (lj_program_noev) delete lj_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -89,11 +102,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
@ -102,6 +110,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere);

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);

@ -133,12 +146,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
if (_multiple_forms && gpu_nbor!=0)
return -9;

if (_multiple_forms)
if (_multiple_forms) {
ans->force.zero();

// Memory for ilist ordered by particle type
if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS)
return -3;
host_olist_size = nbor->max_atoms();
host_olist = new int[nbor->max_atoms()];
}

_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();

@ -160,7 +172,10 @@ template <class numtyp, class acctyp>
void BaseEllipsoidT::clear_base() {
// Output any timing information
output_times();
host_olist.clear();
if (host_olist_size) {
host_olist_size = 0;
delete []host_olist;
}

time_nbor1.clear();
time_ellipsoid.clear();
@ -206,10 +221,14 @@ void BaseEllipsoidT::output_times() {
MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,
device->replica());
double max_mb=mpi_max_bytes/(1024*1024);
double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5];

#ifdef USE_OPENCL
// Workaround for timing issue on Intel OpenCL
if (times[3] > 80e6) times[3]=0.0;
#endif

if (device->replica_me()==0)
if (screen && times[5]>0.0) {
if (screen && times[7]>0.0) {
int replica_size=device->replica_size();

fprintf(screen,"\n\n-------------------------------------");
@ -218,9 +237,8 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

if (device->procs_per_gpu()==1 && t_time>0) {
if (device->procs_per_gpu()==1 && times[3]>0) {
fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size);
fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size);
if (nbor->gpu_nbor()>0)
fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size);
@ -229,13 +247,15 @@ void BaseEllipsoidT::output_times() {
fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size);
fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size);
}
if (nbor->gpu_nbor()==2)
fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size);
if (times[6]>0)
fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size);
fprintf(screen,"Average split: %.4f.\n",avg_split);
fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom);
fprintf(screen,"Vector width: %d.\n", device->simd_size());
fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb);
if (nbor->gpu_nbor()==2)
fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size);
fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size);
fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size);
fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size);
fprintf(screen,"-------------------------------------");
@ -256,11 +276,13 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start,
if (shared_types) {
k_nbor_fast.set_size(GX,BX);
k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start,
&inum, &nbor->dev_packed, &form_low, &form_high);
&inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
} else {
k_nbor.set_size(GX,BX);
k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride,
&start, &inum, &nbor->dev_packed, &form_low, &form_high);
&start, &inum, &nbor->dev_packed, &form_low, &form_high,
&_threads_per_atom);
}
}

@ -298,7 +320,7 @@ void BaseEllipsoidT::reset_nbors(const int nall, const int inum,
p++;
}
}
nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size());
nbor->get_host(inum,host_olist,numj,firstneigh,block_size());
nbor->copy_unpacked(inum,mn);
return;
}
@ -330,8 +352,8 @@ inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum,
atom->cast_copy_x(host_x,host_type);

int mn;
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(inum,mn);
_last_ellipse=inum;
_max_last_ellipse=inum;
@ -348,11 +370,18 @@ template <class numtyp, class acctyp>
int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
const int nall, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, const double cpu_time,
bool &success, double **host_quat) {
acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
zero_timers();
@ -373,7 +402,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
}
int *list;
if (_multiple_forms)
list=host_olist.begin();
list=host_olist;
else
list=ilist;

@ -384,7 +413,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
atom->add_quat_data();

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom,list);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum);
device->add_ans_object(ans);
hd_balancer.stop_timer();
return list;
@ -394,15 +423,23 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
int** BaseEllipsoidT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom,
int &host_start, int **ilist, int **jnum,
const double cpu_time, bool &success,
double **host_quat) {
acc_timers();
int eflag, vflag;
if (eflag_in) eflag=2;
else eflag=0;
if (vflag_in) vflag=2;
else vflag=0;

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
zero_timers();
@ -435,7 +472,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall
*jnum=nbor->host_acc.begin();

loop(eflag,vflag);
ans->copy_answers(eflag,vflag,eatom,vatom);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum);
device->add_ans_object(ans);
hd_balancer.stop_timer();

@ -462,25 +499,26 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
std::string s_lj=kns+"_lj";
std::string s_lj_fast=kns+"_lj_fast";

std::string flags=device->compile_string();
std::string oclstring = device->compile_string()+" -DEVFLAG=1";

if (nbor_program) delete nbor_program;
nbor_program=new UCL_Program(dev);
nbor_program->load_string(ellipsoid_nbor,flags.c_str());
nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen);
k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast");
k_nbor.set_function(*nbor_program,"kernel_nbor");
neigh_tex.get_texture(*nbor_program,"pos_tex");

if (ellipsoid_program) delete ellipsoid_program;
ellipsoid_program=new UCL_Program(dev);
ellipsoid_program->load_string(ellipsoid_string,flags.c_str());
ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid.set_function(*ellipsoid_program,kname);
pos_tex.get_texture(*ellipsoid_program,"pos_tex");
quat_tex.get_texture(*ellipsoid_program,"quat_tex");

if (lj_program) delete lj_program;
lj_program=new UCL_Program(dev);
lj_program->load_string(lj_string,flags.c_str());
lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str());
k_lj_fast.set_function(*lj_program,s_lj_fast.c_str());
k_lj.set_function(*lj_program,s_lj.c_str());
@ -489,7 +527,52 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev,
lj_pos_tex.get_texture(*lj_program,"pos_tex");
lj_quat_tex.get_texture(*lj_program,"quat_tex");

#if defined(LAL_OCL_EV_JIT)
oclstring = device->compile_string()+" -DEVFLAG=0";
if (ellipsoid_program_noev) delete ellipsoid_program_noev;
ellipsoid_program_noev=new UCL_Program(dev);
ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(),
nullptr,screen);
k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname);

if (lj_program_noev) delete lj_program_noev;
lj_program_noev=new UCL_Program(dev);
lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen);
k_sphere_ellipsoid_noev.set_function(*lj_program_noev,
s_sphere_ellipsoid.c_str());
k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str());
if (e_s)
k_ellipsoid_sphere_noev.set_function(*lj_program_noev,
s_ellipsoid_sphere.c_str());
#else
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size));
if (e_s)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif

}

template class BaseEllipsoid<PRECISION,ACC_PRECISION>;

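The subgroup clamp at the end of compile_kernels() recurs in every class in this update: the usable SIMD width is the minimum of max_subgroup_size() over every kernel that will run (including the no-EV variants), and _threads_per_atom must not exceed it. A hedged sketch of that reduction over a list of queried sizes (names hypothetical):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // 'sizes' holds max_subgroup_size(_block_size) queried per kernel.
    size_t clamp_tpa(const std::vector<size_t> &sizes, size_t &threads_per_atom) {
      size_t mx = sizes.empty() ? 1 : *std::min_element(sizes.begin(), sizes.end());
      threads_per_atom = std::min(threads_per_atom, mx);
      return mx;  // also becomes the device SIMD-size hint
    }
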
@ -88,10 +88,10 @@ class BaseEllipsoid {
ans->resize(nlocal, success);
if (_multiple_forms) ans->force.zero();

if (olist_size>static_cast<int>(host_olist.numel())) {
host_olist.clear();
int new_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
success=success && (host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS);
if (olist_size>host_olist_size) {
if (host_olist_size) delete []host_olist;
host_olist_size=static_cast<int>(static_cast<double>(olist_size)*1.10);
host_olist = new int[host_olist_size];
}

nbor->resize(nlocal,host_inum,max_nbors,success);
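
host_olist is now a plain host array rather than a UCL_H_Vec, grown with the same 10% headroom rule the old path used. A minimal sketch of that grow-only reallocation, assuming the same semantics (old contents need not be preserved; the helper name is hypothetical):

    // Grow-only: reallocate with 10% headroom when the request exceeds
    // the current capacity; existing contents are discarded.
    void grow_olist(int *&olist, int &capacity, const int needed) {
      if (needed <= capacity) return;
      if (capacity) delete [] olist;
      capacity = static_cast<int>(static_cast<double>(needed) * 1.10);
      olist = new int[capacity];
    }
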
@ -116,7 +116,7 @@ class BaseEllipsoid {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_nbor1.add_to_total();
time_ellipsoid.add_to_total();
if (_multiple_forms) {
@ -223,14 +223,40 @@ class BaseEllipsoid {
/// Neighbor data
Neighbor *nbor;
/// ilist with particles sorted by type
UCL_H_Vec<int> host_olist;
int *host_olist;
int host_olist_size;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *nbor_program, *ellipsoid_program, *lj_program;
UCL_Program *ellipsoid_program_noev, *lj_program_noev;
UCL_Kernel k_nbor_fast, k_nbor;
UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid;
UCL_Kernel k_lj_fast, k_lj;
UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev;
UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev;
UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel;
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (_multiple_forms == false) {
if (eflag || vflag) k_elps_sel = &k_ellipsoid;
else k_elps_sel = &k_ellipsoid_noev;
} else {
if (eflag || vflag) {
k_elps_sel = &k_ellipsoid;
k_elps_sphere_sel = &k_ellipsoid_sphere;
k_sphere_elps_sel = &k_sphere_ellipsoid;
k_lj_sel = &k_lj_fast;
} else {
k_elps_sel = &k_ellipsoid_noev;
k_elps_sphere_sel = &k_ellipsoid_sphere_noev;
k_sphere_elps_sel = &k_sphere_ellipsoid_noev;
k_lj_sel = &k_lj_fast_noev;
}
}
#endif
}


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex;
@ -240,7 +266,6 @@ class BaseEllipsoid {
int _block_size, _threads_per_atom;
double _max_bytes, _max_an_bytes;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

// True if we want to use fast GB-sphere or sphere-sphere calculations
bool _multiple_forms;
@ -250,7 +275,7 @@ class BaseEllipsoid {
void compile_kernels(UCL_Device &dev, const void *ellipsoid_string,
const void *lj_string, const char *kname,const bool e_s);

virtual void loop(const bool _eflag, const bool _vflag) = 0;
virtual int loop(const int eflag, const int vflag) = 0;
};

}

@ -20,7 +20,7 @@ namespace LAMMPS_AL {
extern Device<PRECISION,ACC_PRECISION> global_device;

template <class numtyp, class acctyp>
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _onetype(-1) {
device=&global_device;
ans=new Answer<numtyp,acctyp>();
nbor=new Neighbor();
@ -29,6 +29,9 @@ BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) {
#endif
pair_program=nullptr;
ucl_device=nullptr;
#if defined(LAL_OCL_EV_JIT)
pair_program_noev=nullptr;
#endif
}

template <class numtyp, class acctyp>
@ -44,6 +47,12 @@ BaseThreeT::~BaseThree() {
k_pair.clear();
k_short_nbor.clear();
if (pair_program) delete pair_program;
#if defined(LAL_OCL_EV_JIT)
k_three_center_noev.clear();
k_three_end_noev.clear();
k_pair_noev.clear();
if (pair_program_noev) delete pair_program_noev;
#endif
}

template <class numtyp, class acctyp>
@ -62,7 +71,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
const double cell_size, const double gpu_split,
FILE *_screen, const void *pair_program,
const char *two, const char *three_center,
const char *three_end, const char *short_nbor) {
const char *three_end, const char *short_nbor,
const int onetype, const int onetype3,
const int spq, const int tpa_override) {
screen=_screen;

int gpu_nbor=0;
@ -77,24 +88,16 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
if (host_nlocal>0)
_gpu_host=1;

_threads_per_atom=device->threads_per_atom();
if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1
nbor->packing(true);
_nbor_data=&(nbor->dev_packed);
} else // neigh yes or tpa == 1
_nbor_data=&(nbor->dev_nbor);
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
// Allow forcing threads per atom to 1 for tersoff due to subg sync issue
if (tpa_override)
_threads_per_atom=tpa_override;
else
_threads_per_atom=device->threads_per_three();

int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;

success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

if (ucl_device!=device->gpu) _compiled=false;

ucl_device=device->gpu;
@ -110,7 +113,19 @@ int BaseThreeT::init_three(const int nlocal, const int nall,

_block_pair=device->pair_block_size();
_block_size=device->block_ellipse();
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor);
compile_kernels(*ucl_device,pair_program,two,three_center,three_end,
short_nbor,onetype,onetype3,spq);

while (_threads_per_atom*_threads_per_atom>device->simd_size())
_threads_per_atom = _threads_per_atom / 2;

if (_threads_per_atom*_threads_per_atom>device->simd_size())
return -10;

success = device->init_nbor(nbor,nall,host_nlocal,nall,maxspecial,
_gpu_host,max_nbors,cell_size,true,1,true);
if (success!=0)
return success;

// Initialize host-device load balancer
hd_balancer.init(device,gpu_nbor,gpu_split);
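
For the three-body styles, threads per atom enter the kernels in two nested directions, so the square of _threads_per_atom must fit inside one SIMD/subgroup width; init_three() now halves the value until it does, and still bails out with -10 if it cannot. A standalone sketch of that fit, under the same power-of-two assumption (the function name is hypothetical):

    // Halve tpa until tpa*tpa fits in the device SIMD/subgroup width.
    int fit_tpa_to_simd(int tpa, const int simd_size) {
      while (tpa > 1 && tpa * tpa > simd_size)
        tpa /= 2;
      return tpa;
    }
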
@ -121,22 +136,21 @@ int BaseThreeT::init_three(const int nlocal, const int nall,

pos_tex.bind_float(atom->x,4);

int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;

_max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
_max_an_bytes+=ans2->gpu_bytes();
#endif

int ef_nall=nall;
if (ef_nall==0)
ef_nall=2000;
dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE);

return 0;
}

template <class numtyp, class acctyp>
void BaseThreeT::estimate_gpu_overhead() {
device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead);
void BaseThreeT::estimate_gpu_overhead(const int add_kernels) {
device->estimate_gpu_overhead(4+add_kernels,_gpu_overhead,_driver_overhead);
}

template <class numtyp, class acctyp>
@ -152,7 +166,6 @@ void BaseThreeT::clear_atomic() {
time_pair.clear();
hd_balancer.clear();

dev_short_nbor.clear();
nbor->clear();
ans->clear();
#ifdef THREE_CONCURRENT
@ -186,6 +199,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,

// now the requirement is removed, allowing to work within pair hybrid
nbor->get_host(nlist,ilist,numj,firstneigh,block_size());
nbor->copy_unpacked(nlist,mn);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -201,7 +215,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist,
// Build neighbor list on device
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x,
int *host_type, double *sublo,
double *subhi, tagint *tag,
@ -211,14 +225,22 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
resize_atom(inum,nall,success);
resize_local(nall,host_inum,nbor->max_nbors(),success);
if (!success)
return 0;
return;
atom->cast_copy_x(host_x,host_type);

_nall = nall;

// Increase the effective sub-domain size for neighbors of ghosts
// This is still inefficient because we are calculating neighbors for more
// ghosts than necessary due to increased ghost cutoff
const double ncut=nbor->cutoff()*2.0;
for (int i=0; i<3; i++) sublo[i]-=ncut;
for (int i=0; i<3; i++) subhi[i]+=ncut;

int mn;
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag,
nspecial, special, success, mn);
nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi,
tag, nspecial, special, success, mn, ans->error_flag);
nbor->copy_unpacked(nall,mn);

double bytes=ans->gpu_bytes()+nbor->gpu_bytes();
#ifdef THREE_CONCURRENT
@ -226,7 +248,6 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum,
#endif
if (bytes>_max_an_bytes)
_max_an_bytes=bytes;
return mn;
}

// ---------------------------------------------------------------------------
@ -236,10 +257,24 @@ template <class numtyp, class acctyp>
void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
const int nlist, double **host_x, int *host_type,
int *ilist, int *numj, int **firstneigh,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start,
const bool eflag_in, const bool vflag_in,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -260,19 +295,12 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success);
if (!success)
return;
_max_nbors = nbor->max_nbor_loop(nlist,numj,ilist);
}

atom->cast_x_data(host_x,host_type);
hd_balancer.start_timer();
atom->add_x_data(host_x,host_type);

// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}

// _ainum to be used in loop() for short neighbor list build
_ainum = nlist;

@ -282,11 +310,11 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
#ifdef THREE_CONCURRENT
ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
ans->copy_answers(eflag,vflag,eatom,vatom,ilist);
const int red_blocks=loop(eflag,vflag,evatom,success);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom,ilist);
ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks);
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
@ -296,15 +324,29 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall,
// Reneighbor on GPU if necessary and then compute forces, virials, energies
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int ** BaseThreeT::compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag,
int **nspecial, tagint **special, const bool eflag,
const bool vflag, const bool eatom,
int ** BaseThreeT::compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag_in,
const bool vflag_in, const bool eatom,
const bool vatom, int &host_start,
int **ilist, int **jnum,
const double cpu_time, bool &success) {
acc_timers();
int eflag, vflag;
if (eatom) eflag=2;
else if (eflag_in) eflag=1;
else eflag=0;
if (vatom) vflag=2;
else if (vflag_in) vflag=1;
else vflag=0;

#ifdef LAL_NO_BLOCK_REDUCE
if (eflag) eflag=2;
if (vflag) vflag=2;
#endif

set_kernel(eflag,vflag);
if (inum_full==0) {
host_start=0;
// Make sure textures are correct if realloc by a different hybrid style
@ -323,7 +365,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,

// Build neighbor list on GPU if necessary
if (ago==0) {
_max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
build_nbor_list(inum, inum_full-inum, nall, host_x, host_type,
sublo, subhi, tag, nspecial, special, success);
if (!success)
return nullptr;
@ -336,12 +378,6 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
*ilist=nbor->host_ilist.begin();
*jnum=nbor->host_acc.begin();

// re-allocate dev_short_nbor if necessary
if (nall*(2+_max_nbors) > dev_short_nbor.cols()) {
int _nmax=static_cast<int>(static_cast<double>(nall)*1.10);
dev_short_nbor.resize((2+_max_nbors)*_nmax);
}

// _ainum to be used in loop() for short neighbor list build
_ainum = nall;

@ -351,11 +387,11 @@ int ** BaseThreeT::compute(const int ago, const int inum_full,
#ifdef THREE_CONCURRENT
ucl_device->sync();
#endif
loop(eflag,vflag,evatom);
ans->copy_answers(eflag,vflag,eatom,vatom);
const int red_blocks=loop(eflag,vflag,evatom,success);
ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans);
#ifdef THREE_CONCURRENT
ans2->copy_answers(eflag,vflag,eatom,vatom);
ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks);
device->add_ans_object(ans2);
#endif
hd_balancer.stop_timer();
@ -372,14 +408,24 @@ double BaseThreeT::host_memory_usage_atomic() const {
|
||||
template <class numtyp, class acctyp>
|
||||
void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
const char *two, const char *three_center,
|
||||
const char *three_end, const char* short_nbor) {
|
||||
if (_compiled)
|
||||
const char *three_end, const char* short_nbor,
|
||||
const int onetype, const int onetype3,
|
||||
const int spq) {
|
||||
if (_compiled && _onetype==onetype && _onetype3==onetype3 && _spq==spq)
|
||||
return;
|
||||
|
||||
_onetype=onetype;
|
||||
_onetype3=onetype3;
|
||||
_spq=spq;
|
||||
|
||||
std::string vatom_name=std::string(three_end)+"_vatom";
|
||||
if (pair_program) delete pair_program;
|
||||
pair_program=new UCL_Program(dev);
|
||||
pair_program->load_string(pair_str,device->compile_string().c_str());
|
||||
std::string oclstring = device->compile_string()+" -DEVFLAG=1";
|
||||
if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+
|
||||
" -DONETYPE3="+device->toa(_onetype3);
|
||||
if (_spq) oclstring+=" -DSPQ="+device->toa(_spq);
|
||||
pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_three_center.set_function(*pair_program,three_center);
|
||||
k_three_end.set_function(*pair_program,three_end);
|
||||
k_three_end_vatom.set_function(*pair_program,vatom_name.c_str());
|
||||
@ -387,12 +433,50 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str,
|
||||
k_short_nbor.set_function(*pair_program,short_nbor);
|
||||
pos_tex.get_texture(*pair_program,"pos_tex");
|
||||
|
||||
#if defined(LAL_OCL_EV_JIT)
|
||||
oclstring = device->compile_string()+" -DEVFLAG=0";
|
||||
if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+
|
||||
" -DONETYPE3="+device->toa(_onetype3);
|
||||
if (_spq) oclstring+=" -DSPQ="+device->toa(_spq);
|
||||
if (pair_program_noev) delete pair_program_noev;
|
||||
pair_program_noev=new UCL_Program(dev);
|
||||
pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen);
|
||||
k_three_center_noev.set_function(*pair_program_noev,three_center);
|
||||
k_three_end_noev.set_function(*pair_program_noev,three_end);
|
||||
k_pair_noev.set_function(*pair_program_noev,two);
|
||||
#else
|
||||
k_sel = &k_pair;
|
||||
k_3center_sel = &k_three_center;
|
||||
k_3end_sel = &k_three_end;
|
||||
#endif

#ifdef THREE_CONCURRENT
k_three_end.cq(ucl_device->cq(_end_command_queue));
k_three_end_vatom.cq(ucl_device->cq(_end_command_queue));
#if defined(LAL_OCL_EV_JIT)
k_three_end_noev.cq(ucl_device->cq(_end_command_queue));
#endif
#endif

_compiled=true;

#if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0))
if (dev.cl_device_version() >= 210) {
size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size);
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_vatom.max_subgroup_size(_block_size));
#if defined(LAL_OCL_EV_JIT)
mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center_noev.max_subgroup_size(_block_size));
mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_noev.max_subgroup_size(_block_size));
#endif
if (_threads_per_atom > mx_subgroup_sz)
_threads_per_atom = mx_subgroup_sz;
device->set_simd_size(mx_subgroup_sz);
}
#endif
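// note: on OpenCL 2.1+ devices, _threads_per_atom is clamped to the smallest sub-group size supported by any of the kernels just compiled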

}

template class BaseThree<PRECISION,ACC_PRECISION>;

@ -59,10 +59,12 @@ class BaseThree {
const double gpu_split, FILE *screen,
const void *pair_program, const char *k_two,
const char *k_three_center, const char *k_three_end,
const char *k_short_nbor=nullptr);
const char *k_short_nbor=nullptr, const int onetype=-1,
const int onetype3=-1, const int spq=0,
const int tpa_override=0);

/// Estimate the overhead for GPU context changes and CPU driver
void estimate_gpu_overhead();
void estimate_gpu_overhead(const int add_kernels=0);

/// Check if there is enough storage for atom arrays and realloc if not
/** \param success set to false if insufficient memory **/
@ -109,7 +111,7 @@ class BaseThree {
/// Accumulate timers
inline void acc_timers() {
if (device->time_device()) {
nbor->acc_timers();
nbor->acc_timers(screen);
time_pair.add_to_total();
atom->acc_timers();
ans->acc_timers();
@ -134,9 +136,9 @@ class BaseThree {
int *numj, int **firstneigh, bool &success);

/// Build neighbor list on device
int build_nbor_list(const int inum, const int host_inum,
const int nall, double **host_x, int *host_type,
double *sublo, double *subhi, tagint *tag, int **nspecial,
void build_nbor_list(const int inum, const int host_inum, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, bool &success);

/// Pair loop with host neighboring
@ -147,12 +149,12 @@ class BaseThree {
int &host_start, const double cpu_time, bool &success);

/// Pair loop with device neighboring
int ** compute(const int ago, const int inum_full,
const int nall, double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **numj, const double cpu_time, bool &success);
int ** compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial, tagint **special,
const bool eflag, const bool vflag, const bool eatom,
const bool vatom, int &host_start, int **ilist,
int **numj, const double cpu_time, bool &success);

// -------------------------- DEVICE DATA -------------------------

@ -188,14 +190,29 @@ class BaseThree {
/// Neighbor data
Neighbor *nbor;

UCL_D_Vec<int> dev_short_nbor;
UCL_Kernel k_short_nbor;

// ------------------------- DEVICE KERNELS -------------------------
UCL_Program *pair_program;
UCL_Program *pair_program, *pair_program_noev;
UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;
inline int block_pair() { return _block_pair; }
inline int block_size() { return _block_size; }
inline void set_kernel(const int eflag, const int vflag) {
#if defined(LAL_OCL_EV_JIT)
if (eflag || vflag) {
k_sel = &k_pair;
k_3center_sel = &k_three_center;
k_3end_sel = &k_three_end;
} else {
k_sel = &k_pair_noev;
k_3center_sel = &k_three_center_noev;
k_3end_sel = &k_three_end_noev;
}
#endif
}
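// (set_kernel() is a no-op unless LAL_OCL_EV_JIT is defined)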


// --------------------------- TEXTURES -----------------------------
UCL_Texture pos_tex;
@ -203,18 +220,19 @@ class BaseThree {
protected:
bool _compiled;
int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
int _gpu_nbor;
int _gpu_nbor, _onetype, _onetype3, _spq;
double _max_bytes, _max_an_bytes;
int _max_nbors, _ainum, _nall;
int _ainum, _nall;
double _gpu_overhead, _driver_overhead;
UCL_D_Vec<int> *_nbor_data;

void compile_kernels(UCL_Device &dev, const void *pair_string,
const char *two, const char *three_center,
const char *three_end, const char* short_nbor);
const char *three_end, const char* short_nbor,
const int onetype, const int onetype3,
const int spq);

virtual void loop(const bool _eflag, const bool _vflag,
const int evatom) = 0;
virtual int loop(const int eflag, const int vflag, const int evatom,
bool &success) = 0;
};

}

@ -113,20 +113,9 @@ double BeckT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BeckT::loop(const bool _eflag, const bool _vflag) {
int BeckT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -134,8 +123,8 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &beck1, &beck2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->_threads_per_atom);
@ -147,6 +136,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}
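// note: loop() now returns the grid size GX; the caller forwards it to copy_answers() (see red_blocks in BaseThree::compute above)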

template class Beck<PRECISION,ACC_PRECISION>;

@ -39,22 +39,25 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}
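// note: EVFLAG is a compile-time macro (-DEVFLAG=1/0 at kernel JIT compile), so this guard and the eflag/vflag branches below cost nothing when energy/virial output is compiled out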

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -98,14 +101,14 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -116,9 +119,9 @@ __kernel void k_beck(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_beck_fast(const __global numtyp4 *restrict x_,
@ -137,6 +140,9 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
__local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -144,19 +150,19 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
beck2[tid]=beck2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -200,14 +206,14 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp term6 = pow(term1,(numtyp)-3);
numtyp term1inv = ucl_recip(term1);
numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4);
e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv);
energy+=factor_lj*e;
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -218,8 +224,8 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}


@ -72,7 +72,7 @@ class Beck : public BaseAtomic<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -55,7 +55,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
int init_ok=0;
if (world_me==0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta,
AA, BB, special_lj, inum, nall, 300,
AA, BB, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BLMF.device->world_barrier();
@ -73,7 +73,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa,
}
if (gpu_rank==i && world_me!=0)
init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB,
special_lj, inum, nall, 300, maxspecial,
special_lj, inum, nall, max_nbors, maxspecial,
cell_size, gpu_split, screen);

BLMF.device->gpu_barrier();

@ -138,20 +138,9 @@ double BornT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornT::loop(const bool _eflag, const bool _vflag) {
int BornT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -159,8 +148,8 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1,&coeff2,
&cutsq_sigma, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
@ -176,6 +165,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) {
&nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class Born<PRECISION,ACC_PRECISION>;

@ -40,22 +40,25 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -92,12 +95,12 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -108,9 +111,9 @@ __kernel void k_born(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_born_fast(const __global numtyp4 *restrict x_,
@ -130,27 +133,30 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}
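// note: coeff2 holds terms needed only for the energy, so it is staged into __local memory just when eflag is set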

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -187,12 +193,12 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+ coeff2[mtype].z*r2inv*r6inv;
energy+=factor_lj*(e-coeff2[mtype].w);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -203,8 +209,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}


@ -82,7 +82,7 @@ class Born : public BaseAtomic<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -129,20 +129,9 @@ double BornCoulLongT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
int BornCoulLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -150,8 +139,8 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor,
&this->_nbor_data->begin(),
&this->ans->force,
@ -170,6 +159,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) {
&_qqrd2e, &_g_ewald, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class BornCoulLong<PRECISION,ACC_PRECISION>;

@ -48,6 +48,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -57,18 +60,18 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -124,7 +127,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
@ -133,7 +136,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -144,9 +147,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
@ -169,28 +172,31 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -246,7 +252,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cutsq_sigma[mtype].y) {
@ -255,7 +261,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -266,8 +272,8 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -80,7 +80,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {

protected:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -63,6 +63,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -72,18 +75,18 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -155,7 +158,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -167,7 +170,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -178,9 +181,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
@ -203,28 +206,31 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -296,7 +302,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e = prefactor*_erfc;
if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -308,7 +314,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -319,8 +325,8 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -60,7 +60,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);

@ -80,7 +80,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);


@ -60,7 +60,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);

@ -80,7 +80,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma, offset,
special_lj, inum, nall, 300, maxspecial, cell_size,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald);


@ -131,20 +131,9 @@ double BornCoulWolfT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
int BornCoulWolfT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -152,8 +141,8 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
&ainum, &nbor_pitch, &this->atom->q,
@ -171,6 +160,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class BornCoulWolf<PRECISION,ACC_PRECISION>;

@ -51,6 +51,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -60,18 +63,18 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -79,7 +82,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
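// note: e_self is scaled by 1/t_per_atom above because all t_per_atom threads assigned to atom i add it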
@ -137,7 +140,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -149,7 +152,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -160,9 +163,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
@ -186,28 +189,31 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -216,7 +222,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
@ -273,7 +279,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
numtyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -285,7 +291,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -296,8 +302,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -81,7 +81,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {

protected:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -52,6 +52,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -61,18 +64,18 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -80,7 +83,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
@ -139,7 +142,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
acctyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -151,7 +154,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -162,9 +165,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
@ -188,28 +191,31 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -218,7 +224,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
int iw=ix.w;
int itype=fast_mul((int)MAX_SHARED_TYPES,iw);

if (eflag>0) {
if (EVFLAG && eflag) {
acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
e_coul += (acctyp)2.0*e_self;
@ -276,7 +282,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq) {
acctyp e=v_sh;
if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
@ -288,7 +294,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].w);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -299,8 +305,8 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -60,7 +60,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
@ -81,7 +81,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);

@ -60,7 +60,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);
@ -81,7 +81,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e,
alf, e_shift, f_shift);

@ -58,7 +58,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (world_me==0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BORNMF.device->world_barrier();
@ -77,7 +77,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
if (gpu_rank==i && world_me!=0)
init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
host_born3, host_a, host_c, host_d, sigma,
offset, special_lj, inum, nall, 300,
offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BORNMF.device->gpu_barrier();
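// note: the hard-coded 300-neighbor allocation is replaced by the host-supplied max_nbors throughout these *_gpu_init() routines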

@ -130,20 +130,9 @@ double BuckT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BuckT::loop(const bool _eflag, const bool _vflag) {
int BuckT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -151,8 +140,8 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch,
@ -165,6 +154,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class Buck<PRECISION,ACC_PRECISION>;

@ -39,22 +39,25 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -91,11 +94,11 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -106,9 +109,9 @@ __kernel void k_buck(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_buck_fast(const __global numtyp4 *restrict x_,
@ -127,27 +130,30 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -184,11 +190,11 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -199,8 +205,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}


@ -77,7 +77,7 @@ class Buck : public BaseAtomic<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -122,20 +122,9 @@ double BuckCoulT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
int BuckCoulT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -143,8 +132,8 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -158,6 +147,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) {
&cutsq, &_qqrd2e, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class BuckCoul<PRECISION,ACC_PRECISION>;

@ -47,6 +47,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -56,18 +59,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -119,14 +122,14 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
e_coul += forcecoul;
if (rsq < cutsq[mtype].y) {
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -137,9 +140,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
@ -162,29 +165,32 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
cutsq[tid]=_cutsq[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();
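// note: the barrier ensures the __local coefficient and special-scaling tiles are fully loaded before any work-item reads them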
|
||||
|
||||
if (ii<inum) {
|
||||
int nbor, nbor_end;
|
||||
int i, numj;
|
||||
__local int n_stride;
|
||||
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
|
||||
n_stride,nbor_end,nbor);
|
||||
|
||||
@ -236,14 +242,14 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
f.y+=dely*force;
|
||||
f.z+=delz*force;
|
||||
|
||||
if (eflag>0) {
|
||||
if (EVFLAG && eflag) {
|
||||
e_coul += forcecoul;
|
||||
if (rsq < cutsq[mtype].y) {
|
||||
numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv;
|
||||
energy+=factor_lj*(e-coeff2[mtype].z);
|
||||
}
|
||||
}
|
||||
if (vflag>0) {
|
||||
if (EVFLAG && vflag) {
|
||||
virial[0] += delx*delx*force;
|
||||
virial[1] += dely*dely*force;
|
||||
virial[2] += delz*delz*force;
|
||||
@ -254,8 +260,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_,
|
||||
}
|
||||
|
||||
} // for nbor
|
||||
} // if ii
|
||||
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
|
||||
vflag,ans,engv);
|
||||
} // if ii
|
||||
}
|
||||
|
||||
|
||||
@ -78,7 +78,7 @@ class BuckCoul : public BaseCharge<numtyp, acctyp> {
|
||||
|
||||
private:
|
||||
bool _allocated;
|
||||
void loop(const bool _eflag, const bool _vflag);
|
||||
int loop(const int eflag, const int vflag);
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@ -58,7 +58,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
int init_ok=0;
|
||||
if (world_me==0)
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e);
|
||||
@ -78,7 +78,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
|
||||
}
|
||||
if (gpu_rank==i && world_me!=0)
|
||||
init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
|
||||
host_a, host_c, offset, special_lj, inum, nall, 300,
|
||||
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
|
||||
maxspecial, cell_size, gpu_split, screen,
|
||||
host_cut_ljsq, host_cut_coulsq,
|
||||
host_special_coul, qqrd2e);
|
||||
|
||||
@ -126,20 +126,9 @@ double BuckCoulLongT::host_memory_usage() const {
|
||||
// Calculate energies, forces, and torques
|
||||
// ---------------------------------------------------------------------------
|
||||
template <class numtyp, class acctyp>
|
||||
void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
int BuckCoulLongT::loop(const int eflag, const int vflag) {
|
||||
// Compute the block size and grid size to keep all cores busy
|
||||
const int BX=this->block_size();
|
||||
int eflag, vflag;
|
||||
if (_eflag)
|
||||
eflag=1;
|
||||
else
|
||||
eflag=0;
|
||||
|
||||
if (_vflag)
|
||||
vflag=1;
|
||||
else
|
||||
vflag=0;
|
||||
|
||||
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
|
||||
(BX/this->_threads_per_atom)));
|
||||
|
||||
@ -147,8 +136,8 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
int nbor_pitch=this->nbor->nbor_pitch();
|
||||
this->time_pair.start();
|
||||
if (shared_types) {
|
||||
this->k_pair_fast.set_size(GX,BX);
|
||||
this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
this->k_pair_sel->set_size(GX,BX);
|
||||
this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj,
|
||||
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
|
||||
&this->ans->force, &this->ans->engv, &eflag,
|
||||
&vflag, &ainum, &nbor_pitch, &this->atom->q,
|
||||
@ -163,6 +152,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) {
|
||||
&_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
|
||||
}
|
||||
this->time_pair.stop();
|
||||
return GX;
|
||||
}
|
||||
|
||||
template class BuckCoulLong<PRECISION,ACC_PRECISION>;
|
||||
|
||||
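The loop() rewrite above is repeated for every pair style in this commit: the bool-to-int flag conversion disappears because the flags now arrive as ints from the caller, and loop() returns the computed grid size instead of void. The fast path also launches through the k_pair_sel pointer instead of naming k_pair_fast directly, so the init code can select a kernel variant once. A hedged sketch of the new shape (free function, illustrative names only):

#include <cmath>

// Flags are forwarded unchanged; the grid size GX is returned so the
// caller can reuse it (e.g. for sizing per-block reduction buffers).
int loop_sketch(const int eflag, const int vflag,
                int inum, int block_size, int threads_per_atom) {
  const int BX = block_size;
  int GX = static_cast<int>(ceil(static_cast<double>(inum) /
                                 (BX / threads_per_atom)));
  (void)eflag; (void)vflag;   // a real loop() passes these to the kernel
  return GX;
}
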
@ -48,6 +48,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
@ -57,18 +60,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
sp_lj[6]=sp_lj_in[6];
sp_lj[7]=sp_lj_in[7];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -126,7 +129,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
@ -134,7 +137,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -145,9 +148,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
@ -171,28 +174,31 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
__local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_charge();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
coeff1[tid]=coeff1_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
coeff2[tid]=coeff2_in[tid];
}

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -250,7 +256,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < coeff1[mtype].w) {
@ -258,7 +264,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*(e-coeff2[mtype].z);
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -269,8 +275,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -78,7 +78,7 @@ class BuckCoulLong : public BaseCharge<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -59,7 +59,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);

@ -78,7 +78,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);


@ -56,7 +56,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
int init_ok=0;
if (world_me==0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BUCKMF.device->world_barrier();
@ -74,7 +74,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
}
if (gpu_rank==i && world_me!=0)
init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2,
host_a, host_c, offset, special_lj, inum, nall, 300,
host_a, host_c, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen);

BUCKMF.device->gpu_barrier();

166 lib/gpu/lal_charmm.cpp Normal file
@ -0,0 +1,166 @@
/***************************************************************************
charmm.cpp
-------------------
W. Michael Brown (ORNL)

Class for acceleration of the charmm/coul pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : brownw@ornl.gov
***************************************************************************/

#if defined(USE_OPENCL)
#include "charmm_cl.h"
#elif defined(USE_CUDART)
const char *charmm_long=0;
#else
#include "charmm_cubin.h"
#endif

#include "lal_charmm.h"
#include <cassert>
namespace LAMMPS_AL {
#define CHARMMT CHARMM<numtyp, acctyp>

extern Device<PRECISION,ACC_PRECISION> device;

template <class numtyp, class acctyp>
CHARMMT::CHARMM() : BaseCharge<numtyp,acctyp>(),
_allocated(false) {
}

template <class numtyp, class acctyp>
CHARMMT::~CHARMM() {
clear();
}

template <class numtyp, class acctyp>
int CHARMMT::bytes_per_atom(const int max_nbors) const {
return this->bytes_per_atom_atomic(max_nbors);
}

template <class numtyp, class acctyp>
int CHARMMT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *host_special_lj, const int nlocal, const int nall,
const int max_nbors, const int maxspecial,
const double cell_size, const double gpu_split,
FILE *_screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double cut_lj_innersq,
const double cut_coul_innersq, const double denom_lj,
const double denom_coul, double **epsilon,
double **sigma, const bool mix_arithmetic) {
int success;
success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
gpu_split,_screen,charmm,"k_charmm");
if (success!=0)
return success;

// If atom type constants fit in shared memory use fast kernel
int lj_types=ntypes;
shared_types=false;
int max_bio_shared_types=this->device->max_bio_shared_types();
if (this->_block_bio_size>=64 && mix_arithmetic &&
lj_types<=max_bio_shared_types)
shared_types=true;
_lj_types=lj_types;

// Allocate a host write buffer for data initialization
int h_size=lj_types*lj_types;
if (h_size<max_bio_shared_types)
h_size=max_bio_shared_types;
UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
UCL_WRITE_ONLY);
for (int i=0; i<h_size*32; i++)
host_write[i]=0.0;

lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
host_lj3,host_lj4);

if (shared_types) {
ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
}

sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
for (int i=0; i<4; i++) {
host_write[i]=host_special_lj[i];
host_write[i+4]=host_special_coul[i];
}
ucl_copy(sp_lj,host_write,8,false);

_cut_bothsq = host_cut_bothsq;
_cut_coulsq = host_cut_coulsq;
_cut_ljsq = host_cut_ljsq;
_cut_lj_innersq = cut_lj_innersq;
_cut_coul_innersq = cut_coul_innersq;
_qqrd2e=qqrd2e;
_denom_lj=denom_lj;
_denom_coul=denom_coul;

_allocated=true;
this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
return 0;
}

template <class numtyp, class acctyp>
void CHARMMT::clear() {
if (!_allocated)
return;
_allocated=false;

lj1.clear();
ljd.clear();
sp_lj.clear();
this->clear_atomic();
}

template <class numtyp, class acctyp>
double CHARMMT::host_memory_usage() const {
return this->host_memory_usage_atomic()+sizeof(CHARMM<numtyp,acctyp>);
}

// ---------------------------------------------------------------------------
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
int CHARMMT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

int ainum=this->ans->inum();
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, this->_nbor_data,
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&_cut_coul_innersq, &this->_threads_per_atom);
} else {
this->k_pair.set_size(GX,BX);
this->k_pair.run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, this->_nbor_data,
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
&_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul,
&_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq,
&_cut_coul_innersq, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class CHARMM<PRECISION,ACC_PRECISION>;
}
303 lib/gpu/lal_charmm.cu Normal file
@ -0,0 +1,303 @@
// **************************************************************************
// charmm.cu
// -------------------
// W. Michael Brown (ORNL)
//
// Device code for acceleration of the charmm/coul pair style
//
// __________________________________________________________________________
// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
// __________________________________________________________________________
//
// begin :
// email : brownw@ornl.gov
// ***************************************************************************/

#ifdef NV_KERNEL

#include "lal_aux_fun1.h"
#ifndef _DOUBLE_DOUBLE
texture<float4> pos_tex;
texture<float> q_tex;
#else
texture<int4,1> pos_tex;
texture<int2> q_tex;
#endif

#else
#define pos_tex x_
#define q_tex q_
#endif

__kernel void k_charmm(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd,
const __global numtyp *restrict sp_lj,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp denom_lj,
const numtyp denom_coul,
const numtyp cut_bothsq,
const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const numtyp cut_coul_innersq,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);

int n_stride;
local_allocate_store_bio();

acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];

numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;

numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;

if (rsq<cut_bothsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, switch1;
numtyp lj3, lj4;

if (rsq < cut_ljsq) {
numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);

numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;

if (rsq < cut_coulsq) {
numtyp rinv = ucl_rsqrt(rsq);
fetch(forcecoul,j,q_tex);
forcecoul *= factor_coul * qqrd2e * qtmp * rinv;
if (rsq > cut_coul_innersq) {
numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
(cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) *
denom_coul;
forcecoul *= switch3;
}
} else
forcecoul = (numtyp)0.0;

force = (force_lj + forcecoul) * r2inv;

f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;

if (EVFLAG && eflag) {
e_coul += forcecoul;
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
}

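The switch1/switch2 arithmetic in k_charmm above (and in k_charmm_fast below) implements the CHARMM switching function. Reading denom_lj as \(1/(r_c^2-r_{in}^2)^3\), which is an inference from the code rather than something stated in this diff, the Lennard-Jones energy is tapered between the inner cutoff \(r_{in}\) and the outer cutoff \(r_c\) by

\[ S(r^2) = \frac{(r_c^2-r^2)^2\,\bigl(r_c^2+2r^2-3r_{in}^2\bigr)}{(r_c^2-r_{in}^2)^3}, \qquad r_{in}^2 < r^2 \le r_c^2, \]

so the energy becomes \(E\,S\); the switch2 term equals \(-2r^2\,dS/d(r^2)\) and, multiplied by \(E = lj3-lj4\), supplies the chain-rule contribution that keeps the force consistent with the switched energy. The Coulomb branch applies the same polynomial through denom_coul.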
__kernel void k_charmm_fast(const __global numtyp4 *restrict x_,
const __global numtyp2 *restrict ljd_in,
const __global numtyp *restrict sp_lj_in,
const __global int *dev_nbor,
const __global int *dev_packed,
__global acctyp4 *restrict ans,
__global acctyp *restrict engv,
const int eflag, const int vflag,
const int inum, const int nbor_pitch,
const __global numtyp *restrict q_,
const numtyp cut_coulsq, const numtyp qqrd2e,
const numtyp denom_lj,
const numtyp denom_coul,
const numtyp cut_bothsq,
const numtyp cut_ljsq,
const numtyp cut_lj_innersq,
const numtyp cut_coul_innersq,
const int t_per_atom) {
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);

__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_bio();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_BIO_SHARED_TYPES)
ljd[tid]=ljd_in[tid];
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
numtyp qtmp; fetch(qtmp,i,q_tex);
int itype=ix.w;

for ( ; nbor<nbor_end; nbor+=n_stride) {
int j=dev_packed[nbor];

numtyp factor_lj, factor_coul;
factor_lj = sp_lj[sbmask(j)];
factor_coul = sp_lj[sbmask(j)+4];
j &= NEIGHMASK;

numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
int jtype=jx.w;

// Compute r12
numtyp delx = ix.x-jx.x;
numtyp dely = ix.y-jx.y;
numtyp delz = ix.z-jx.z;
numtyp rsq = delx*delx+dely*dely+delz*delz;

if (rsq<cut_bothsq) {
numtyp r2inv=ucl_recip(rsq);
numtyp forcecoul, force_lj, force, switch1;
numtyp lj3, lj4;

if (rsq < cut_ljsq) {
numtyp eps = ucl_sqrt(ljd[itype].x*ljd[jtype].x);
numtyp sig6 = (numtyp)0.5 * (ljd[itype].y+ljd[jtype].y);

numtyp sig_r_6 = sig6*sig6*r2inv;
sig_r_6 = sig_r_6*sig_r_6*sig_r_6;
lj4 = (numtyp)4.0*eps*sig_r_6;
lj3 = lj4*sig_r_6;
force_lj = factor_lj*((numtyp)12.0 * lj3 - (numtyp)6.0 * lj4);
if (rsq > cut_lj_innersq) {
switch1 = (cut_ljsq-rsq);
numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)*
denom_lj;
switch1 *= switch1;
switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)*
denom_lj;
switch2 *= lj3-lj4;
force_lj = force_lj*switch1+switch2;
}
} else
force_lj = (numtyp)0.0;

if (rsq < cut_coulsq) {
numtyp rinv = ucl_rsqrt(rsq);
fetch(forcecoul,j,q_tex);
forcecoul *= factor_coul * qqrd2e * qtmp * rinv;
if (rsq > cut_coul_innersq) {
numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) *
(cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) *
denom_coul;
forcecoul *= switch3;
}
} else
forcecoul = (numtyp)0.0;

force = (force_lj + forcecoul) * r2inv;

f.x+=delx*force;
f.y+=dely*force;
f.z+=delz*force;

if (EVFLAG && eflag) {
e_coul += forcecoul;
if (rsq < cut_ljsq) {
numtyp e=lj3-lj4;
if (rsq > cut_lj_innersq)
e *= switch1;
energy+=factor_lj*e;
}
}
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
virial[3] += delx*dely*force;
virial[4] += delx*delz*force;
virial[5] += dely*delz*force;
}
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
}

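A note on both CHARMM kernels: rather than reading a precomputed ntypes-squared coefficient table, they mix the per-type parameters cached in ljd on the fly, geometric in epsilon and arithmetic in sigma (the Lorentz-Berthelot rule, matching the mix_arithmetic flag in init()):

\[ \epsilon_{ij}=\sqrt{\epsilon_i\,\epsilon_j}, \qquad \sigma_{ij}=\tfrac{1}{2}\,(\sigma_i+\sigma_j), \qquad E = 4\,\epsilon_{ij}\Bigl[\bigl(\sigma_{ij}/r\bigr)^{12}-\bigl(\sigma_{ij}/r\bigr)^{6}\Bigr], \]

which matches lj4 = 4*eps*(sig/r)^6 and lj3 = lj4*(sig/r)^6 in the code and keeps the shared-memory footprint linear in the number of types (MAX_BIO_SHARED_TYPES entries instead of ntypes*ntypes slots).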
89 lib/gpu/lal_charmm.h Normal file
@ -0,0 +1,89 @@
/***************************************************************************
charmm.h
-------------------
W. Michael Brown (ORNL)

Class for acceleration of the charmm/coul pair style.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : brownw@ornl.gov
***************************************************************************/

#ifndef LAL_CHARMM_
#define LAL_CHARMM_

#include "lal_base_charge.h"

namespace LAMMPS_AL {

template <class numtyp, class acctyp>
class CHARMM : public BaseCharge<numtyp, acctyp> {
public:
CHARMM();
~CHARMM();

/// Clear any previous data and set up for a new LAMMPS run
/** \param max_nbors initial number of rows in the neighbor matrix
* \param cell_size cutoff + skin
* \param gpu_split fraction of particles handled by device
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 if double precision is not supported on the card **/
int init(const int ntypes, double host_cut_bothsq,
double **host_lj1, double **host_lj2, double **host_lj3,
double **host_lj4, double *host_special_lj,
const int nlocal, const int nall, const int max_nbors,
const int maxspecial, const double cell_size,
const double gpu_split, FILE *screen, double host_cut_ljsq,
const double host_cut_coulsq, double *host_special_coul,
const double qqrd2e, const double cut_lj_innersq,
const double cut_coul_innersq, const double denom_lj,
const double denom_coul, double **epsilon, double **sigma,
const bool mix_arithmetic);

/// Clear all host and device data
/** \note This is called at the beginning of the init() routine **/
void clear();

/// Returns memory usage on device per atom
int bytes_per_atom(const int max_nbors) const;

/// Total host memory used by library for pair style
double host_memory_usage() const;

// --------------------------- TYPE DATA --------------------------

/// x = lj1, y = lj2, z = lj3, w = lj4
UCL_D_Vec<numtyp4> lj1;
/// x = epsilon, y = sigma
UCL_D_Vec<numtyp2> ljd;
/// Special LJ values [0-3] and Special Coul values [4-7]
UCL_D_Vec<numtyp> sp_lj;

/// If atom type constants fit in shared memory, use fast kernels
bool shared_types;

/// Number of atom types
int _lj_types;

numtyp _qqrd2e, _denom_lj, _denom_coul;

numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
numtyp _cut_coul_innersq;

private:
bool _allocated;
int loop(const int eflag, const int vflag);
};

}

#endif
137 lib/gpu/lal_charmm_ext.cpp Normal file
@ -0,0 +1,137 @@
/***************************************************************************
charmm_long_ext.cpp
-------------------
W. Michael Brown (ORNL)

Functions for LAMMPS access to charmm/coul/long acceleration routines.

__________________________________________________________________________
This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
__________________________________________________________________________

begin :
email : brownw@ornl.gov
***************************************************************************/

#include <iostream>
#include <cassert>
#include <cmath>

#include "lal_charmm.h"

using namespace std;
using namespace LAMMPS_AL;

static CHARMM<PRECISION,ACC_PRECISION> CRMMF;

// ---------------------------------------------------------------------------
// Allocate memory on host and device and copy constants to device
// ---------------------------------------------------------------------------
int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
double **host_lj2, double **host_lj3, double **host_lj4,
double *special_lj, const int inum,
const int nall, const int max_nbors, const int maxspecial,
const double cell_size, int &gpu_mode, FILE *screen,
double host_cut_ljsq, double host_cut_coulsq,
double *host_special_coul, const double qqrd2e,
const double cut_lj_innersq, const double cut_coul_innersq,
const double denom_lj, const double denom_coul,
double **epsilon, double **sigma,
const bool mix_arithmetic) {
CRMMF.clear();
gpu_mode=CRMMF.device->gpu_mode();
double gpu_split=CRMMF.device->particle_split();
int first_gpu=CRMMF.device->first_device();
int last_gpu=CRMMF.device->last_device();
int world_me=CRMMF.device->world_me();
int gpu_rank=CRMMF.device->gpu_rank();
int procs_per_gpu=CRMMF.device->procs_per_gpu();

CRMMF.device->init_message(screen,"lj/charmm/coul/charmm",first_gpu,
last_gpu);

bool message=false;
if (CRMMF.device->replica_me()==0 && screen)
message=true;

if (message) {
fprintf(screen,"Initializing Device and compiling on process 0...");
fflush(screen);
}

int init_ok=0;
if (world_me==0)
CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, cut_lj_innersq, cut_coul_innersq,
denom_lj, denom_coul, epsilon, sigma, mix_arithmetic);

CRMMF.device->world_barrier();
if (message)
fprintf(screen,"Done.\n");

for (int i=0; i<procs_per_gpu; i++) {
if (message) {
if (last_gpu-first_gpu==0)
fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
else
fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
last_gpu,i);
fflush(screen);
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
denom_coul, epsilon, sigma, mix_arithmetic);

CRMMF.device->gpu_barrier();
if (message)
fprintf(screen,"Done.\n");
}
if (message)
fprintf(screen,"\n");

if (init_ok==0)
CRMMF.estimate_gpu_overhead();

return init_ok;
}

void crm_gpu_clear() {
CRMMF.clear();
}

int** crm_gpu_compute_n(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, double *sublo,
double *subhi, tagint *tag, int **nspecial,
tagint **special, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
int **ilist, int **jnum, const double cpu_time,
bool &success, double *host_q, double *boxlo,
double *prd) {
return CRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
subhi, tag, nspecial, special, eflag, vflag, eatom,
vatom, host_start, ilist, jnum, cpu_time, success,
host_q, boxlo, prd);
}

void crm_gpu_compute(const int ago, const int inum_full, const int nall,
double **host_x, int *host_type, int *ilist, int *numj,
int **firstneigh, const bool eflag, const bool vflag,
const bool eatom, const bool vatom, int &host_start,
const double cpu_time, bool &success, double *host_q,
const int nlocal, double *boxlo, double *prd) {
CRMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,
eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q,
nlocal,boxlo,prd);
}

double crm_gpu_bytes() {
return CRMMF.host_memory_usage();
}

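crm_gpu_init() above follows the staged initialization protocol shared by all the *_gpu_init() wrappers in this diff: process 0 initializes (and compiles the kernels) first, all ranks wait at a world barrier, then the processes sharing each GPU initialize one at a time. A condensed sketch of that control flow (stubbed helpers standing in for the real device/MPI machinery, not the library API):

static int do_init() { return 0; }   // compile kernels, copy constants
static void world_barrier() {}       // barrier over all MPI ranks
static void gpu_barrier() {}         // barrier over ranks sharing one GPU

int staged_gpu_init(int world_me, int gpu_rank, int procs_per_gpu) {
  int init_ok = 0;
  if (world_me == 0)
    init_ok = do_init();             // rank 0 warms the kernel cache once
  world_barrier();                   // everyone waits for the compile
  for (int i = 0; i < procs_per_gpu; i++) {
    if (gpu_rank == i && world_me != 0)
      init_ok = do_init();           // one co-resident rank per turn
    gpu_barrier();
  }
  return init_ok;                    // 0 on success, negative on error
}
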
@ -131,20 +131,9 @@ double CHARMMLongT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
int CHARMMLongT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->_block_bio_size;
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -152,8 +141,8 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag,
&vflag, &ainum, &nbor_pitch, &this->atom->q,
@ -171,6 +160,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) {
&this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class CHARMMLong<PRECISION,ACC_PRECISION>;

@ -47,18 +47,21 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
int tid, ii, offset;
atom_info(t_per_atom,ii,tid,offset);

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
int n_stride;
local_allocate_store_bio();

acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -122,7 +125,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
@ -132,7 +135,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
energy+=factor_lj*e;
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -143,9 +146,9 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}

__kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
@ -168,6 +171,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,

__local numtyp2 ljd[MAX_BIO_SHARED_TYPES];
__local numtyp sp_lj[8];
int n_stride;
local_allocate_store_bio();

if (tid<8)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_BIO_SHARED_TYPES)
@ -175,20 +181,20 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
if (tid+BLOCK_BIO_PAIR<MAX_BIO_SHARED_TYPES)
ljd[tid+BLOCK_BIO_PAIR]=ljd_in[tid+BLOCK_BIO_PAIR];

acctyp energy=(acctyp)0;
acctyp e_coul=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, e_coul, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
e_coul=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -258,7 +264,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
if (rsq < cut_coulsq)
e_coul += prefactor*(_erfc-factor_coul);
if (rsq < cut_ljsq) {
@ -268,7 +274,7 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
energy+=factor_lj*e;
}
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -277,10 +283,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_,
virial[5] += dely*delz*force;
}
}

} // for nbor
} // if ii
store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
vflag,ans,engv);
} // if ii
}


@ -79,7 +79,7 @@ class CHARMMLong : public BaseCharge<numtyp, acctyp> {

private:
bool _allocated;
void loop(const bool _eflag, const bool _vflag);
int loop(const int eflag, const int vflag);
};

}

@ -60,7 +60,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
int init_ok=0;
if (world_me==0)
CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4,
offset, special_lj, inum, nall, 300, maxspecial, cell_size,
offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size,
gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj,
epsilon,sigma,mix_arithmetic);
@ -80,7 +80,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
}
if (gpu_rank==i && world_me!=0)
init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
host_lj4, offset, special_lj, inum, nall, 300,
host_lj4, offset, special_lj, inum, nall, max_nbors,
maxspecial, cell_size, gpu_split, screen,
host_cut_ljsq, host_cut_coulsq, host_special_coul,
qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,

@ -140,20 +140,9 @@ double ColloidT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
void ColloidT::loop(const bool _eflag, const bool _vflag) {
int ColloidT::loop(const int eflag, const int vflag) {
// Compute the block size and grid size to keep all cores busy
const int BX=this->block_size();
int eflag, vflag;
if (_eflag)
eflag=1;
else
eflag=0;

if (_vflag)
vflag=1;
else
vflag=0;

int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
(BX/this->_threads_per_atom)));

@ -161,8 +150,8 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
int nbor_pitch=this->nbor->nbor_pitch();
this->time_pair.start();
if (shared_types) {
this->k_pair_fast.set_size(GX,BX);
this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
this->k_pair_sel->set_size(GX,BX);
this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
&colloid1, &colloid2, &form,
&this->nbor->dev_nbor, &this->_nbor_data->begin(),
&this->ans->force, &this->ans->engv, &eflag, &vflag,
@ -176,6 +165,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) {
&ainum, &nbor_pitch, &this->_threads_per_atom);
}
this->time_pair.stop();
return GX;
}

template class Colloid<PRECISION,ACC_PRECISION>;

@ -42,22 +42,25 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
atom_info(t_per_atom,ii,tid,offset);

__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

sp_lj[0]=sp_lj_in[0];
sp_lj[1]=sp_lj_in[1];
sp_lj[2]=sp_lj_in[2];
sp_lj[3]=sp_lj_in[3];

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -146,7 +149,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
@ -160,7 +163,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -171,9 +174,9 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}

__kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
@ -198,6 +201,9 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
__local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
__local numtyp sp_lj[4];
int n_stride;
local_allocate_store_pair();

if (tid<4)
sp_lj[tid]=sp_lj_in[tid];
if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
@ -205,23 +211,23 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
colloid1[tid]=colloid1_in[tid];
colloid2[tid]=colloid2_in[tid];
form[tid]=form_in[tid];
if (eflag>0)
if (EVFLAG && eflag)
lj3[tid]=lj3_in[tid];
}

acctyp energy=(acctyp)0;
acctyp4 f;
f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
acctyp virial[6];
for (int i=0; i<6; i++)
virial[i]=(acctyp)0;
acctyp energy, virial[6];
if (EVFLAG) {
energy=(acctyp)0;
for (int i=0; i<6; i++) virial[i]=(acctyp)0;
}

__syncthreads();

if (ii<inum) {
int nbor, nbor_end;
int i, numj;
__local int n_stride;
nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
n_stride,nbor_end,nbor);

@ -310,7 +316,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
f.y+=dely*force;
f.z+=delz*force;

if (eflag>0) {
if (EVFLAG && eflag) {
numtyp e=(numtyp)0.0;
if (form[mtype]==0) {
e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
@ -325,7 +331,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
}
energy+=factor_lj*(e-lj3[mtype].z);
}
if (vflag>0) {
if (EVFLAG && vflag) {
virial[0] += delx*delx*force;
virial[1] += dely*dely*force;
virial[2] += delz*delz*force;
@ -336,8 +342,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_,
}

} // for nbor
} // if ii
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
ans,engv);
} // if ii
}
