diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 2d259791f2..f67699c54d 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -156,8 +156,7 @@ if(BUILD_MPI) endif() endif() else() - enable_language(C) - file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c) + file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.cpp) add_library(mpi_stubs STATIC ${MPI_SOURCES}) set_target_properties(mpi_stubs PROPERTIES OUTPUT_NAME lammps_mpi_stubs${LAMMPS_MACHINE}) target_include_directories(mpi_stubs PUBLIC $) @@ -778,9 +777,7 @@ if(PKG_GPU) message(STATUS "<<< GPU package settings >>> -- GPU API: ${GPU_API}") if(GPU_API STREQUAL "CUDA") - message(STATUS "GPU architecture: ${GPU_ARCH}") - elseif(GPU_API STREQUAL "OPENCL") - message(STATUS "OpenCL tuning: ${OCL_TUNE}") + message(STATUS "GPU default architecture: ${GPU_ARCH}") elseif(GPU_API STREQUAL "HIP") message(STATUS "HIP platform: ${HIP_PLATFORM}") message(STATUS "HIP architecture: ${HIP_ARCH}") diff --git a/cmake/Modules/Documentation.cmake b/cmake/Modules/Documentation.cmake index 189c32e301..5a42244b9e 100644 --- a/cmake/Modules/Documentation.cmake +++ b/cmake/Modules/Documentation.cmake @@ -50,9 +50,9 @@ if(BUILD_DOC) OUTPUT ${DOC_BUILD_DIR}/requirements.txt DEPENDS docenv ${DOCENV_REQUIREMENTS_FILE} COMMAND ${CMAKE_COMMAND} -E copy ${DOCENV_REQUIREMENTS_FILE} ${DOC_BUILD_DIR}/requirements.txt - COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade pip - COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade ${LAMMPS_DOC_DIR}/utils/converters - COMMAND ${DOCENV_BINARY_DIR}/pip install --use-feature=2020-resolver -r ${DOC_BUILD_DIR}/requirements.txt --upgrade + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install --upgrade pip + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install --upgrade ${LAMMPS_DOC_DIR}/utils/converters + COMMAND ${DOCENV_BINARY_DIR}/pip $ENV{PIP_OPTIONS} install -r ${DOC_BUILD_DIR}/requirements.txt --upgrade ) # download mathjax distribution and unpack to folder "mathjax" diff --git a/cmake/Modules/GTest.cmake b/cmake/Modules/GTest.cmake index 060a7e42f9..0c62291d5e 100644 --- a/cmake/Modules/GTest.cmake +++ b/cmake/Modules/GTest.cmake @@ -20,10 +20,10 @@ ExternalProject_Add(googletest -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main${GTEST_LIB_POSTFIX}.a - /lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main${GTEST_LIB_POSTFIX}.a + BUILD_BYPRODUCTS /lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + /lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} LOG_DOWNLOAD ON LOG_CONFIGURE ON LOG_BUILD ON @@ -39,10 +39,10 @@ file(MAKE_DIRECTORY ${GTEST_INCLUDE_DIR}) file(MAKE_DIRECTORY ${GMOCK_INCLUDE_DIR}) ExternalProject_Get_Property(googletest BINARY_DIR) -set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest${GTEST_LIB_POSTFIX}.a) -set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock${GTEST_LIB_POSTFIX}.a) -set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gtest_main${GTEST_LIB_POSTFIX}.a) -set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}gmock_main${GTEST_LIB_POSTFIX}.a) 
+set(GTEST_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GMOCK_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GTEST_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgtest_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) +set(GMOCK_MAIN_LIBRARY_PATH ${BINARY_DIR}/lib/libgmock_main${GTEST_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}) # Prevent GoogleTest from overriding our compiler/linker options # when building with Visual Studio diff --git a/cmake/Modules/OpenCLLoader.cmake b/cmake/Modules/OpenCLLoader.cmake new file mode 100644 index 0000000000..ecd9204d24 --- /dev/null +++ b/cmake/Modules/OpenCLLoader.cmake @@ -0,0 +1,54 @@ +message(STATUS "Downloading and building OpenCL loader library") + +if(CMAKE_BUILD_TYPE STREQUAL Debug) + set(OPENCL_LOADER_LIB_POSTFIX d) +else() + set(OPENCL_LOADER_LIB_POSTFIX) +endif() + +include(ExternalProject) +set(OPENCL_LOADER_URL "https://download.lammps.org/thirdparty/opencl-loader-2020.12.18.tar.gz" CACHE STRING "URL for OpenCL loader tarball") +mark_as_advanced(OPENCL_LOADER_URL) +ExternalProject_Add(opencl_loader + URL ${OPENCL_LOADER_URL} + URL_MD5 011cdcbd41030be94f3fced6d763a52a + SOURCE_DIR "${CMAKE_BINARY_DIR}/opencl_loader-src" + BINARY_DIR "${CMAKE_BINARY_DIR}/opencl_loader-build" + CMAKE_ARGS ${CMAKE_REQUEST_PIC} ${CMAKE_EXTRA_OPENCL_LOADER_OPTS} + -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_INSTALL_PREFIX= + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} + -DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM} + -DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE} + BUILD_BYPRODUCTS /libOpenCL${OPENCL_LOADER_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX} + LOG_DOWNLOAD ON + LOG_CONFIGURE ON + LOG_BUILD ON + INSTALL_COMMAND "" + TEST_COMMAND "") + +ExternalProject_Get_Property(opencl_loader SOURCE_DIR) +set(OPENCL_LOADER_INCLUDE_DIR ${SOURCE_DIR}/inc) + +# workaround for CMake 3.10 on ubuntu 18.04 +file(MAKE_DIRECTORY ${OPENCL_LOADER_INCLUDE_DIR}) + +ExternalProject_Get_Property(opencl_loader BINARY_DIR) +set(OPENCL_LOADER_LIBRARY_PATH "${BINARY_DIR}/libOpenCL${OPENCL_LOADER_LIB_POSTFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") + +find_package(Threads QUIET) +if(NOT WIN32) + set(OPENCL_LOADER_DEP_LIBS "Threads::Threads;${CMAKE_DL_LIBS}") +else() + set(OPENCL_LOADER_DEP_LIBS "cfgmgr32;runtimeobject") +endif() + +add_library(OpenCL::OpenCL UNKNOWN IMPORTED) +add_dependencies(OpenCL::OpenCL opencl_loader) + +set_target_properties(OpenCL::OpenCL PROPERTIES + IMPORTED_LOCATION ${OPENCL_LOADER_LIBRARY_PATH} + INTERFACE_INCLUDE_DIRECTORIES ${OPENCL_LOADER_INCLUDE_DIR} + INTERFACE_LINK_LIBRARIES "${OPENCL_LOADER_DEP_LIBS}") + + diff --git a/cmake/Modules/Packages/GPU.cmake b/cmake/Modules/Packages/GPU.cmake index 4c52eee68b..e2586881ef 100644 --- a/cmake/Modules/Packages/GPU.cmake +++ b/cmake/Modules/Packages/GPU.cmake @@ -1,7 +1,9 @@ set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU) set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h ${GPU_SOURCES_DIR}/fix_gpu.h - ${GPU_SOURCES_DIR}/fix_gpu.cpp) + ${GPU_SOURCES_DIR}/fix_gpu.cpp + ${GPU_SOURCES_DIR}/fix_nh_gpu.h + ${GPU_SOURCES_DIR}/fix_nh_gpu.cpp) target_compile_definitions(lammps PRIVATE -DLMP_GPU) set(GPU_API "opencl" CACHE STRING "API used by GPU package") @@ -139,27 +141,13 @@ if(GPU_API STREQUAL "CUDA") target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS}) elseif(GPU_API STREQUAL "OPENCL") - if(${CMAKE_SYSTEM_NAME} STREQUAL "Windows") - # download and unpack support binaries for compilation of windows binaries. 
- set(LAMMPS_THIRDPARTY_URL "https://download.lammps.org/thirdparty") - file(DOWNLOAD "${LAMMPS_THIRDPARTY_URL}/opencl-win-devel.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/opencl-win-devel.tar.gz" - EXPECTED_MD5 2c00364888d5671195598b44c2e0d44d) - execute_process(COMMAND ${CMAKE_COMMAND} -E tar xzf opencl-win-devel.tar.gz WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - add_library(OpenCL::OpenCL UNKNOWN IMPORTED) - if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86") - set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win32/libOpenCL.dll") - elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64") - set_target_properties(OpenCL::OpenCL PROPERTIES IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/lib_win64/libOpenCL.dll") - endif() - set_target_properties(OpenCL::OpenCL PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${CMAKE_CURRENT_BINARY_DIR}/OpenCL/include") + option(USE_STATIC_OPENCL_LOADER "Download and include a static OpenCL ICD loader" ON) + mark_as_advanced(USE_STATIC_OPENCL_LOADER) + if (USE_STATIC_OPENCL_LOADER) + include(OpenCLLoader) else() find_package(OpenCL REQUIRED) endif() - set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning") - set(OCL_TUNE_VALUES intel fermi kepler cypress generic) - set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES}) - validate_option(OCL_TUNE OCL_TUNE_VALUES) - string(TOUPPER ${OCL_TUNE} OCL_TUNE) include(OpenCLUtils) set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h) @@ -203,7 +191,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_library(gpu STATIC ${GPU_LIB_SOURCES}) target_link_libraries(gpu PRIVATE OpenCL::OpenCL) target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu) - target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT) + target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT) target_compile_definitions(gpu PRIVATE -DUSE_OPENCL) target_link_libraries(lammps PRIVATE gpu) @@ -211,6 +199,7 @@ elseif(GPU_API STREQUAL "OPENCL") add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp) target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL) target_link_libraries(ocl_get_devices PRIVATE OpenCL::OpenCL) + add_dependencies(ocl_get_devices OpenCL::OpenCL) elseif(GPU_API STREQUAL "HIP") if(NOT DEFINED HIP_PATH) if(NOT DEFINED ENV{HIP_PATH}) @@ -393,13 +382,10 @@ elseif(GPU_API STREQUAL "HIP") target_link_libraries(lammps PRIVATE gpu) endif() -# GPU package -FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX) - set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}") - -# detects styles which have GPU version +# detect styles which have a GPU version RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES) +RegisterFixStyle(${GPU_SOURCES_DIR}/fix_gpu.h) get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES) diff --git a/cmake/Modules/Packages/KIM.cmake b/cmake/Modules/Packages/KIM.cmake index 83a96d02b8..5482d3071c 100644 --- a/cmake/Modules/Packages/KIM.cmake +++ b/cmake/Modules/Packages/KIM.cmake @@ -69,14 +69,14 @@ if(DOWNLOAD_KIM) BUILD_RPATH "${_rpath_prefix}/kim_build-prefix/lib" ) else() - if(KIM-API_FOUND AND KIM_API_VERSION VERSION_GREATER_EQUAL 2.2.0) + if(KIM-API_FOUND AND KIM-API_VERSION VERSION_GREATER_EQUAL 2.2.0) # For kim-api >= 2.2.0 - find_package(KIM-API ${KIM-API_MIN_VERSION} CONFIG REQUIRED) + find_package(KIM-API 2.2.0 CONFIG REQUIRED) 
target_link_libraries(lammps PRIVATE KIM-API::kim-api) else() # For kim-api 2.1.3 (consistent with previous version of this file) find_package(PkgConfig REQUIRED) - pkg_check_modules(KIM-API REQUIRED IMPORTED_TARGET libkim-api>=KIM-API_MIN_VERSION) + pkg_check_modules(KIM-API REQUIRED IMPORTED_TARGET libkim-api>=${KIM-API_MIN_VERSION}) target_link_libraries(lammps PRIVATE PkgConfig::KIM-API) endif() endif() diff --git a/cmake/Modules/Packages/MESSAGE.cmake b/cmake/Modules/Packages/MESSAGE.cmake index fb62763828..6ff4e322aa 100644 --- a/cmake/Modules/Packages/MESSAGE.cmake +++ b/cmake/Modules/Packages/MESSAGE.cmake @@ -2,9 +2,8 @@ if(LAMMPS_SIZES STREQUAL BIGBIG) message(FATAL_ERROR "The MESSAGE Package is not compatible with -DLAMMPS_BIGBIG") endif() option(MESSAGE_ZMQ "Use ZeroMQ in MESSAGE package" OFF) -file(GLOB_RECURSE cslib_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.F - ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.c - ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp) +file(GLOB_RECURSE cslib_SOURCES + ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp) add_library(cslib STATIC ${cslib_SOURCES}) target_compile_definitions(cslib PRIVATE -DLAMMPS_${LAMMPS_SIZES}) diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake index a080b566be..f2ba34e1b6 100644 --- a/cmake/Modules/YAML.cmake +++ b/cmake/Modules/YAML.cmake @@ -12,7 +12,7 @@ ExternalProject_Add(libyaml CXX=${CMAKE_CXX_COMPILER} CC=${CMAKE_C_COMPILER} --prefix= --disable-shared - BUILD_BYPRODUCTS /lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a + BUILD_BYPRODUCTS /lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX} TEST_COMMAND "") ExternalProject_Get_Property(libyaml INSTALL_DIR) @@ -23,7 +23,7 @@ set(YAML_LIBRARY_DIR ${INSTALL_DIR}/lib) file(MAKE_DIRECTORY ${YAML_INCLUDE_DIR}) file(MAKE_DIRECTORY ${YAML_LIBRARY_DIR}) -set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a) +set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/libyaml${CMAKE_STATIC_LIBRARY_SUFFIX}) add_library(Yaml::Yaml UNKNOWN IMPORTED) set_target_properties(Yaml::Yaml PROPERTIES diff --git a/doc/Makefile b/doc/Makefile index 6032aff45f..7deaaf2a2e 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -47,6 +47,8 @@ HAS_PDFLATEX = YES endif endif +# override settings for PIP commands +# PIP_OPTIONS = --cert /etc/pki/ca-trust/extracted/openssl/ca-bundle.trust.crt --proxy http://proxy.mydomain.org #SPHINXEXTRA = -j $(shell $(PYTHON) -c 'import multiprocessing;print(multiprocessing.cpu_count())') $(shell test -f $(BUILDDIR)/doxygen/xml/run.stamp && printf -- "-E") @@ -228,13 +230,13 @@ $(VENV): @( \ $(VIRTUALENV) -p $(PYTHON) $(VENV); \ . $(VENV)/bin/activate; \ - pip install --upgrade pip; \ - pip install -r $(BUILDDIR)/utils/requirements.txt; \ + pip $(PIP_OPTIONS) install --upgrade pip; \ + pip $(PIP_OPTIONS) install -r $(BUILDDIR)/utils/requirements.txt; \ deactivate;\ ) $(MATHJAX): - @git clone --depth 1 https://github.com/mathjax/MathJax.git $@ + @git clone --depth 1 git://github.com/mathjax/MathJax.git $@ $(TXT2RST) $(ANCHORCHECK): $(VENV) @( \ diff --git a/doc/src/Build_basics.rst b/doc/src/Build_basics.rst index cb6bd9f6aa..c7baa21e62 100644 --- a/doc/src/Build_basics.rst +++ b/doc/src/Build_basics.rst @@ -95,7 +95,7 @@ standard. A more detailed discussion of that is below. .. note:: - The file ``src/STUBS/mpi.c`` provides a CPU timer function + The file ``src/STUBS/mpi.cpp`` provides a CPU timer function called ``MPI_Wtime()`` that calls ``gettimeofday()``. 
If your operating system does not support ``gettimeofday()``, you will need to insert code to call another timer. Note that the
diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst
index 60d5ad09af..9180933007 100644
--- a/doc/src/Build_extras.rst
+++ b/doc/src/Build_extras.rst
@@ -120,8 +120,6 @@ CMake build
 -D GPU_API=value # value = opencl (default) or cuda or hip
 -D GPU_PREC=value # precision setting # value = double or mixed (default) or single
- -D OCL_TUNE=value # hardware choice for GPU_API=opencl
- # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
 -D GPU_ARCH=value # primary GPU hardware choice for GPU_API=cuda # value = sm_XX, see below # default is sm_50
@@ -135,6 +133,8 @@ CMake build
 # value = yes (default) or no
 -D CUDA_MPS_SUPPORT=value # enables some tweaks required to run with active nvidia-cuda-mps daemon # value = yes or no (default)
+ -D USE_STATIC_OPENCL_LOADER=value # downloads/includes OpenCL ICD loader library, no local OpenCL headers/libs needed
+ # value = yes (default) or no
 :code:`GPU_ARCH` settings for different GPU hardware are as follows:
@@ -161,6 +161,12 @@ When building with CMake, you **must NOT** build the GPU library in ``lib/gpu``
 using the traditional build procedure. CMake will detect files generated by that process and will terminate with an error and a suggestion for how to remove them.
+If you are compiling for OpenCL, the default setting is to download, build, and
+link with a static OpenCL ICD loader library and standard OpenCL headers. This
+way, no local OpenCL development headers or libraries need to be present; only
+OpenCL compatible drivers need to be installed to use OpenCL. If this is not
+desired, you can set :code:`USE_STATIC_OPENCL_LOADER` to :code:`no`.
+
 If you are compiling with HIP, note that before running CMake you will have to set appropriate environment variables. Some variables such as :code:`HCC_AMDGPU_TARGET` or :code:`CUDA_PATH` are necessary for :code:`hipcc`
diff --git a/doc/src/Build_link.rst b/doc/src/Build_link.rst
index 3d66371304..5255620231 100644
--- a/doc/src/Build_link.rst
+++ b/doc/src/Build_link.rst
@@ -20,16 +20,8 @@ the suffix ``.so.0`` (or some other number).
 .. note::
 Care should be taken to use the same MPI library for the calling code
- and the LAMMPS library. The ``library.h`` file includes ``mpi.h``
- and uses definitions from it so those need to be available and
- consistent. When LAMMPS is compiled with the included STUBS MPI
- library, then its ``mpi.h`` file needs to be included. While it is
- technically possible to use a full MPI library in the calling code
- and link to a serial LAMMPS library compiled with MPI STUBS, it is
- recommended to use the *same* MPI library for both, and then use
- ``MPI_Comm_split()`` in the calling code to pass a suitable
- communicator with a subset of MPI ranks to the function creating the
- LAMMPS instance.
+ and the LAMMPS library unless LAMMPS is to be compiled without (real)
+ MPI support using the included STUBS MPI library.
 Link with LAMMPS as a static library
 ------------------------------------
@@ -110,7 +102,7 @@ executable, that are also required to link the LAMMPS executable.
 .. code-block:: bash
- gcc -c -O -I${HOME}/lammps/src/STUBS -I${HOME}/lammps/src caller.c
+ gcc -c -O -I${HOME}/lammps/src caller.c
 g++ -o caller caller.o -L${HOME}/lammps/lib/poems \
 -L${HOME}/lammps/src/STUBS -L${HOME}/lammps/src \
 -llammps_serial -lpoems -lmpi_stubs
@@ -174,7 +166,7 @@ the POEMS package installed becomes:
.. code-block:: bash
- gcc -c -O -I${HOME}/lammps/src/STUBS -I${HOME}/lammps/src caller.c
+ gcc -c -O -I${HOME}/lammps/src caller.c
 g++ -o caller caller.o -L${HOME}/lammps/src -llammps_serial
 Locating liblammps.so at runtime
diff --git a/doc/src/Build_manual.rst b/doc/src/Build_manual.rst
index 59e4e3235b..3bf0337b31 100644
--- a/doc/src/Build_manual.rst
+++ b/doc/src/Build_manual.rst
@@ -74,7 +74,11 @@ For the documentation build a python virtual environment is set up in the
 folder ``doc/docenv`` and various python packages are installed into that virtual environment via the ``pip`` tool. For rendering embedded LaTeX code also the `MathJax `_ JavaScript
-engine needs to be downloaded.
+engine needs to be downloaded. If you need to pass additional options
+to the pip commands (e.g. to use a web proxy or to point to
+additional SSL certificates), you can set them via the ``PIP_OPTIONS``
+environment variable or uncomment and edit the ``PIP_OPTIONS`` setting
+at the beginning of the makefile.
 The actual translation is then done via ``make`` commands in the doc folder. The following ``make`` commands are available:
@@ -108,7 +112,10 @@ installation of the HTML manual pages into the "install" step when
 installing LAMMPS after the CMake build via ``cmake --build . --target install``. The documentation build is included in the default build target, but can also be requested independently with
-``cmake --build . --target doc``.
+``cmake --build . --target doc``. If you need to pass additional options
+to the pip commands (e.g. to use a web proxy or to point to
+additional SSL certificates), you can set them via the ``PIP_OPTIONS``
+environment variable.
 .. code-block:: bash
diff --git a/doc/src/Commands_fix.rst b/doc/src/Commands_fix.rst
index 26dcc1101c..4793568288 100644
--- a/doc/src/Commands_fix.rst
+++ b/doc/src/Commands_fix.rst
@@ -114,7 +114,7 @@ OPT.
 * :doc:`nph/eff `
 * :doc:`nph/sphere (o) `
 * :doc:`nphug `
- * :doc:`npt (iko) `
+ * :doc:`npt (giko) `
 * :doc:`npt/asphere (o) `
 * :doc:`npt/body `
 * :doc:`npt/cauchy `
@@ -122,8 +122,8 @@ OPT.
 * :doc:`npt/sphere (o) `
 * :doc:`npt/uef `
 * :doc:`numdiff `
- * :doc:`nve (iko) `
- * :doc:`nve/asphere (i) `
+ * :doc:`nve (giko) `
+ * :doc:`nve/asphere (gi) `
 * :doc:`nve/asphere/noforce `
 * :doc:`nve/awpmd `
 * :doc:`nve/body `
@@ -138,7 +138,7 @@ OPT.
 * :doc:`nve/spin `
 * :doc:`nve/tri `
 * :doc:`nvk `
- * :doc:`nvt (iko) `
+ * :doc:`nvt (giko) `
 * :doc:`nvt/asphere (o) `
 * :doc:`nvt/body `
 * :doc:`nvt/eff `
diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index f5b1ef9b38..e7277e2bbb 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -122,7 +122,7 @@ OPT.
 * :doc:`lebedeva/z `
 * :doc:`lennard/mdf `
 * :doc:`line/lj `
- * :doc:`lj/charmm/coul/charmm (iko) `
+ * :doc:`lj/charmm/coul/charmm (giko) `
 * :doc:`lj/charmm/coul/charmm/implicit (ko) `
 * :doc:`lj/charmm/coul/long (gikot) `
 * :doc:`lj/charmm/coul/long/soft (o) `
diff --git a/doc/src/Install_tarball.rst b/doc/src/Install_tarball.rst
index 7c9e834104..6f87df8a21 100644
--- a/doc/src/Install_tarball.rst
+++ b/doc/src/Install_tarball.rst
@@ -33,22 +33,19 @@ in its name, e.g. lammps-23Jun18.
 ----------
-You can also download a zip file via the "Clone or download" button on
-the `LAMMPS GitHub site `_. The file name will be lammps-master.zip
-which can be unzipped with the following command, to create
-a lammps-master dir:
+You can also download compressed tar or zip archives from the
+"Assets" section of the `LAMMPS GitHub releases site `_.
+The file name will be lammps-.zip which can be unzipped +with the following command, to create a lammps- dir: .. code-block:: bash $ unzip lammps*.zip -This version is the most up-to-date LAMMPS development version. It -will have the date of the most recent patch release (see the file -src/version.h). But it will also include any new bug-fixes or -features added since the last patch release. They will be included in -the next patch release tarball. +This version corresponds to the selected LAMMPS patch or stable +release. -.. _git: https://github.com/lammps/lammps +.. _git: https://github.com/lammps/lammps/releases ---------- diff --git a/doc/src/Python_atoms.rst b/doc/src/Python_atoms.rst index 92b9677d16..be0d4ff800 100644 --- a/doc/src/Python_atoms.rst +++ b/doc/src/Python_atoms.rst @@ -50,7 +50,7 @@ against invalid accesses. **Numpy Methods**: - * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array + * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array .. tab:: PyLammps/IPyLammps API diff --git a/doc/src/Python_module.rst b/doc/src/Python_module.rst index 59be645cbd..d2564986de 100644 --- a/doc/src/Python_module.rst +++ b/doc/src/Python_module.rst @@ -61,7 +61,7 @@ functions. Below is a detailed documentation of the API. .. autoclass:: lammps.lammps :members: -.. autoclass:: lammps.numpy::numpy_wrapper +.. autoclass:: lammps.numpy_wrapper::numpy_wrapper :members: ---------- @@ -134,8 +134,8 @@ Style Constants to request from computes or fixes. See :cpp:enum:`_LMP_STYLE_CONST` for the equivalent constants in the C library interface. Used in :py:func:`lammps.extract_compute`, :py:func:`lammps.extract_fix`, and their NumPy variants - :py:func:`lammps.numpy.extract_compute() ` and - :py:func:`lammps.numpy.extract_fix() `. + :py:func:`lammps.numpy.extract_compute() ` and + :py:func:`lammps.numpy.extract_fix() `. .. _py_type_constants: @@ -149,8 +149,8 @@ Type Constants to request from computes or fixes. See :cpp:enum:`_LMP_TYPE_CONST` for the equivalent constants in the C library interface. Used in :py:func:`lammps.extract_compute`, :py:func:`lammps.extract_fix`, and their NumPy variants - :py:func:`lammps.numpy.extract_compute() ` and - :py:func:`lammps.numpy.extract_fix() `. + :py:func:`lammps.numpy.extract_compute() ` and + :py:func:`lammps.numpy.extract_fix() `. .. _py_vartype_constants: @@ -170,6 +170,6 @@ Classes representing internal objects :members: :no-undoc-members: -.. autoclass:: lammps.numpy::NumPyNeighList +.. 
autoclass:: lammps.numpy_wrapper::NumPyNeighList :members: :no-undoc-members: diff --git a/doc/src/Python_neighbor.rst b/doc/src/Python_neighbor.rst index 80651b608f..cba117ad20 100644 --- a/doc/src/Python_neighbor.rst +++ b/doc/src/Python_neighbor.rst @@ -14,5 +14,5 @@ Neighbor list access **NumPy Methods:** -* :py:meth:`lammps.numpy.get_neighlist() `: Get neighbor list for given index, which uses NumPy arrays for its element neighbor arrays -* :py:meth:`lammps.numpy.get_neighlist_element_neighbors() `: Get element in neighbor list and its neighbors (as numpy array) +* :py:meth:`lammps.numpy.get_neighlist() `: Get neighbor list for given index, which uses NumPy arrays for its element neighbor arrays +* :py:meth:`lammps.numpy.get_neighlist_element_neighbors() `: Get element in neighbor list and its neighbors (as numpy array) diff --git a/doc/src/Python_objects.rst b/doc/src/Python_objects.rst index ec29863d38..4c8161b8bd 100644 --- a/doc/src/Python_objects.rst +++ b/doc/src/Python_objects.rst @@ -36,9 +36,9 @@ computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. Python subscripting. The values will be zero for atoms not in the specified group. - :py:meth:`lammps.numpy.extract_compute() `, - :py:meth:`lammps.numpy.extract_fix() `, and - :py:meth:`lammps.numpy.extract_variable() ` are + :py:meth:`lammps.numpy.extract_compute() `, + :py:meth:`lammps.numpy.extract_fix() `, and + :py:meth:`lammps.numpy.extract_variable() ` are equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. The :py:meth:`lammps.set_variable() ` method sets an @@ -54,9 +54,9 @@ computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. **NumPy Methods**: - * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays + * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays .. tab:: PyLammps/IPyLammps API diff --git a/doc/src/Speed_gpu.rst b/doc/src/Speed_gpu.rst index 56eb48cd0e..709a3ad3bb 100644 --- a/doc/src/Speed_gpu.rst +++ b/doc/src/Speed_gpu.rst @@ -1,11 +1,14 @@ GPU package =========== -The GPU package was developed by Mike Brown while at SNL and ORNL -and his collaborators, particularly Trung Nguyen (now at Northwestern). -It provides GPU versions of many pair styles and for parts of the -:doc:`kspace_style pppm ` for long-range Coulombics. -It has the following general features: +The GPU package was developed by Mike Brown while at SNL and ORNL (now +at Intel Corp.) and his collaborators, particularly Trung Nguyen (now at +Northwestern). Support for AMD GPUs via HIP was added by Vsevolod Nikolskiy +and coworkers at HSE University. + +The GPU package provides GPU versions of many pair styles and for +parts of the :doc:`kspace_style pppm ` for long-range +Coulombics. It has the following general features: * It is designed to exploit common GPU hardware configurations where one or more GPUs are coupled to many cores of one or more multi-core CPUs, @@ -24,8 +27,9 @@ It has the following general features: force vectors. 
* LAMMPS-specific code is in the GPU package. It makes calls to a
 generic GPU library in the lib/gpu directory. This library provides
- NVIDIA support as well as more general OpenCL support, so that the
- same functionality is supported on a variety of hardware.
+ either Nvidia support, AMD support, or more general OpenCL support
+ (for Nvidia GPUs, AMD GPUs, Intel GPUs, and multi-core CPUs),
+ so that the same functionality is supported on a variety of hardware.
 **Required hardware/software:**
@@ -45,12 +49,23 @@ to have the OpenCL headers and the (vendor neutral) OpenCL library
 installed. In OpenCL mode, the acceleration depends on having an `OpenCL Installable Client Driver (ICD) `_ installed. There can be multiple of them for the same or different hardware (GPUs, CPUs, Accelerators) installed at the same time. OpenCL refers to those
-as 'platforms'. The GPU library will select the **first** suitable platform,
-but this can be overridden using the device option of the :doc:`package `
+as 'platforms'. The GPU library will try to auto-select the best suitable platform,
+but this can be overridden using the platform option of the :doc:`package `
 command. Run lammps/lib/gpu/ocl_get_devices to get a list of available platforms and devices with a suitable ICD available.
-To compute and use this package in HIP mode, you have to have the AMD ROCm
+To compile and use this package for Intel GPUs, OpenCL or the Intel oneAPI
+HPC Toolkit can be installed using Linux package managers. The latter also
+provides optimized C++, MPI, and many other libraries and tools. See:
+
+* https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit/download.html
+
+If you do not have a discrete GPU card installed, this package can still provide
+significant speedups on some CPUs that include integrated GPUs. Additionally, for
+many Macs, OpenCL is already included with the OS and Makefiles are available
+in the lib/gpu directory.
+
+To compile and use this package in HIP mode, you have to have the AMD ROCm
 software installed. Versions of ROCm older than 3.5 are currently deprecated by AMD.
@@ -75,10 +90,20 @@ automatically if you create more MPI tasks/node than there are
 GPUs/node. E.g. with 8 MPI tasks/node and 2 GPUs, each GPU will be shared by 4 MPI tasks.
+The GPU package also has limited support for OpenMP for both
+multi-threading and vectorization of routines that are run on the CPUs.
+This requires that the GPU library and LAMMPS are built with flags to
+enable OpenMP support (e.g. -fopenmp). Some styles for time integration
+are also available in the GPU package. These run completely on the CPUs
+in full double precision, but exploit multi-threading and vectorization
+for faster performance.
+
 Use the "-sf gpu" :doc:`command-line switch `, which
 will automatically append "gpu" to styles that support it. Use the
 "-pk gpu Ng" :doc:`command-line switch ` to set Ng = # of
-GPUs/node to use.
+GPUs/node to use. If Ng is 0, the number is selected automatically as
+the number of matching GPUs that have the highest number of compute
+cores.
 .. code-block:: bash
@@ -87,8 +112,8 @@ GPUs/node to use.
 mpirun -np 48 -ppn 12 lmp_machine -sf gpu -pk gpu 2 -in in.script # ditto on 4 16-core nodes
 Note that if the "-sf gpu" switch is used, it also issues a default
-:doc:`package gpu 1 ` command, which sets the number of
-GPUs/node to 1.
+:doc:`package gpu 0 ` command, which will result in
+automatic selection of the number of GPUs to use.
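As a minimal illustration of the auto-selection default described above (a sketch; the binary name ``lmp_machine`` and input ``in.script`` are placeholders):

.. code-block:: bash

   # let the GPU package pick the platform and devices (implied "package gpu 0")
   mpirun -np 8 lmp_machine -sf gpu -in in.script

   # the same run, with the automatic device count made explicit
   mpirun -np 8 lmp_machine -sf gpu -pk gpu 0 -in in.script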
Using the "-pk" switch explicitly allows for setting of the number of GPUs/node to use and additional options. Its syntax is the same as @@ -138,6 +163,13 @@ Likewise, you should experiment with the precision setting for the GPU library to see if single or mixed precision will give accurate results, since they will typically be faster. +MPI parallelism typically outperforms OpenMP parallelism, but in some +cases using fewer MPI tasks and multiple OpenMP threads with the GPU +package can give better performance. 3-body potentials can often perform +better with multiple OMP threads because the inter-process communication +is higher for these styles with the GPU package in order to allow +deterministic results. + **Guidelines for best performance:** * Using multiple MPI tasks per GPU will often give the best performance, @@ -161,6 +193,12 @@ results, since they will typically be faster. :doc:`angle `, :doc:`dihedral `, :doc:`improper `, and :doc:`long-range ` calculations will not be included in the "Pair" time. +* Since only part of the pppm kspace style is GPU accelerated, it + may be faster to only use GPU acceleration for Pair styles with + long-range electrostatics. See the "pair/only" keyword of the + package command for a shortcut to do that. The work between kspace + on the CPU and non-bonded interactions on the GPU can be balanced + through adjusting the coulomb cutoff without loss of accuracy. * When the *mode* setting for the package gpu command is force/neigh, the time for neighbor list calculations on the GPU will be added into the "Pair" time, not the "Neigh" time. An additional breakdown of the diff --git a/doc/src/Speed_packages.rst b/doc/src/Speed_packages.rst index 600c4ac2b4..6210242413 100644 --- a/doc/src/Speed_packages.rst +++ b/doc/src/Speed_packages.rst @@ -16,7 +16,7 @@ These are the accelerator packages currently in LAMMPS, either as standard or user packages: +-----------------------------------------+-------------------------------------------------------+ -| :doc:`GPU Package ` | for NVIDIA GPUs as well as OpenCL support | +| :doc:`GPU Package ` | for GPUs via CUDA, OpenCL, or ROCm HIP | +-----------------------------------------+-------------------------------------------------------+ | :doc:`USER-INTEL Package ` | for Intel CPUs and Intel Xeon Phi | +-----------------------------------------+-------------------------------------------------------+ @@ -43,7 +43,7 @@ three kinds of hardware, via the listed packages: +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ | Many-core CPUs | :doc:`USER-INTEL `, :doc:`KOKKOS `, :doc:`USER-OMP `, :doc:`OPT ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ -| NVIDIA/AMD GPUs | :doc:`GPU `, :doc:`KOKKOS ` packages | +| GPUs | :doc:`GPU `, :doc:`KOKKOS ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ | Intel Phi/AVX | :doc:`USER-INTEL `, :doc:`KOKKOS ` packages | +-----------------+-----------------------------------------------------------------------------------------------------------------------------+ @@ -154,8 +154,8 @@ Here is a brief summary of what the various packages provide. Details are in the individual accelerator sections. 
* Styles with a "gpu" suffix are part of the GPU package and can be run - on NVIDIA or AMD GPUs. The speed-up on a GPU depends on a variety of - factors, discussed in the accelerator sections. + on Intel, NVIDIA, or AMD GPUs. The speed-up on a GPU depends on a + variety of factors, discussed in the accelerator sections. * Styles with an "intel" suffix are part of the USER-INTEL package. These styles support vectorized single and mixed precision calculations, in addition to full double precision. In extreme cases, diff --git a/doc/src/compute_temp_chunk.rst b/doc/src/compute_temp_chunk.rst index 77e2568fce..f1c34b42fa 100644 --- a/doc/src/compute_temp_chunk.rst +++ b/doc/src/compute_temp_chunk.rst @@ -153,7 +153,7 @@ temp/chunk calculation to a file is to use the :doc:`fix ave/time compute cc1 all chunk/atom molecule compute myChunk all temp/chunk cc1 temp - fix 1 all ave/time 100 1 100 c_myChunk file tmp.out mode vector + fix 1 all ave/time 100 1 100 c_myChunk[1] file tmp.out mode vector ---------- diff --git a/doc/src/fix_nh.rst b/doc/src/fix_nh.rst index 590211eda7..f40ce0c463 100644 --- a/doc/src/fix_nh.rst +++ b/doc/src/fix_nh.rst @@ -1,8 +1,10 @@ .. index:: fix nvt +.. index:: fix nvt/gpu .. index:: fix nvt/intel .. index:: fix nvt/kk .. index:: fix nvt/omp .. index:: fix npt +.. index:: fix npt/gpu .. index:: fix npt/intel .. index:: fix npt/kk .. index:: fix npt/omp @@ -13,12 +15,12 @@ fix nvt command =============== -Accelerator Variants: *nvt/intel*, *nvt/kk*, *nvt/omp* +Accelerator Variants: *nvt/gpu*, *nvt/intel*, *nvt/kk*, *nvt/omp* fix npt command =============== -Accelerator Variants: *npt/intel*, *npt/kk*, *npt/omp* +Accelerator Variants: *npt/gpu*, *npt/intel*, *npt/kk*, *npt/omp* fix nph command =============== diff --git a/doc/src/fix_nve.rst b/doc/src/fix_nve.rst index 71f8ec300f..ae472b1a38 100644 --- a/doc/src/fix_nve.rst +++ b/doc/src/fix_nve.rst @@ -1,4 +1,5 @@ .. index:: fix nve +.. index:: fix nve/gpu .. index:: fix nve/intel .. index:: fix nve/kk .. index:: fix nve/omp @@ -6,7 +7,7 @@ fix nve command =============== -Accelerator Variants: *nve/intel*, *nve/kk*, *nve/omp* +Accelerator Variants: *nve/gpu*, *nve/intel*, *nve/kk*, *nve/omp* Syntax """""" diff --git a/doc/src/fix_nve_asphere.rst b/doc/src/fix_nve_asphere.rst index af80460b32..c49de34d0b 100644 --- a/doc/src/fix_nve_asphere.rst +++ b/doc/src/fix_nve_asphere.rst @@ -1,10 +1,11 @@ .. index:: fix nve/asphere +.. index:: fix nve/asphere/gpu .. index:: fix nve/asphere/intel fix nve/asphere command ======================= -Accelerator Variants: *nve/asphere/intel* +Accelerator Variants: *nve/asphere/gpu*, *nve/asphere/intel* Syntax """""" diff --git a/doc/src/package.rst b/doc/src/package.rst index 6a5ff44077..1613ff2fae 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -18,7 +18,7 @@ Syntax *gpu* args = Ngpu keyword value ... 
Ngpu = # of GPUs per node
 zero or more keyword/value pairs may be appended
- keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *device* or *blocksize*
+ keywords = *neigh* or *newton* or *pair/only* or *binsize* or *split* or *gpuID* or *tpa* or *omp* or *blocksize* or *platform* or *device_type* or *ocl_args*
 *neigh* value = *yes* or *no*
 yes = neighbor list build on GPU (default)
 no = neighbor list build on CPU
@@ -32,17 +32,20 @@ Syntax
 size = bin size for neighbor list construction (distance units)
 *split* = fraction
 fraction = fraction of atoms assigned to GPU (default = 1.0)
- *gpuID* values = first last
- first = ID of first GPU to be used on each node
- last = ID of last GPU to be used on each node
- *tpa* value = Nthreads
- Nthreads = # of GPU threads used per atom
- *device* value = device_type or platform_id:device_type or platform_id:custom,val1,val2,val3,..,val13
- platform_id = numerical OpenCL platform id (default: -1)
- device_type = *kepler* or *fermi* or *cypress* or *intel* or *phi* or *generic* or *custom*
- val1,val2,... = custom OpenCL tune parameters (see below for details)
+ *tpa* value = Nlanes
+ Nlanes = # of GPU vector lanes (CUDA threads) used per atom
 *blocksize* value = size
 size = thread block size for pair force computation
+ *omp* value = Nthreads
+ Nthreads = number of OpenMP threads to use on CPU (default = 0)
+ *platform* value = id
+ id = For OpenCL, platform ID for the GPU or accelerator
+ *gpuID* values = id
+ id = ID of first GPU to be used on each node
+ *device_type* value = *intelgpu* or *nvidiagpu* or *amdgpu* or *applegpu* or *generic* or *custom,val1,val2,...*
+ val1,val2,... = custom OpenCL accelerator configuration parameters (see below for details)
+ *ocl_args* value = args
+ args = List of additional OpenCL compiler arguments delimited by colons
 *intel* args = NPhi keyword value ...
 Nphi = # of co-processors per node
 zero or more keyword/value pairs may be appended
@@ -100,7 +103,7 @@ Syntax
 off = use device acceleration (e.g. GPU) for all available styles in the KOKKOS package (default)
 on = use device acceleration only for pair styles (and host acceleration for others)
 *omp* args = Nthreads keyword value ...
- Nthread = # of OpenMP threads to associate with each MPI process
+ Nthreads = # of OpenMP threads to associate with each MPI process
 zero or more keyword/value pairs may be appended
 keywords = *neigh*
 *neigh* value = *yes* or *no*
@@ -112,12 +115,10 @@ Examples
 .. code-block:: LAMMPS
- package gpu 1
+ package gpu 0
 package gpu 1 split 0.75
 package gpu 2 split -1.0
- package gpu 1 device kepler
- package gpu 1 device 2:generic
- package gpu 1 device custom,32,4,8,256,11,128,256,128,32,64,8,128,128
+ package gpu 0 omp 2 device_type intelgpu
 package kokkos neigh half comm device
 package omp 0 neigh no
 package omp 4
@@ -174,10 +175,18 @@ simulations.
 The *gpu* style invokes settings associated with the use of the GPU package.
-The *Ngpu* argument sets the number of GPUs per node. There must be
-at least as many MPI tasks per node as GPUs, as set by the mpirun or
-mpiexec command. If there are more MPI tasks (per node)
-than GPUs, multiple MPI tasks will share each GPU.
+The *Ngpu* argument sets the number of GPUs per node. If *Ngpu* is 0
+and no other keywords are specified, GPU or accelerator devices are
+auto-selected. In this process, all platforms are searched for
+accelerator devices and GPUs are chosen if available. The device with
+the highest number of compute cores is selected.
The number of devices
+is increased to be the number of matching accelerators with the same
+number of compute cores. If there are more devices than MPI tasks,
+the additional devices will be unused. The auto-selection of GPUs/
+accelerator devices and platforms can be restricted by specifying
+a non-zero value for *Ngpu* and / or using the *gpuID*, *platform*,
+and *device_type* keywords as described below. If there are more MPI
+tasks (per node) than GPUs, multiple MPI tasks will share each GPU.
 Optional keyword/value pairs can also be specified. Each has a default value as listed below.
@@ -212,18 +221,8 @@ overlapped with all other computations on the CPU.
 The *binsize* keyword sets the size of bins used to bin atoms in neighbor list builds performed on the GPU, if *neigh* = *yes* is set.
-If *binsize* is set to 0.0 (the default), then bins = the size of the
-pairwise cutoff + neighbor skin distance. This is 2x larger than the
-LAMMPS default used for neighbor list building on the CPU. This will
-be close to optimal for the GPU, so you do not normally need to use
-this keyword. Note that if you use a longer-than-usual pairwise
-cutoff, e.g. to allow for a smaller fraction of KSpace work with a
-:doc:`long-range Coulombic solver ` because the GPU is
-faster at performing pairwise interactions, then it may be optimal to
-make the *binsize* smaller than the default. For example, with a
-cutoff of 20\*sigma in LJ :doc:`units ` and a neighbor skin
-distance of sigma, a *binsize* = 5.25\*sigma can be more efficient than
-the default.
+If *binsize* is set to 0.0 (the default), then the binsize is set
+automatically using heuristics in the GPU package.
 The *split* keyword can be used for load balancing force calculations
 between CPU and GPU cores in GPU-enabled pair styles. If 0 < *split* <
@@ -257,63 +256,79 @@ cores would perform force calculations for some fraction of the
 particles at the same time the GPUs performed force calculation for the other particles.
-The *gpuID* keyword allows selection of which GPUs on each node will
-be used for a simulation. The *first* and *last* values specify the
-GPU IDs to use (from 0 to Ngpu-1). By default, first = 0 and last =
-Ngpu-1, so that all GPUs are used, assuming Ngpu is set to the number
-of physical GPUs. If you only wish to use a subset, set Ngpu to a
-smaller number and first/last to a sub-range of the available GPUs.
+The *gpuID* keyword is used to specify the first ID for the GPU or
+other accelerator that LAMMPS will use. For example, if the ID is
+1 and *Ngpu* is 3, GPUs 1-3 will be used. Device IDs should be
+determined from the output of nvc_get_devices, ocl_get_devices,
+or hip_get_devices
+as provided in the lib/gpu directory. When using OpenCL with
+accelerators that have main memory NUMA, the accelerators can be
+split into smaller virtual accelerators for more efficient use
+with MPI.
-The *tpa* keyword sets the number of GPU thread per atom used to
+The *tpa* keyword sets the number of GPU vector lanes per atom used to
 perform force calculations. With a default value of 1, the number of
-threads will be chosen based on the pair style, however, the value can
+lanes will be chosen based on the pair style; however, the value can
 be set explicitly with this keyword to fine-tune performance. For
 large cutoffs or with a small number of particles per GPU, increasing
-the value can improve performance. The number of threads per atom must
-be a power of 2 and currently cannot be greater than 32.
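For instance (a sketch under the syntax above; the binary and input names are placeholders), *gpuID* and *tpa* can be combined on the command line to select devices 1-3 and request 8 vector lanes per atom:

.. code-block:: bash

   # three devices starting at device ID 1, 8 GPU vector lanes per atom
   mpirun -np 6 lmp_machine -sf gpu -pk gpu 3 gpuID 1 tpa 8 -in in.script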
-
-The *device* keyword can be used to tune parameters optimized for a
-specific accelerator and platform when using OpenCL. OpenCL supports
-the concept of a **platform**\ , which represents one or more devices that
-share the same driver (e.g. there would be a different platform for
-GPUs from different vendors or for CPU based accelerator support).
-In LAMMPS only one platform can be active at a time and by default
-the first platform with an accelerator is selected. This is equivalent
-to using a platform ID of -1. The platform ID is a number corresponding
-to the output of the ocl_get_devices tool. The platform ID is passed
-to the GPU library, by prefixing the *device* keyword with that number
-separated by a colon. For CUDA, the *device* keyword is ignored.
-Currently, the device tuning support is limited to NVIDIA Kepler, NVIDIA
-Fermi, AMD Cypress, Intel x86_64 CPU, Intel Xeon Phi, or a generic device.
-More devices may be added later. The default device type can be
-specified when building LAMMPS with the GPU library, via setting a
-variable in the lib/gpu/Makefile that is used.
-
-In addition, a device type *custom* is available, which is followed by
-13 comma separated numbers, which allows to set those tweakable parameters
-from the package command. It can be combined with the (colon separated)
-platform id. The individual settings are:
-
-* MEM_THREADS
-* THREADS_PER_ATOM
-* THREADS_PER_CHARGE
-* BLOCK_PAIR
-* MAX_SHARED_TYPES
-* BLOCK_NBOR_BUILD
-* BLOCK_BIO_PAIR
-* BLOCK_ELLIPSE
-* WARP_SIZE
-* PPPM_BLOCK_1D
-* BLOCK_CELL_2D
-* BLOCK_CELL_ID
-* MAX_BIO_SHARED_TYPES
+the value can improve performance. The number of lanes per atom must
+be a power of 2 and currently cannot be greater than the SIMD width
+for the GPU / accelerator. If it exceeds the SIMD width, it
+will automatically be decreased to meet the restriction.
 The *blocksize* keyword allows you to tweak the number of threads used
 per thread block. This number should be a multiple of 32 (for GPUs)
 and its maximum depends on the specific GPU hardware. Typical choices
 are 64, 128, or 256. A larger block size increases occupancy of
 individual GPU cores, but reduces the total number of thread blocks,
-thus may lead to load imbalance.
+thus may lead to load imbalance. On modern hardware, the sensitivity
+to the blocksize is typically low.
+
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
+
+The *platform* keyword is only used with OpenCL to specify the ID for
+an OpenCL platform. See the output from ocl_get_devices in the lib/gpu
+directory. In LAMMPS only one platform can be active at a time and by
+default (id=-1) the platform is auto-selected to find the GPU with the
+most compute cores. When *Ngpu* or other keywords are specified, the
+auto-selection is appropriately restricted. For example, if *Ngpu* is
+3, only platforms with at least 3 accelerators are considered. Similar
+restrictions can be enforced by the *gpuID* and *device_type* keywords.
+
+The *device_type* keyword can be used for OpenCL to specify the type of
+GPU to use or specify a custom configuration for an accelerator.
In most cases this selection will be automatic and there is no need to use the
+keyword. The *applegpu* type is not specific to a particular GPU vendor,
+but is separate due to the more restrictive Apple OpenCL implementation.
+For expert users, a custom configuration can be specified with the
+*custom* keyword followed by these parameters:
+
+CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH,
+THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR,
+BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD,
+BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES,
+PPPM_MAX_SPLINE.
+
+CONFIG_ID can be 0. SHUFFLE_AVAIL in {0,1} indicates that inline-PTX
+(NVIDIA) or OpenCL extensions (Intel) should be used for horizontal
+vector operations. FAST_MATH in {0,1} indicates that OpenCL fast math
+optimizations are used during the build and hardware-accelerated
+transcendental functions are used when available. THREADS_PER_* give the
+default *tpa* values for ellipsoidal models, styles using charge, and
+any other styles. The BLOCK_* parameters specify the block sizes for
+various kernel calls and the MAX_*SHARED* parameters are used to
+determine the amount of local shared memory to use for storing model
+parameters.
+
+For OpenCL, the routines are compiled at runtime for the specified GPU
+or accelerator architecture. The *ocl_args* keyword can be used to
+specify additional flags for the runtime build.
 ----------
@@ -331,44 +346,13 @@ built with co-processor support.
 Optional keyword/value pairs can also be specified. Each has a default value as listed below.
-The *omp* keyword determines the number of OpenMP threads allocated
-for each MPI task when any portion of the interactions computed by a
-USER-INTEL pair style are run on the CPU. This can be the case even
-if LAMMPS was built with co-processor support; see the *balance*
-keyword discussion below. If you are running with less MPI tasks/node
-than there are CPUs, it can be advantageous to use OpenMP threading on
-the CPUs.
-
-.. note::
-
- The *omp* keyword has nothing to do with co-processor threads on
- the Xeon Phi; see the *tpc* and *tptask* keywords below for a
- discussion of co-processor threads.
-
-The *Nthread* value for the *omp* keyword sets the number of OpenMP
-threads allocated for each MPI task. Setting *Nthread* = 0 (the
-default) instructs LAMMPS to use whatever value is the default for the
-given OpenMP environment. This is usually determined via the
-*OMP_NUM_THREADS* environment variable or the compiler runtime, which
-is usually a value of 1.
-
-For more details, including examples of how to set the OMP_NUM_THREADS
-environment variable, see the discussion of the *Nthreads* setting on
-this doc page for the "package omp" command. Nthreads is a required
-argument for the USER-OMP package. Its meaning is exactly the same
-for the USER-INTEL package.
-
-.. note::
-
- If you build LAMMPS with both the USER-INTEL and USER-OMP
- packages, be aware that both packages allow setting of the *Nthreads*
- value via their package commands, but there is only a single global
- *Nthreads* value used by OpenMP. Thus if both package commands are
- invoked, you should insure the two values are consistent. If they are
- not, the last one invoked will take precedence, for both packages.
- Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel"
- command, followed by a "package omp" command, both with a setting of
- *Nthreads* = 0.
+The *Nthreads* value for the *omp* keyword sets the number of OpenMP
+threads allocated for each MPI task. This setting controls OpenMP
+parallelism only for routines run on the CPUs. For more details on
+setting the number of OpenMP threads, see the discussion of the
+*Nthreads* setting on this doc page for the "package omp" command.
+The meaning of *Nthreads* is exactly the same for the GPU, USER-INTEL,
+and USER-OMP packages.
 The *mode* keyword determines the precision mode to use for computing pair style forces, either on the CPU or on the co-processor,
@@ -574,7 +558,7 @@ result in better performance for certain configurations and system
 sizes. The *omp* style invokes settings associated with the use of the USER-OMP package.
-The *Nthread* argument sets the number of OpenMP threads allocated for
+The *Nthreads* argument sets the number of OpenMP threads allocated for
 each MPI task. For example, if your system has nodes with dual quad-core processors, it has a total of 8 cores per node. You could use two MPI tasks per node (e.g. using the -ppn option of the mpirun
@@ -583,7 +567,7 @@ This would use all 8 cores on each node. Note that the product of MPI
 tasks \* threads/task should not exceed the physical number of cores (on a node), otherwise performance will suffer.
-Setting *Nthread* = 0 instructs LAMMPS to use whatever value is the
+Setting *Nthreads* = 0 instructs LAMMPS to use whatever value is the
 default for the given OpenMP environment. This is usually determined via the *OMP_NUM_THREADS* environment variable or the compiler runtime. Note that in most cases the default for OpenMP capable
@@ -614,6 +598,24 @@ input. Not all features of LAMMPS support OpenMP threading via the
 USER-OMP package and the parallel efficiency can be very different, too.
+.. note::
+
+ If you build LAMMPS with the GPU, USER-INTEL, and / or USER-OMP
+ packages, be aware these packages all allow setting of the *Nthreads*
+ value via their package commands, but there is only a single global
+ *Nthreads* value used by OpenMP. Thus if multiple package commands are
+ invoked, you should ensure the values are consistent. If they are
+ not, the last one invoked will take precedence, for all packages.
+ Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a
+ "package omp" command, both with a setting of *Nthreads* = 0. Likewise
+ for a hybrid suffix for gpu and omp. Note that KOKKOS also supports
+ setting the number of OpenMP threads from the command line using the
+ "-k on" :doc:`command-line switch `. The default for
+ KOKKOS is 1 thread per MPI task, so any other number of threads should
+ be explicitly set using the "-k on" command-line switch (and this
+ setting should be consistent with settings from any other packages
+ used).
+
 Optional keyword/value pairs can also be specified. Each has a default value as listed below.
@@ -658,9 +660,9 @@ Related commands
 Default
 """""""
-For the GPU package, the default is Ngpu = 1 and the option defaults
+For the GPU package, the default is Ngpu = 0 and the option defaults
 are neigh = yes, newton = off, binsize = 0.0, split = 1.0, gpuID = 0
-to Ngpu-1, tpa = 1, and device = not used. These settings are made
+to Ngpu-1, tpa = 1, omp = 0, and platform = -1. These settings are made
 automatically if the "-sf gpu" :doc:`command-line switch ` is used. If it is not used, you must invoke the package gpu command in your input script or via the "-pk gpu" :doc:`command-line switch `.
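As a sketch of keeping the single global OpenMP thread count consistent across packages (names are placeholders; per the note above, the hybrid suffix issues "package gpu 0" and "package omp 0" by default):

.. code-block:: bash

   # 2 MPI tasks with 4 OpenMP threads each; GPU and USER-OMP agree on Nthreads
   export OMP_NUM_THREADS=4
   mpirun -np 2 lmp_machine -sf hybrid gpu omp -pk gpu 0 omp 4 -in in.script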
diff --git a/doc/src/pair_charmm.rst b/doc/src/pair_charmm.rst index 6d81266a35..b3d2a2b878 100644 --- a/doc/src/pair_charmm.rst +++ b/doc/src/pair_charmm.rst @@ -1,4 +1,5 @@ .. index:: pair_style lj/charmm/coul/charmm +.. index:: pair_style lj/charmm/coul/charmm/gpu .. index:: pair_style lj/charmm/coul/charmm/intel .. index:: pair_style lj/charmm/coul/charmm/kk .. index:: pair_style lj/charmm/coul/charmm/omp @@ -19,7 +20,7 @@ pair_style lj/charmm/coul/charmm command ======================================== -Accelerator Variants: *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp* +Accelerator Variants: *lj/charmm/coul/charmm/gpu*, *lj/charmm/coul/charmm/intel*, *lj/charmm/coul/charmm/kk*, *lj/charmm/coul/charmm/omp* pair_style lj/charmm/coul/charmm/implicit command ================================================= diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt index e025e23b09..00fa6ecfaf 100644 --- a/doc/utils/requirements.txt +++ b/doc/utils/requirements.txt @@ -1,6 +1,6 @@ Sphinx sphinxcontrib-spelling -git+https://github.com/akohlmey/sphinx-fortran@parallel-read +git+git://github.com/akohlmey/sphinx-fortran@parallel-read sphinx_tabs breathe Pygments diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index 9937a98850..982e1fde2a 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -2297,6 +2297,7 @@ omegaz Omelyan omp OMP +oneAPI onelevel oneway onn @@ -2528,6 +2529,7 @@ ptm PTM ptol ptr +PTX pu purdue Purohit diff --git a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability index 2c101ac77c..e81fedc34a 100644 --- a/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability +++ b/examples/USER/reaction/tiny_nylon/in.tiny_nylon.stabilized_variable_probability @@ -22,7 +22,7 @@ improper_style class2 read_data tiny_nylon.data variable runsteps equal 1000 -variable prob1 equal step/v_runsteps*2 +variable prob1 equal step/v_runsteps*2+0.1 variable prob2 equal (step/v_runsteps)>0.5 velocity all create 300.0 4928459 dist gaussian diff --git a/lib/gpu/Makefile.cuda_mps b/lib/gpu/Makefile.cuda_mps index 172640ce6a..21aac89151 100644 --- a/lib/gpu/Makefile.cuda_mps +++ b/lib/gpu/Makefile.cuda_mps @@ -51,7 +51,7 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # host code compiler and settings -CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC +CUDR_CPP = mpicxx -fopenmp -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC CUDR_OPTS = -O2 $(LMP_INC) CUDR = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PROXY) $(CUDA_PRECISION) $(CUDA_INCLUDE) \ $(CUDPP_OPT) diff --git a/lib/gpu/Makefile.hip b/lib/gpu/Makefile.hip index e2fd3c22d7..dbdef433ec 100644 --- a/lib/gpu/Makefile.hip +++ b/lib/gpu/Makefile.hip @@ -17,7 +17,7 @@ LMP_INC = -DLAMMPS_SMALLBIG HIP_PRECISION = -D_SINGLE_DOUBLE HIP_OPTS = -O3 -HIP_HOST_OPTS = -Wno-deprecated-declarations +HIP_HOST_OPTS = -Wno-deprecated-declarations -fopenmp HIP_HOST_INCLUDE = # use device sort diff --git a/lib/gpu/Makefile.lammps.mac_ocl b/lib/gpu/Makefile.lammps.mac_ocl index f6c8a36430..0073efa2ba 100644 --- a/lib/gpu/Makefile.lammps.mac_ocl +++ b/lib/gpu/Makefile.lammps.mac_ocl @@ -1,5 +1,5 @@ # Settings that the LAMMPS build will import when this package library is used -gpu_SYSINC = +gpu_SYSINC = -DFFT_SINGLE gpu_SYSLIB = 
-framework OpenCL gpu_SYSPATH = diff --git a/lib/gpu/Makefile.linux_opencl b/lib/gpu/Makefile.linux_opencl index 2aea7f5a46..43d012dc4a 100644 --- a/lib/gpu/Makefile.linux_opencl +++ b/lib/gpu/Makefile.linux_opencl @@ -1,25 +1,21 @@ # /* ---------------------------------------------------------------------- -# Generic Linux Makefile for OpenCL +# Generic Linux Makefile for OpenCL - Mixed precision # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps EXTRAMAKE = Makefile.lammps.opencl -# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device - # this setting should match LAMMPS Makefile # one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL LMP_INC = -DLAMMPS_SMALLBIG -OCL_INC = -I/usr/local/cuda/include # Path to CL directory -OCL_CPP = mpic++ $(DEFAULT_DEVICE) -O3 -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -std=c++11 -OCL_LINK = -L/usr/local/cuda/lib64 -lOpenCL +OCL_INC = +OCL_CPP = mpic++ -std=c++11 -O3 -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) +OCL_LINK = -lOpenCL OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -fopenmp -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ @@ -28,4 +24,3 @@ AR = ar BSH = /bin/sh include Opencl.makefile - diff --git a/lib/gpu/Makefile.mac_opencl b/lib/gpu/Makefile.mac_opencl index 62b58c1cef..ae7e8ca6fd 100644 --- a/lib/gpu/Makefile.mac_opencl +++ b/lib/gpu/Makefile.mac_opencl @@ -1,19 +1,17 @@ # /* ---------------------------------------------------------------------- -# Generic Mac Makefile for OpenCL +# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE # ------------------------------------------------------------------------- */ # which file will be copied to Makefile.lammps EXTRAMAKE = Makefile.lammps.mac_ocl -OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -# OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device +LMP_INC = -DLAMMPS_SMALLBIG -OCL_CPP = mpic++ -O3 -DMPI_GERYON -DUCL_NO_EXIT +OCL_CPP = clang++ -std=c++11 -O3 -I../../src/STUBS OCL_LINK = -framework OpenCL OCL_PREC = -D_SINGLE_SINGLE +OCL_TUNE = -DUCL_NO_EXIT BIN_DIR = ./ OBJ_DIR = ./ diff --git a/lib/gpu/Makefile.mac_opencl_mpi b/lib/gpu/Makefile.mac_opencl_mpi new file mode 100644 index 0000000000..9be9f07e93 --- /dev/null +++ b/lib/gpu/Makefile.mac_opencl_mpi @@ -0,0 +1,23 @@ +# /* ---------------------------------------------------------------------- +# Generic Mac Makefile for OpenCL - Single precision with FFT_SINGLE +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.mac_ocl + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_CPP = mpicxx -std=c++11 -O3 -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +OCL_LINK = -framework OpenCL +OCL_PREC = -D_SINGLE_SINGLE +OCL_TUNE = -DUCL_NO_EXIT -DMPI_GERYON + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile + diff --git a/lib/gpu/Makefile.oneapi b/lib/gpu/Makefile.oneapi new file mode 100644 index 0000000000..015ab47057 --- /dev/null +++ b/lib/gpu/Makefile.oneapi @@ -0,0 +1,26 @@ +# /* ---------------------------------------------------------------------- 
+# Generic Linux Makefile for OpenCL +# ------------------------------------------------------------------------- */ + +# which file will be copied to Makefile.lammps + +EXTRAMAKE = Makefile.lammps.opencl + +# this setting should match LAMMPS Makefile +# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL + +LMP_INC = -DLAMMPS_SMALLBIG + +OCL_INC = +OCL_CPP = mpiicpc -std=c++11 -xHost -O2 -qopenmp -qopenmp-simd -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) +OCL_LINK = -lOpenCL +OCL_PREC = -D_SINGLE_DOUBLE +OCL_TUNE = -DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -fp-model fast=2 -no-prec-div + +BIN_DIR = ./ +OBJ_DIR = ./ +LIB_DIR = ./ +AR = ar +BSH = /bin/sh + +include Opencl.makefile diff --git a/lib/gpu/Makefile.opencl b/lib/gpu/Makefile.opencl deleted file mode 100644 index aa7806b542..0000000000 --- a/lib/gpu/Makefile.opencl +++ /dev/null @@ -1,92 +0,0 @@ -# /* ---------------------------------------------------------------------- -# Generic Linux Makefile for OpenCL -# ------------------------------------------------------------------------- */ - -# which file will be copied to Makefile.lammps - -EXTRAMAKE = Makefile.lammps.opencl - -# this setting should match LAMMPS Makefile -# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG and LAMMPS_SMALLSMALL - -LMP_INC = -DLAMMPS_SMALLBIG - -# precision for GPU calculations -# -D_SINGLE_SINGLE # Single precision for all calculations -# -D_DOUBLE_DOUBLE # Double precision for all calculations -# -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double - -OCL_PREC = -D_SINGLE_DOUBLE - -BIN_DIR = ./ -OBJ_DIR = ./ -LIB_DIR = ./ -AR = ar -BSH = /bin/sh - -# Compiler and linker settings - -# OCL_TUNE = -DFERMI_OCL # -- Uncomment for NVIDIA Fermi -# OCL_TUNE = -DKEPLER_OCL # -- Uncomment for NVIDIA Kepler -# OCL_TUNE = -DCYPRESS_OCL # -- Uncomment for AMD Cypress -OCL_TUNE = -DGENERIC_OCL # -- Uncomment for generic device - -OCL_INC = -I/usr/local/cuda/include # Path to CL directory -OCL_CPP = mpic++ $(DEFAULT_DEVICE) -g -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK $(LMP_INC) $(OCL_INC) -OCL_LINK = -lOpenCL -OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL - -# Headers for Geryon -UCL_H = $(wildcard ./geryon/ucl*.h) -OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h -PRE1_H = lal_preprocessor.h lal_aux_fun1.h -ALL_H = $(OCL_H) $(wildcard ./lal_*.h) - -# Source files -SRCS := $(wildcard ./lal_*.cpp) -OBJS := $(subst ./,$(OBJ_DIR)/,$(SRCS:%.cpp=%.o)) -CUS := $(wildcard lal_*.cu) -KERS := $(subst ./,$(OBJ_DIR)/,$(CUS:lal_%.cu=%_cl.h)) -KERS := $(addprefix $(OBJ_DIR)/, $(KERS)) - -# targets - -GPU_LIB = $(LIB_DIR)/libgpu.a - -EXECS = $(BIN_DIR)/ocl_get_devices - -all: $(OBJ_DIR) $(KERS) $(GPU_LIB) $(EXECS) - -$(OBJ_DIR): - mkdir -p $@ - -# device code compilation - -$(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) - $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; - -# host code compilation - -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) - $(OCL) -o $@ -c $< -I$(OBJ_DIR) - -# build libgpu.a - -$(GPU_LIB): $(OBJS) - $(AR) -crusv $(GPU_LIB) $(OBJS) - @cp $(EXTRAMAKE) Makefile.lammps - -# test app for querying device info - -$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) - $(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) - -clean: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo - -veryclean: clean - -rm -rf *~ *.linkinfo - -cleanlib: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(KERS) *.linkinfo - diff --git a/lib/gpu/Nvidia.makefile b/lib/gpu/Nvidia.makefile index 6716388562..d3275b890f 
100644 --- a/lib/gpu/Nvidia.makefile +++ b/lib/gpu/Nvidia.makefile @@ -1,6 +1,7 @@ # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) -NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h +NVD_H = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h \ + lal_pre_cuda_hip.h ALL_H = $(NVD_H) $(wildcard ./lal_*.h) # Source files @@ -39,17 +40,21 @@ BIN2C = $(CUDA_HOME)/bin/bin2c # device code compilation -$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h +$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ + lal_pre_cuda_hip.h $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu $(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin $(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h + rm $(OBJ_DIR)/pppm_f.cubin -$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h +$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h \ + lal_pre_cuda_hip.h $(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu $(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin $(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h + rm $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/%_cubin.h: lal_%.cu $(ALL_H) $(CUDA) --fatbin -DNV_KERNEL -o $(OBJ_DIR)/$*.cubin $(OBJ_DIR)/lal_$*.cu @@ -93,7 +98,7 @@ $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H) $(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda clean: - -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.linkinfo + -rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CUHS) *.cubin *.linkinfo veryclean: clean -rm -rf *~ *.linkinfo diff --git a/lib/gpu/Opencl.makefile b/lib/gpu/Opencl.makefile index 996a564998..2ff98827d4 100644 --- a/lib/gpu/Opencl.makefile +++ b/lib/gpu/Opencl.makefile @@ -1,8 +1,15 @@ +# Common headers for kernels +PRE1_H = lal_preprocessor.h lal_aux_fun1.h + # Headers for Geryon UCL_H = $(wildcard ./geryon/ucl*.h) -OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_preprocessor.h -PRE1_H = lal_preprocessor.h lal_aux_fun1.h -ALL_H = $(OCL_H) $(wildcard ./lal_*.h) +OCL_H = $(wildcard ./geryon/ocl*.h) $(UCL_H) lal_precision.h + +# Headers for Host files +HOST_H = lal_answer.h lal_atom.h lal_balance.h lal_base_atomic.h \ + lal_base_charge.h lal_base_dipole.h lal_base_dpd.h \ + lal_base_ellipsoid.h lal_base_three.h lal_device.h lal_neighbor.h \ + lal_neighbor_shared.h lal_pre_ocl_config.h $(OCL_H) # Source files SRCS := $(wildcard ./lal_*.cpp) @@ -28,12 +35,75 @@ OCL = $(OCL_CPP) $(OCL_PREC) $(OCL_TUNE) -DUSE_OPENCL # device code compilation +$(OBJ_DIR)/atom_cl.h: lal_atom.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh atom lal_preprocessor.h lal_atom.cu $(OBJ_DIR)/atom_cl.h + +$(OBJ_DIR)/neighbor_cpu_cl.h: lal_neighbor_cpu.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_cpu lal_preprocessor.h lal_neighbor_cpu.cu $(OBJ_DIR)/neighbor_cpu_cl.h + +$(OBJ_DIR)/neighbor_gpu_cl.h: lal_neighbor_gpu.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh neighbor_gpu lal_preprocessor.h lal_neighbor_gpu.cu $(OBJ_DIR)/neighbor_gpu_cl.h + +$(OBJ_DIR)/device_cl.h: lal_device.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh device lal_preprocessor.h lal_device.cu $(OBJ_DIR)/device_cl.h + +$(OBJ_DIR)/pppm_cl.h: lal_pppm.cu lal_preprocessor.h + $(BSH) ./geryon/file_to_cstr.sh pppm lal_preprocessor.h lal_pppm.cu $(OBJ_DIR)/pppm_cl.h; + +$(OBJ_DIR)/ellipsoid_nbor_cl.h: lal_ellipsoid_nbor.cu lal_preprocessor.h + $(BSH) 
./geryon/file_to_cstr.sh ellipsoid_nbor lal_preprocessor.h lal_ellipsoid_nbor.cu $(OBJ_DIR)/ellipsoid_nbor_cl.h + +$(OBJ_DIR)/gayberne_cl.h: lal_gayberne.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh gayberne $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne.cu $(OBJ_DIR)/gayberne_cl.h; + +$(OBJ_DIR)/gayberne_lj_cl.h: lal_gayberne_lj.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh gayberne_lj $(PRE1_H) lal_ellipsoid_extra.h lal_gayberne_lj.cu $(OBJ_DIR)/gayberne_lj_cl.h; + +$(OBJ_DIR)/re_squared_cl.h: lal_re_squared.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh re_squared $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared.cu $(OBJ_DIR)/re_squared_cl.h; + +$(OBJ_DIR)/re_squared_lj_cl.h: lal_re_squared_lj.cu $(PRE1_H) lal_ellipsoid_extra.h + $(BSH) ./geryon/file_to_cstr.sh re_squared_lj $(PRE1_H) lal_ellipsoid_extra.h lal_re_squared_lj.cu $(OBJ_DIR)/re_squared_lj_cl.h; + +$(OBJ_DIR)/tersoff_cl.h: lal_tersoff.cu $(PRE1_H) lal_tersoff_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff $(PRE1_H) lal_tersoff_extra.h lal_tersoff.cu $(OBJ_DIR)/tersoff_cl.h; + +$(OBJ_DIR)/tersoff_mod_cl.h: lal_tersoff_mod.cu $(PRE1_H) lal_tersoff_mod_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff_mod $(PRE1_H) lal_tersoff_mod_extra.h lal_tersoff_mod.cu $(OBJ_DIR)/tersoff_mod_cl.h; + +$(OBJ_DIR)/tersoff_zbl_cl.h: lal_tersoff_zbl.cu $(PRE1_H) lal_tersoff_zbl_extra.h + $(BSH) ./geryon/file_to_cstr.sh tersoff_zbl $(PRE1_H) lal_tersoff_zbl_extra.h lal_tersoff_zbl.cu $(OBJ_DIR)/tersoff_zbl_cl.h; + $(OBJ_DIR)/%_cl.h: lal_%.cu $(PRE1_H) $(BSH) ./geryon/file_to_cstr.sh $* $(PRE1_H) $< $@; # host code compilation -$(OBJ_DIR)/lal_%.o: lal_%.cpp $(KERS) +$(OBJ_DIR)/lal_answer.o: lal_answer.cpp $(HOST_H) + $(OCL) -o $@ -c lal_answer.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_dpd_tstat_ext.o: lal_dpd_tstat_ext.cpp lal_dpd.h $(HOST_H) + $(OCL) -o $@ -c lal_dpd_tstat_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_alloy_ext.o: lal_eam_alloy_ext.cpp lal_eam.h $(HOST_H) + $(OCL) -o $@ -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_eam_fs_ext.o: lal_eam_fs_ext.cpp lal_eam.h $(HOST_H) + $(OCL) -o $@ -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp $(HOST_H) + $(OCL) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp $(HOST_H) + $(OCL) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%_ext.o: lal_%_ext.cpp lal_%.h $(HOST_H) + $(OCL) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_base_%.o: lal_base_%.cpp $(HOST_H) + $(OCL) -o $@ -c $< -I$(OBJ_DIR) + +$(OBJ_DIR)/lal_%.o: lal_%.cpp %_cl.h $(HOST_H) $(OCL) -o $@ -c $< -I$(OBJ_DIR) $(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) diff --git a/lib/gpu/README b/lib/gpu/README index dfa8dcf7ff..dfffe11b81 100644 --- a/lib/gpu/README +++ b/lib/gpu/README @@ -1,21 +1,112 @@ -------------------------------- LAMMPS ACCELERATOR LIBRARY -------------------------------- - + W. Michael Brown (ORNL) Trung Dac Nguyen (ORNL/Northwestern) - Peng Wang (NVIDIA) + Nitin Dhamankar (Intel) Axel Kohlmeyer (Temple) + Peng Wang (NVIDIA) + Anders Hafreager (UiO) + V. Nikolskiy (HSE) + Maurice de Koning (Unicamp/Brazil) + Rodolfo Paula Leite (Unicamp/Brazil) Steve Plimpton (SNL) Inderaj Bains (NVIDIA) -------------------------------------------------------------------- -This directory has source files to build a library that LAMMPS -links against when using the GPU package. 
+------------------------------------------------------------------------------ -This library must be built with a C++ compiler, before LAMMPS is -built, so LAMMPS can link against it. +This directory has source files to build a library that LAMMPS links against +when using the GPU package. + +This library must be built with a C++ compiler along with CUDA, HIP, or OpenCL +before LAMMPS is built, so LAMMPS can link against it. + +This library, libgpu.a, provides routines for acceleration of certain +LAMMPS styles and neighbor list builds using CUDA, OpenCL, or ROCm HIP. + +Pair styles supported by this library are marked in the list of Pair style +potentials with a "g". See the online version at: + +https://lammps.sandia.gov/doc/Commands_pair.html + +In addition, the (plain) pppm kspace style is supported as well. + +------------------------------------------------------------------------------ + DEVICE QUERY +------------------------------------------------------------------------------ +The gpu library includes binaries to check for available GPUs and their +properties. It is a good idea to run these on first use to make sure the +system and build are set up properly. Additionally, the GPU numbering for +specific selection of devices should be taken from this output. The GPU +library may split some accelerators into separate virtual accelerators for +efficient use with MPI. + +After building the GPU library, for OpenCL: + ./ocl_get_devices +for CUDA: + ./nvc_get_devices +and for ROCm HIP: + ./hip_get_devices + +------------------------------------------------------------------------------ + QUICK START +------------------------------------------------------------------------------ +OpenCL: Mac without MPI: + make -f Makefile.mac_opencl -j; cd ../../src/; make mpi-stubs + make g++_serial -j + ./lmp_g++_serial -in ../bench/in.lj -log none -sf gpu + +OpenCL: Mac with MPI: + make -f Makefile.mac_opencl_mpi -j; cd ../../src/; make g++_openmpi -j + mpirun -np $NUM_MPI ./lmp_g++_openmpi -in ../bench/in.lj -log none -sf gpu + +OpenCL: Linux with Intel oneAPI: + make -f Makefile.oneapi -j; cd ../../src; make oneapi -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_oneapi -in ../bench/in.lj -log none -sf gpu + +OpenCL: Linux with MPI: + make -f Makefile.linux_opencl -j; cd ../../src; make omp -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + +NVIDIA CUDA: + make -f Makefile.cuda_mps -j; cd ../../src; make omp -j + export CUDA_MPS_LOG_DIRECTORY=/tmp; export CUDA_MPS_PIPE_DIRECTORY=/tmp + nvidia-smi -i 0 -c EXCLUSIVE_PROCESS + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + echo quit | /usr/bin/nvidia-cuda-mps-control + +AMD HIP: + make -f Makefile.hip -j; cd ../../src; make omp -j + export OMP_NUM_THREADS=$NUM_THREADS + mpirun -np $NUM_MPI ./lmp_omp -in ../bench/in.lj -log none -sf gpu + +------------------------------------------------------------------------------ + Installing oneAPI, OpenCL, CUDA, or ROCm +------------------------------------------------------------------------------ +The easiest approach is to use the Linux package manager to perform the +installation from the Intel, NVIDIA, etc. repositories. All are available for +free. The oneAPI installation includes Intel optimized MPI and C++ compilers, +along with many libraries. Alternatively, Intel OpenCL can also be installed +separately from the Intel repository.
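As a sketch of such a package-manager installation (assuming an Ubuntu-like system with the vendor repositories already configured; the package names below are assumptions that vary by distribution and repository):

   sudo apt install intel-hpckit         # Intel oneAPI compilers, MPI, OpenCL
   sudo apt install nvidia-cuda-toolkit  # CUDA toolkit (the SDK is not needed)
   sudo apt install ocl-icd-opencl-dev   # generic OpenCL loader and headers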
+ +NOTE: Installation of the CUDA SDK is not required, only the CUDA toolkit. + +See: + +https://software.intel.com/content/www/us/en/develop/tools/oneapi/hpc-toolkit.html + +https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html + +https://github.com/RadeonOpenCompute/ROCm + +------------------------------------------------------------------------------ + Build Intro +------------------------------------------------------------------------------ You can type "make lib-gpu" from the src directory to see help on how to build this library via make commands, or you can do the same thing @@ -25,13 +116,13 @@ do it manually by following the instructions below. Build the library using one of the provided Makefile.* files or create your own, specific to your compiler and system. For example: -make -f Makefile.linux +make -f Makefile.linux_opencl When you are done building this library, two files should exist in this directory: -libgpu.a the library LAMMPS will link against -Makefile.lammps settings the LAMMPS Makefile will import +libgpu.a the library LAMMPS will link against +Makefile.lammps settings the LAMMPS Makefile will import Makefile.lammps is created by the make command, by copying one of the Makefile.lammps.* files. See the EXTRAMAKE setting at the top of the @@ -45,77 +136,52 @@ IMPORTANT: If you re-build the library, e.g. for a different precision Makefile.linux clean, to insure all previous derived files are removed before the new build is done. -Makefile.lammps has settings for 3 variables: - -user-gpu_SYSINC = leave blank for this package -user-gpu_SYSLIB = CUDA libraries needed by this package -user-gpu_SYSPATH = path(s) to where those libraries are - -Because you have the CUDA compilers on your system, you should have -the needed libraries. If the CUDA development tools were installed -in the standard manner, the settings in the Makefile.lammps.standard -file should work. - -------------------------------------------------------------------- - - GENERAL NOTES - -------------------------------- - -This library, libgpu.a, provides routines for GPU acceleration -of certain LAMMPS styles and neighbor list builds. Compilation of this -library requires installing the CUDA GPU driver and CUDA toolkit for -your operating system. Installation of the CUDA SDK is not necessary. -In addition to the LAMMPS library, the binary nvc_get_devices will also -be built. This can be used to query the names and properties of GPU -devices on your system. A Makefile for OpenCL and ROCm HIP compilation -is provided, but support for it is not currently provided by the developers. -Details of the implementation are provided in: - ----- - -Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing -Molecular Dynamics on Hybrid High Performance Computers - Short Range -Forces. Computer Physics Communications. 2011. 182: p. 898-911. - -and - -Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing -Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle -Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. - -and - -Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High -Performance Computers - Three-Body Potentials. Computer Physics Communications. -2013. 184: p. 2785–2793. - ----- - -NOTE: Installation of the CUDA SDK is not required, only the CUDA -toolkit itself or an OpenCL 1.2 compatible header and library. 
- -Pair styles supporting GPU acceleration this this library -are marked in the list of Pair style potentials with a "g". -See the online version at: https://lammps.sandia.gov/doc/Commands_pair.html - -In addition the (plain) pppm kspace style is supported as well. +NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, + or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in + src/MAKE/Makefile.foo) should be consistent with that specified + when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar). - MULTIPLE LAMMPS PROCESSES - -------------------------------- - -Multiple LAMMPS MPI processes can share GPUs on the system, but multiple -GPUs cannot be utilized by a single MPI process. In many cases, the -best performance will be obtained by running as many MPI processes as -CPU cores available with the condition that the number of MPI processes -is an integer multiple of the number of GPUs being used. See the -LAMMPS user manual for details on running with GPU acceleration. +------------------------------------------------------------------------------ + PRECISION MODES +------------------------------------------------------------------------------ +The GPU library supports 3 precision modes: single, double, and mixed, with +the latter being the default for most Makefiles aside from Mac-specific +Makefiles due to the more restrictive nature of Apple's OpenCL for some +devices. + +To specify the precision mode (output to the screen before LAMMPS runs for +verification), set either CUDA_PRECISION, OCL_PREC, or HIP_PRECISION to one +of -D_SINGLE_SINGLE, -D_DOUBLE_DOUBLE, or -D_SINGLE_DOUBLE. + +Some accelerators or OpenCL implementations only support single precision. +This mode should be used with care and appropriate validation as the errors +can scale with system size in this implementation. This can be useful for +accelerating test runs when setting up a simulation for production runs on +another machine. In the case where only single precision is supported, either +LAMMPS must be compiled with -DFFT_SINGLE to use PPPM with GPU acceleration +or GPU acceleration should be disabled for PPPM (e.g. suffix off or pair/only +as described in the LAMMPS documentation). - BUILDING AND PRECISION MODES - -------------------------------- +------------------------------------------------------------------------------ + CUDA BUILD NOTES +------------------------------------------------------------------------------ +NOTE: when compiling with CMake, all of the considerations listed below +are handled within the CMake configuration process, so no separate +compilation of the gpu library is required. Also, this will build in support +for all compute architectures that are supported by the CUDA toolkit version +used to build the gpu library. -To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of +If you do not want to use a fat binary that supports multiple CUDA +architectures, CUDA_ARCH must be set to match the GPU architecture. This +is reported by the nvc_get_devices executable created by the build process, and +a detailed list of GPU architectures and CUDA compatible GPUs can be found +e.g. here: https://en.wikipedia.org/wiki/CUDA#GPUs_supported + +The CUDA_HOME variable should be set to the location of the CUDA toolkit. + +To build, edit the CUDA_ARCH, CUDA_PRECISION, CUDA_HOME variables in one of the Makefiles. CUDA_ARCH should be set based on the compute capability of your GPU.
This can be verified by running the nvc_get_devices executable after the build is complete. Additionally, the GPU package must be installed and @@ -123,82 +189,93 @@ compiled for LAMMPS. This may require editing the gpu_SYSPATH variable in the LAMMPS makefile. Please note that the GPU library accesses the CUDA driver library directly, -so it needs to be linked not only to the CUDA runtime library (libcudart.so) -that ships with the CUDA toolkit, but also with the CUDA driver library -(libcuda.so) that ships with the Nvidia driver. If you are compiling LAMMPS -on the head node of a GPU cluster, this library may not be installed, -so you may need to copy it over from one of the compute nodes (best into -this directory). Recent CUDA toolkits starting from CUDA 9 provide a dummy -libcuda.so library (typically under $(CUDA_HOME)/lib64/stubs), that can be used for -linking. +so it needs to be linked with the CUDA driver library (libcuda.so) that ships +with the Nvidia driver. If you are compiling LAMMPS on the head node of a GPU +cluster, this library may not be installed, so you may need to copy it over +from one of the compute nodes (best into this directory). Recent CUDA toolkits +starting from CUDA 9 provide a dummy libcuda.so library (typically under +$(CUDA_HOME)/lib64/stubs) that can be used for linking. -The gpu library supports 3 precision modes as determined by -the CUDA_PRECISION variable: +Best performance with the GPU library is typically with multiple MPI processes +sharing the same GPU cards. For NVIDIA, this is most efficient with CUDA +MPS enabled. To prevent runtime errors for GPUs configured in exclusive process +mode with MPS, the GPU library should be built with either of the equivalent +-DCUDA_MPS_SUPPORT or -DCUDA_PROXY flags. - CUDA_PRECISION = -D_SINGLE_SINGLE # Single precision for all calculations - CUDA_PRECISION = -D_DOUBLE_DOUBLE # Double precision for all calculations - CUDA_PRECISION = -D_SINGLE_DOUBLE # Accumulation of forces, etc. in double +------------------------------------------------------------------------------ + HIP BUILD NOTES +------------------------------------------------------------------------------ -As of CUDA 7.5 only GPUs with compute capability 2.0 (Fermi) or newer are -supported and as of CUDA 9.0 only compute capability 3.0 (Kepler) or newer -are supported. There are some limitations of this library for GPUs older -than that, which require additional preprocessor flag, and limit features, -but they are kept for historical reasons. There is no value in trying to -use those GPUs for production calculations. - -You have to make sure that you set a CUDA_ARCH line suitable for your -hardware and CUDA toolkit version: e.g. -arch=sm_35 for Tesla K20 or K40 -or -arch=sm_52 GeForce GTX Titan X. A detailed list of GPU architectures -and CUDA compatible GPUs can be found e.g. here: -https://en.wikipedia.org/wiki/CUDA#GPUs_supported - -NOTE: when compiling with CMake, all of the considerations listed below -are considered within the CMake configuration process, so no separate -compilation of the gpu library is required. Also this will build in support -for all compute architecture that are supported by the CUDA toolkit version -used to build the gpu library. - -Please note the CUDA_CODE settings in Makefile.linux_multi, which allows -to compile this library with support for multiple GPUs.
This list can be -extended for newer GPUs with newer CUDA toolkits and should allow to build -a single GPU library compatible with all GPUs that are worth using for -GPU acceleration and supported by the current CUDA toolkits and drivers. - -NOTE: The system-specific setting LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, - or LAMMPS_SMALLSMALL if specified when building LAMMPS (i.e. in - src/MAKE/Makefile.foo) should be consistent with that specified - when building libgpu.a (i.e. by LMP_INC in the lib/gpu/Makefile.bar). - - BUILDING FOR HIP FRAMEWORK - -------------------------------- -1. Install the latest ROCm framework (https://github.com/RadeonOpenCompute/ROCm). -2. GPU sorting requires installing hipcub +1. GPU sorting requires installing hipcub (https://github.com/ROCmSoftwarePlatform/hipCUB). The HIP CUDA-backend additionally requires cub (https://nvlabs.github.io/cub). Download and extract the cub directory to lammps/lib/gpu/ or specify an appropriate path in lammps/lib/gpu/Makefile.hip. -3. In Makefile.hip it is possible to specify the target platform via -export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target +2. In Makefile.hip it is possible to specify the target platform via +export HIP_PLATFORM=hcc or HIP_PLATFORM=nvcc as well as the target architecture (gfx803, gfx900, gfx906 etc.) -4. If your MPI implementation does not support `mpicxx --showme` command, +3. If your MPI implementation does not support `mpicxx --showme` command, it is required to specify the corresponding MPI compiler and linker flags in lammps/lib/gpu/Makefile.hip and in lammps/src/MAKE/OPTIONS/Makefile.hip. -5. Building the GPU library (libgpu.a): - cd lammps/lib/gpu; make -f Makefile.hip -j -6. Building the LAMMPS executable (lmp_hip): - cd ../../src; make hip -j - EXAMPLE CONVENTIONAL BUILD PROCESS - -------------------------------- - -cd ~/lammps/lib/gpu -emacs Makefile.linux -make -f Makefile.linux -./nvc_get_devices -cd ../../src -emacs ./MAKE/Makefile.linux -make yes-asphere -make yes-kspace -make yes-gpu -make linux +------------------------------------------------------------------------------ + OPENCL BUILD NOTES +------------------------------------------------------------------------------ +If GERYON_NUMA_FISSION is defined at build time, LAMMPS will consider separate +NUMA nodes on GPUs or accelerators as separate devices. For example, a 2-socket +CPU would appear as two separate devices for OpenCL (and LAMMPS would require +two MPI processes to use both sockets with the GPU library - each with its +own device ID as output by ocl_get_devices). + +For a debug build, use "-DUCL_DEBUG -DGERYON_KERNEL_DUMP" and remove +"-DUCL_NO_EXIT" and "-DMPI_GERYON" from the build options. 
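A sketch of such a debug build with Makefile.linux_opencl (overriding OCL_TUNE on the make command line rather than editing the Makefile is just one workflow; command-line variables take precedence over Makefile assignments):

   make -f Makefile.linux_opencl clean
   make -f Makefile.linux_opencl -j \
       OCL_TUNE="-fopenmp -DGERYON_NUMA_FISSION -DUCL_DEBUG -DGERYON_KERNEL_DUMP"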
+ +------------------------------------------------------------------------------ + ALL PREPROCESSOR OPTIONS (For Advanced Users) +------------------------------------------------------------------------------ +_SINGLE_SINGLE Build library for single precision mode +_SINGLE_DOUBLE Build library for mixed precision mode +_DOUBLE_DOUBLE Build library for double precision mode +CUDA_MPS_SUPPORT Do not generate errors for exclusive mode for CUDA +CUDA_PROXY Same as above +MPI_GERYON Library should use MPI_Abort for unhandled errors +GERYON_NUMA_FISSION Accelerators with main memory NUMA are split into + multiple virtual accelerators for each NUMA node +LAL_USE_OMP=0 Disable OpenMP in lib, regardless of compiler setting +LAL_USE_OMP_SIMD=0 Disable OpenMP SIMD in lib, regardless of compiler setting +GERYON_OCL_FLUSH For OpenCL, flush queue after every enqueue +LAL_NO_OCL_EV_JIT Turn off JIT specialization for kernels in OpenCL +LAL_USE_OLD_NEIGHBOR Use old neighbor list algorithm +USE_CUDPP Enable GPU binning in neighbor builds (not recommended) +USE_HIP_DEVICE_SORT Enable GPU binning for HIP builds + (only w/ LAL_USE_OLD_NEIGHBOR) +LAL_NO_BLOCK_REDUCE Use host for energy/virial accumulation +LAL_OCL_EXTRA_ARGS Supply extra args for OpenCL compiler delimited with : +UCL_NO_EXIT LAMMPS should handle errors instead of Geryon lib +UCL_DEBUG Debug build for Geryon +GERYON_KERNEL_DUMP Dump all compiled OpenCL programs with compiler + flags and build logs +GPU_CAST Casting performed on GPU, untested recently +THREE_CONCURRENT Concurrent 3-body calcs in separate queues, untested + + +------------------------------------------------------------------------------ + References for Details +------------------------------------------------------------------------------ + +Brown, W.M., Wang, P. Plimpton, S.J., Tharrington, A.N. Implementing +Molecular Dynamics on Hybrid High Performance Computers - Short Range +Forces. Computer Physics Communications. 2011. 182: p. 898-911. + +and + +Brown, W.M., Kohlmeyer, A. Plimpton, S.J., Tharrington, A.N. Implementing +Molecular Dynamics on Hybrid High Performance Computers - Particle-Particle +Particle-Mesh. Computer Physics Communications. 2012. 183: p. 449-459. + +and + +Brown, W.M., Masako, Y. Implementing Molecular Dynamics on Hybrid High +Performance Computers - Three-Body Potentials. Computer Physics Communications. +2013. 184: p. 2785–2793.
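To illustrate how the options above enter a build (the particular flag combination is illustrative only), extra defines can simply be appended to the tuning flags, e.g. to disable OpenMP inside the library and flush the OpenCL queue after every enqueue:

   make -f Makefile.linux_opencl -j \
       OCL_TUNE="-DMPI_GERYON -DGERYON_NUMA_FISSION -DUCL_NO_EXIT -DLAL_USE_OMP=0 -DGERYON_OCL_FLUSH"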
diff --git a/lib/gpu/geryon/hip_device.h b/lib/gpu/geryon/hip_device.h index d2fb1919b7..373b3783b0 100644 --- a/lib/gpu/geryon/hip_device.h +++ b/lib/gpu/geryon/hip_device.h @@ -24,6 +24,8 @@ namespace ucl_hip { // -------------------------------------------------------------------------- typedef hipStream_t command_queue; +inline void ucl_flush(command_queue &cq) {} + inline void ucl_sync(hipStream_t &stream) { CU_SAFE_CALL(hipStreamSynchronize(stream)); } @@ -143,15 +145,26 @@ class UCL_Device { inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i) { return UCL_GPU; } + inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -215,7 +228,19 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].maxThreadsPerBlock; } - + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].maxThreadsDim[dim];} + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].sharedMemPerBlock; } + + /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } /// Return the maximum memory pitch in bytes @@ -255,11 +280,20 @@ class UCL_Device { inline int max_sub_devices(const int i) { return 0; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return arch(i)>=3.0; } + /// List all devices along with all properties inline void print_all(std::ostream &out); - /// Select the platform that has accelerators (for compatibility with OpenCL) - inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + /// For compatibility with OCL API + inline int auto_set_platform(const enum UCL_DEVICE_TYPE
type=UCL_GPU, + const std::string vendor="") + { return set_platform(0); } inline int load_module(const void* program, hipModule_t& module, std::string *log=nullptr){ auto it = _loaded_modules.emplace(program, hipModule_t()); diff --git a/lib/gpu/geryon/hip_kernel.h b/lib/gpu/geryon/hip_kernel.h index c5014b52e7..10bc9f1334 100644 --- a/lib/gpu/geryon/hip_kernel.h +++ b/lib/gpu/geryon/hip_kernel.h @@ -14,6 +14,7 @@ #include #include #include +#include <cstdio> namespace ucl_hip { @@ -64,7 +65,7 @@ class UCL_Program { } /// Load a program from a string and compile with flags - inline int load_string(const void *program, const char *flags="", std::string *log=nullptr) { + inline int load_string(const void *program, const char *flags="", std::string *log=nullptr, FILE* foutput=nullptr) { return _device_ptr->load_module(program, _module, log); } @@ -73,6 +74,7 @@ class UCL_Program { hipModule_t _module; hipStream_t _cq; friend class UCL_Texture; + friend class UCL_Const; }; /// Class for dealing with CUDA Driver kernels diff --git a/lib/gpu/geryon/hip_texture.h b/lib/gpu/geryon/hip_texture.h index ae16bee900..9117adc879 100644 --- a/lib/gpu/geryon/hip_texture.h +++ b/lib/gpu/geryon/hip_texture.h @@ -107,6 +107,37 @@ class UCL_Texture { } }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() {} + ~UCL_Const() {} + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + _cq=prog.cq(); + CU_SAFE_CALL(hipModuleGetGlobal(&_global, &_global_bytes, prog._module, + global_name)); + } + /// Copy from array on host to const memory + template <class numtyp> + inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) { + CU_SAFE_CALL(hipMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp), + _cq)); + } + /// Get device ptr associated with object + inline const void* begin() const { return &_global; } + inline void clear() {} + + private: + hipStream_t _cq; + void* _global; + size_t _global_bytes; + friend class UCL_Kernel; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/nvd_device.h b/lib/gpu/geryon/nvd_device.h index 42f176bcbf..52b2ed478e 100644 --- a/lib/gpu/geryon/nvd_device.h +++ b/lib/gpu/geryon/nvd_device.h @@ -37,6 +37,8 @@ namespace ucl_cudadr { // -------------------------------------------------------------------------- typedef CUstream command_queue; +inline void ucl_flush(command_queue &cq) {} + inline void ucl_sync(CUstream &stream) { CU_SAFE_CALL(cuStreamSynchronize(stream)); } @@ -156,15 +158,26 @@ class UCL_Device { inline std::string device_type_name(const int i) { return "GPU"; } /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i) { return UCL_GPU; } + inline enum UCL_DEVICE_TYPE device_type(const int i) { return UCL_GPU; } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) { return device_type(i)==UCL_CPU; } + /// Returns preferred vector width + inline int preferred_fp32_width() { return
preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].SIMDWidth;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -228,6 +241,18 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return _properties[i].maxThreadsPerBlock; } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].maxThreadsDim[dim]; } + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].sharedMemPerBlock; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } @@ -268,11 +293,22 @@ class UCL_Device { inline int max_sub_devices(const int i) { return 0; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return arch(i)>=3.0; } + /// List all devices along with all properties inline void print_all(std::ostream &out); - /// Select the platform that has accelerators (for compatibility with OpenCL) - inline int set_platform_accelerator(int pid=-1) { return UCL_SUCCESS; } + /// For compatibility with OCL API + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="", + const int ndevices=-1, + const int first_device=-1) + { return set_platform(0); } private: int _device, _num_devices; diff --git a/lib/gpu/geryon/nvd_kernel.h b/lib/gpu/geryon/nvd_kernel.h index d74b0e2dc1..c31b8cdf9b 100644 --- a/lib/gpu/geryon/nvd_kernel.h +++ b/lib/gpu/geryon/nvd_kernel.h @@ -26,6 +26,7 @@ #include "nvd_device.h" #include +#include <cstdio> namespace ucl_cudadr { @@ -77,7 +78,7 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr) { if (std::string(flags)=="BINARY") return load_binary((const char *)program); const unsigned int num_opts=2; @@ -100,12 +101,25 @@ if (err != CUDA_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << std::endl + std::cerr << std::endl << std::endl << "----------------------------------------------------------\n" << " UCL Error: Error compiling PTX Program...\n" << "----------------------------------------------------------\n"; - std::cerr << log << std::endl; + std::cerr << log << std::endl + << "----------------------------------------------------------\n\n"; #endif + if (foutput != NULL) { + fprintf(foutput,"\n\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); +
fprintf(foutput," UCL Error: Error compiling PTX Program...\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"%s\n",log); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"\n\n"); + } return UCL_COMPILE_ERROR; } @@ -139,11 +153,15 @@ return UCL_SUCCESS; } + /// Return the default command queue/stream associated with this data + inline command_queue & cq() { return _cq; } + friend class UCL_Kernel; private: CUmodule _module; CUstream _cq; friend class UCL_Texture; + friend class UCL_Const; }; /// Class for dealing with CUDA Driver kernels diff --git a/lib/gpu/geryon/nvd_texture.h b/lib/gpu/geryon/nvd_texture.h index c766af826c..65f4ad9ef5 100644 --- a/lib/gpu/geryon/nvd_texture.h +++ b/lib/gpu/geryon/nvd_texture.h @@ -38,8 +38,11 @@ class UCL_Texture { inline UCL_Texture(UCL_Program &prog, const char *texture_name) { get_texture(prog,texture_name); } /// Set the texture reference for this object - inline void get_texture(UCL_Program &prog, const char *texture_name) - { CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); } + inline void get_texture(UCL_Program &prog, const char *texture_name) { + #if (CUDA_VERSION < 11000) + CU_SAFE_CALL(cuModuleGetTexRef(&_tex, prog._module, texture_name)); + #endif + } /// Bind a float array where each fetch grabs a vector of length numel template @@ -72,11 +75,14 @@ } private: + #if (CUDA_VERSION < 11000) CUtexref _tex; + #endif friend class UCL_Kernel; template inline void _bind_float(mat_typ &vec, const unsigned numel) { + #if (CUDA_VERSION < 11000) #ifdef UCL_DEBUG assert(numel!=0 && numel<5); #endif @@ -90,10 +96,42 @@ else CU_SAFE_CALL(cuTexRefSetFormat(_tex,CU_AD_FORMAT_SIGNED_INT32,numel*2)); } + #endif } }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() {} + ~UCL_Const() {} + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + _cq=prog.cq(); + CU_SAFE_CALL(cuModuleGetGlobal(&_global, &_global_bytes, prog._module, + global_name)); + } + /// Copy from array on host to const memory + template <class numtyp> + inline void update_device(UCL_H_Vec<numtyp> &src, const int numel) { + CU_SAFE_CALL(cuMemcpyHtoDAsync(_global, src.begin(), numel*sizeof(numtyp), + _cq)); + } + /// Get device ptr associated with object + inline const CUdeviceptr * begin() const { return &_global; } + inline void clear() {} + + private: + CUstream _cq; + CUdeviceptr _global; + size_t _global_bytes; + friend class UCL_Kernel; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/ocl_device.h b/lib/gpu/geryon/ocl_device.h index de4def0bc1..435ee24dd3 100644 --- a/lib/gpu/geryon/ocl_device.h +++ b/lib/gpu/geryon/ocl_device.h @@ -28,12 +28,8 @@ #include #include -/* We default to OpenCL 1.2 as target version for now as * there are known issues with OpenCL 2.0 and later.
- * This is also to silence warnings from generic OpenCL headers */ - -#if !defined(CL_TARGET_OPENCL_VERSION) -#define CL_TARGET_OPENCL_VERSION 120 +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 210 #endif #ifdef __APPLE__ @@ -55,17 +51,36 @@ namespace ucl_opencl { typedef cl_command_queue command_queue; typedef cl_context context_type; +inline void ucl_flush(command_queue &cq) { CL_SAFE_CALL(clFlush(cq)); } + inline void ucl_sync(cl_command_queue &cq) { CL_SAFE_CALL(clFinish(cq)); } -inline bool _shared_mem_device(cl_device_type &device_type) { +#if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON) +inline bool _shared_mem_device(cl_device_id &device) { return true; } +#elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF) +inline bool _shared_mem_device(cl_device_id &device) { return false; } +#else +inline bool _shared_mem_device(cl_device_id &device) { + #ifdef CL_VERSION_1_2 + cl_bool br; + CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY, + sizeof(cl_bool), &br,NULL)); + return (br == CL_TRUE); + #else + cl_device_type device_type; + CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, + sizeof(device_type),&device_type,NULL)); return (device_type==CL_DEVICE_TYPE_CPU); + #endif } +#endif struct OCLProperties { std::string name; cl_device_type device_type; + bool is_subdevice; cl_ulong global_mem; cl_ulong shared_mem; cl_ulong const_mem; @@ -74,12 +89,16 @@ struct OCLProperties { size_t work_group_size; size_t work_item_size[3]; bool double_precision; + int preferred_vector_width32, preferred_vector_width64; int alignment; size_t timer_resolution; bool ecc_support; std::string c_version; bool partition_equal, partition_counts, partition_affinity; cl_uint max_sub_devices; + int cl_device_version; + bool has_subgroup_support; + bool has_shuffle_support; }; /// Class for looking at data parallel device properties @@ -182,16 +201,27 @@ class UCL_Device { inline std::string device_type_name(const int i); /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type() { return device_type(_device); } + inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); } /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT) - inline int device_type(const int i); + inline enum UCL_DEVICE_TYPE device_type(const int i); /// Returns true if host memory is efficiently addressable from device inline bool shared_memory() { return shared_memory(_device); } /// Returns true if host memory is efficiently addressable from device inline bool shared_memory(const int i) - { return _shared_mem_device(_properties[i].device_type); } + { return _shared_mem_device(_cl_devices[i]); } + /// Returns preferred vector width + inline int preferred_fp32_width() { return preferred_fp32_width(_device); } + /// Returns preferred vector width + inline int preferred_fp32_width(const int i) + {return _properties[i].preferred_vector_width32;} + /// Returns preferred vector width + inline int preferred_fp64_width() { return preferred_fp64_width(_device); } + /// Returns preferred vector width + inline int preferred_fp64_width(const int i) + {return _properties[i].preferred_vector_width64;} + /// Returns true if double precision is support for the current device inline bool double_precision() { return double_precision(_device); } /// Returns true if double precision is support for the device @@ -242,6 +272,18 @@ class UCL_Device { /// Get the maximum number of threads per block inline size_t group_size(const int i) { return 
_properties[i].work_group_size; } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int dim) + { return group_size_dim(_device, dim); } + /// Get the maximum number of threads per block in dimension 'dim' + inline size_t group_size_dim(const int i, const int dim) + { return _properties[i].work_item_size[dim]; } + + /// Get the shared local memory size in bytes + inline size_t slm_size() { return slm_size(_device); } + /// Get the shared local memory size in bytes + inline size_t slm_size(const int i) + { return _properties[i].shared_mem; } /// Return the maximum memory pitch in bytes for current device inline size_t max_pitch() { return max_pitch(_device); } @@ -256,6 +298,12 @@ inline bool sharing_supported(const int i) { return true; } + /// True if the device is a sub-device + inline bool is_subdevice() + { return is_subdevice(_device); } + /// True if the device is a sub-device + inline bool is_subdevice(const int i) + { return _properties[i].is_subdevice; } /// True if splitting device into equal subdevices supported inline bool fission_equal() { return fission_equal(_device); } @@ -274,6 +322,18 @@ /// True if splitting device into subdevices by affinity domains supported inline bool fission_by_affinity(const int i) { return _properties[i].partition_affinity; } + /// True if the device has subgroup support + inline bool has_subgroup_support() + { return has_subgroup_support(_device); } + /// True if the device has subgroup support + inline bool has_subgroup_support(const int i) + { return _properties[i].has_subgroup_support; } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support() + { return has_shuffle_support(_device); } + /// True if the device supports shuffle intrinsics + inline bool has_shuffle_support(const int i) + { return _properties[i].has_shuffle_support; } /// Maximum number of subdevices allowed from device fission inline int max_sub_devices() @@ -281,6 +341,12 @@ /// Maximum number of subdevices allowed from device fission inline int max_sub_devices(const int i) { return _properties[i].max_sub_devices; } + /// OpenCL version supported by the device + inline int cl_device_version() + { return cl_device_version(_device); } + /// OpenCL version supported by the device + inline int cl_device_version(const int i) + { return _properties[i].cl_device_version; } /// List all devices along with all properties inline void print_all(std::ostream &out); @@ -288,8 +354,14 @@ /// Return the OpenCL type for the device inline cl_device_id & cl_device() { return _cl_device; } - /// Select the platform that has accelerators - inline int set_platform_accelerator(int pid=-1); + /// Automatically set the platform by type, vendor, and/or CU count + /** If first_device is positive, search is restricted to platforms containing + * this device ID.
If ndevices is positive, search is restricted + * to platforms with at least that many devices **/ + inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU, + const std::string vendor="", + const int ndevices=-1, + const int first_device=-1); private: int _num_platforms; // Number of platforms @@ -322,8 +394,7 @@ UCL_Device::UCL_Device() { return; } else _num_platforms=static_cast(nplatforms); - // note that platform 0 may not necessarily be associated with accelerators - set_platform_accelerator(); + set_platform(0); } UCL_Device::~UCL_Device() { @@ -332,6 +403,14 @@ UCL_Device::~UCL_Device() { void UCL_Device::clear() { _properties.clear(); + + #ifdef GERYON_NUMA_FISSION + #ifdef CL_VERSION_1_2 + for (int i=0; i<_cl_devices.size(); i++) + CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i])); + #endif + #endif + _cl_devices.clear(); if (_device>-1) { for (size_t i=0; i<_cq.size(); i++) { @@ -341,6 +420,7 @@ void UCL_Device::clear() { CL_DESTRUCT_CALL(clReleaseContext(_context)); } _device=-1; + _num_devices=0; } int UCL_Device::set_platform(int pid) { @@ -370,11 +450,51 @@ int UCL_Device::set_platform(int pid) { CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list, &n)); + #ifndef GERYON_NUMA_FISSION // --- Store properties for each device for (int i=0; i<_num_devices; i++) { _cl_devices.push_back(device_list[i]); add_properties(device_list[i]); } + #else + // --- Create sub-devices for anything partitionable by NUMA and store props + int num_unpart = _num_devices; + _num_devices = 0; + for (int i=0; i 1) { + subdevice_list = new cl_device_id[num_subdevices]; + CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices, + subdevice_list, &num_subdevices)); + } + #endif + + for (int j=0; j 1) delete[] subdevice_list; + } // for i + #endif + delete[] device_list; return UCL_SUCCESS; } @@ -429,11 +549,18 @@ void UCL_Device::add_properties(cl_device_id device_list) { sizeof(cl_uint),&op.alignment,nullptr)); op.alignment/=8; + cl_uint float_width; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + sizeof(float_width),&float_width,nullptr)); + op.preferred_vector_width32=float_width; + // Determine if double precision is supported cl_uint double_width; CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, sizeof(double_width),&double_width,nullptr)); + op.preferred_vector_width64=double_width; if (double_width==0) op.double_precision=false; else @@ -452,9 +579,14 @@ void UCL_Device::add_properties(cl_device_id device_list) { op.ecc_support=true; op.c_version=""; + op.is_subdevice=false; op.partition_equal=false; op.partition_counts=false; op.partition_affinity=false; + op.max_sub_devices=1; + op.cl_device_version=0; + op.has_subgroup_support=false; + op.has_shuffle_support=false; #ifdef CL_VERSION_1_2 size_t return_bytes; @@ -463,6 +595,13 @@ void UCL_Device::add_properties(cl_device_id device_list) { op.c_version=buffer; cl_device_partition_property pinfo[4]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE, + 4*sizeof(cl_device_partition_property), + &pinfo, &return_bytes)); + if (return_bytes == 0) op.is_subdevice=false; + else if (pinfo[0]) op.is_subdevice=true; + else op.is_subdevice=false; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_PROPERTIES, 4*sizeof(cl_device_partition_property), @@ -480,6 +619,46 @@ void UCL_Device::add_properties(cl_device_id device_list) { CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_MAX_SUB_DEVICES, 
sizeof(cl_uint),&op.max_sub_devices,nullptr)); + + CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr)); + int cl_version_maj = buffer[7] - '0'; + int cl_version_min = buffer[9] - '0'; + op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10; + + size_t ext_str_size_ret; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr, + &ext_str_size_ret)); + char buffer2[ext_str_size_ret]; + CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, + ext_str_size_ret, buffer2, nullptr)); + #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0) + if (op.cl_device_version >= 210) { + if ((std::string(buffer2).find("cl_khr_subgroups") != std::string::npos) || + (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos)) + op.has_subgroup_support=true; + if (std::string(buffer2).find("cl_intel_subgroups") != std::string::npos) + op.has_shuffle_support=true; + } + #endif + if (std::string(buffer2).find("cl_nv_device_attribute_query") != + std::string::npos) { + #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV + #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000 + #endif + #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV + #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001 + #endif + cl_uint major, minor; + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV, + sizeof(cl_uint), &major, nullptr)); + CL_SAFE_CALL(clGetDeviceInfo(device_list, + CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV, + sizeof(cl_uint), &minor, nullptr)); + double arch = static_cast<double>(minor)/10+major; + if (arch >= 3.0) + op.has_shuffle_support=true; + } #endif _properties.push_back(op); @@ -516,7 +695,7 @@ std::string UCL_Device::device_type_name(const int i) { } // Get a string telling the type of the device -int UCL_Device::device_type(const int i) { +enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) { if (_properties[i].device_type==CL_DEVICE_TYPE_CPU) return UCL_CPU; else if (_properties[i].device_type==CL_DEVICE_TYPE_GPU) return UCL_GPU; @@ -529,14 +708,8 @@ // Set the CUDA device to the specified device number int UCL_Device::set(int num) { - cl_device_id *device_list = new cl_device_id[_num_devices]; - cl_uint n; - CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,_num_devices, - device_list,&n)); _device=num; - _cl_device=device_list[_device]; - delete[] device_list; + _cl_device=_cl_devices[_device]; return create_context(); } @@ -555,6 +728,11 @@ void UCL_Device::print_all(std::ostream &out) { out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n"; out << " Type of device: " << device_type_name(i).c_str() << std::endl; + out << " Is a subdevice: "; + if (is_subdevice(i)) + out << "Yes\n"; + else + out << "No\n"; out << " Double precision support: "; if (double_precision(i)) out << "Yes\n"; @@ -613,33 +791,93 @@ void UCL_Device::print_all(std::ostream &out) { out << "No\n"; out << " Maximum subdevices from fission: " << max_sub_devices(i) << std::endl; + out << " Shared memory system: "; + if (shared_memory(i)) + out << "Yes\n"; + else + out << "No\n"; } } } -// Select the platform that is associated with accelerators -// if pid < 0, select the first platform -int UCL_Device::set_platform_accelerator(int pid) { - if (pid < 0) { - int found = 0; - for (int n=0; n<_num_platforms; n++) { - set_platform(n); - for (int i=0; i<num_devices(); i++) { - if (device_type(i)==UCL_GPU || device_type(i)==UCL_ACCELERATOR) { - found=1; - break; - } - } +int UCL_Device::auto_set_platform(const enum UCL_DEVICE_TYPE type, + const std::string vendor, + const int ndevices, + const int first_device) { + int last_device = -1; + if (first_device > -1) { + if (ndevices) + last_device = first_device + ndevices - 1; + else + last_device = first_device; + } + + bool vendor_match=false; + bool
type_match=false; + int max_cus=0; + int best_platform=0; + + std::string vendor_upper=vendor; + for (int i=0; i<vendor_upper.length(); i++) + if (vendor_upper[i]<='z' && vendor_upper[i]>='a') + vendor_upper[i]=toupper(vendor_upper[i]); + + for (int n=0; n<_num_platforms; n++) { + set_platform(n); + if (last_device > -1 && last_device >= num_devices()) continue; + if (ndevices > num_devices()) continue; + + int first_id=0; + int last_id=num_devices()-1; + if (last_device > -1) { + first_id=first_device; + last_id=last_device; + } + + if (vendor_upper!="") { + std::string pname = platform_name(); + for (int i=0; i<pname.length(); i++) + if (pname[i]<='z' && pname[i]>='a') + pname[i]=toupper(pname[i]); + + if (pname.find(vendor_upper)!=std::string::npos) { + if (vendor_match == false) { + best_platform=n; + max_cus=0; + vendor_match=true; + } + } else if (vendor_match) + continue; + } + + if (type != UCL_DEFAULT) { + bool ptype_matched=false; + for (int d=first_id; d<=last_id; d++) { + if (type==device_type(d)) { + if (type_match == false) { + best_platform=n; + max_cus=0; + type_match=true; + ptype_matched=true; + } + } + } + if (type_match==true && ptype_matched==false) + continue; + } + + for (int d=first_id; d<=last_id; d++) { + if (cus(d) > max_cus) { + best_platform=n; + max_cus=cus(d); } - if (found) return UCL_SUCCESS; } - return UCL_ERROR; - } else { - return set_platform(pid); } + return set_platform(best_platform); } -} // namespace ucl_opencl +} // namespace ucl_opencl #endif diff --git a/lib/gpu/geryon/ocl_kernel.h b/lib/gpu/geryon/ocl_kernel.h index 77593f4515..23f9baa09e 100644 --- a/lib/gpu/geryon/ocl_kernel.h +++ b/lib/gpu/geryon/ocl_kernel.h @@ -2,6 +2,7 @@ ocl_kernel.h ------------------- W. Michael Brown + Nitin Dhamankar (Intel) Utilities for dealing with OpenCL kernels @@ -26,6 +27,7 @@ #include "ocl_device.h" #include +#include <cstdio> namespace ucl_opencl { @@ -93,7 +95,7 @@ class UCL_Program { /// Load a program from a string and compile with flags inline int load_string(const void *program, const char *flags="", - std::string *log=nullptr) { + std::string *log=nullptr, FILE* foutput=nullptr) { cl_int error_flag; const char *prog=(const char *)program; _program=clCreateProgramWithSource(_context,1,&prog,nullptr,&error_flag); @@ -107,27 +109,66 @@ sizeof(cl_build_status),&build_status, nullptr)); - if (build_status != CL_SUCCESS || log!=nullptr) { + #ifdef GERYON_KERNEL_DUMP + { size_t ms; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,0, - nullptr, &ms)); + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + 0,NULL,&ms)); char *build_log = new char[ms]; - CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG,ms, - build_log, nullptr)); + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + ms,build_log, NULL)); + std::cout << std::endl << std::endl + << "--------------------------------------------------------\n" + << " UCL PROGRAM DUMP\n" + << "--------------------------------------------------------\n" + << flags << std::endl + << "--------------------------------------------------------\n" + << prog << std::endl + << "--------------------------------------------------------\n" + << build_log + << "--------------------------------------------------------\n" + << std::endl << std::endl; + } + #endif + + if (build_status != CL_SUCCESS || log!=NULL) { + size_t ms; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + 0,NULL,&ms)); + char *build_log = new char[ms]; + CL_SAFE_CALL(clGetProgramBuildInfo(_program,_device,CL_PROGRAM_BUILD_LOG, + ms,build_log, NULL)); if
(log!=nullptr) *log=std::string(build_log); if (build_status != CL_SUCCESS) { #ifndef UCL_NO_EXIT - std::cerr << std::endl - << "----------------------------------------------------------\n" - << " UCL Error: Error compiling OpenCL Program (" - << build_status << ") ...\n" - << "----------------------------------------------------------\n"; + std::cerr << std::endl << std::endl + << "----------------------------------------------------------\n" + << " UCL Error: Error compiling OpenCL Program (" + << build_status << ") ...\n" + << "----------------------------------------------------------\n"; std::cerr << build_log << std::endl; + std::cerr << + "----------------------------------------------------------\n" + << std::endl << std::endl; #endif - delete[] build_log; + if (foutput != NULL) { + fprintf(foutput,"\n\n"); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput, + " UCL Error: Error compiling OpenCL Program (%d) ...\n", + build_status); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"%s\n",build_log); + fprintf(foutput, + "----------------------------------------------------------\n"); + fprintf(foutput,"\n\n"); + } + delete[] build_log; return UCL_COMPILE_ERROR; } else delete[] build_log; } @@ -141,6 +182,7 @@ class UCL_Program { inline void cq(command_queue &cq_in) { _cq=cq_in; } friend class UCL_Kernel; + friend class UCL_Const; private: bool _init_done; cl_program _program; @@ -322,9 +364,45 @@ class UCL_Kernel { inline void cq(command_queue &cq_in) { _cq=cq_in; } #include "ucl_arg_kludge.h" + #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0) + inline size_t max_subgroup_size(const size_t block_size_x) { + size_t block_size = block_size_x; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + + inline size_t max_subgroup_size(const size_t block_size_x, + const size_t block_size_y) { + size_t block_size[2] { block_size_x, block_size_y }; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + + inline size_t max_subgroup_size(const size_t block_size_x, + const size_t block_size_y, + const size_t block_size_z) { + size_t block_size[3] { block_size_x, block_size_y, block_size_z }; + CL_SAFE_CALL(clGetKernelSubGroupInfo(_kernel, _device, + CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE, + sizeof(block_size), (void *) &block_size, + sizeof(size_t), (void *) &_mx_subgroup_sz, + NULL)); + return _mx_subgroup_sz; + } + #endif + private: cl_kernel _kernel; cl_program _program; + cl_device_id _device; cl_uint _dimensions; size_t _block_size[3]; size_t _num_blocks[3]; @@ -338,6 +416,11 @@ class UCL_Kernel { unsigned _kernel_info_nargs; //std::string _kernel_info_args[256]; #endif + + #ifdef CL_VERSION_2_1 + size_t _mx_subgroup_sz; // Maximum sub-group size for this kernel + #endif + }; inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) { @@ -347,6 +430,7 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) CL_SAFE_CALL(clRetainCommandQueue(_cq)); _program=program._program; CL_SAFE_CALL(clRetainProgram(_program)); + _device=program._device; cl_int error_flag; 
_kernel=clCreateKernel(program._program,function,&error_flag); @@ -380,8 +464,11 @@ inline int UCL_Kernel::set_function(UCL_Program &program, const char *function) } void UCL_Kernel::run() { - CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,nullptr, - _num_blocks,_block_size,0,nullptr,nullptr)); + CL_SAFE_CALL(clEnqueueNDRangeKernel(_cq,_kernel,_dimensions,NULL, + _num_blocks,_block_size,0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + ucl_flush(_cq); + #endif } } // namespace diff --git a/lib/gpu/geryon/ocl_macros.h b/lib/gpu/geryon/ocl_macros.h index aeff689859..0e9ce78389 100644 --- a/lib/gpu/geryon/ocl_macros.h +++ b/lib/gpu/geryon/ocl_macros.h @@ -4,12 +4,8 @@ #include #include -/* We default to OpenCL 1.2 as target version for now as - * there are known issues with OpenCL 2.0 and later. - * This is also to silence warnings from generic OpenCL headers */ - -#if !defined(CL_TARGET_OPENCL_VERSION) -#define CL_TARGET_OPENCL_VERSION 120 +#ifndef CL_TARGET_OPENCL_VERSION +#define CL_TARGET_OPENCL_VERSION 210 #endif #ifdef __APPLE__ diff --git a/lib/gpu/geryon/ocl_memory.h b/lib/gpu/geryon/ocl_memory.h index 740020ab18..8937d4145a 100644 --- a/lib/gpu/geryon/ocl_memory.h +++ b/lib/gpu/geryon/ocl_memory.h @@ -106,9 +106,9 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, mat.cbegin()=clCreateBuffer(context,buffer_perm,n,nullptr,&error_flag); if (error_flag != CL_SUCCESS) return UCL_MEMORY_ERROR; - *mat.host_ptr() = (typename mat_type::data_type*) - clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, - map_perm,0,n,0,nullptr,nullptr,nullptr); + *mat.host_ptr() = (typename mat_type::data_type*) + clEnqueueMapBuffer(cm.cq(),mat.cbegin(),CL_TRUE, + map_perm,0,n,0,NULL,NULL,NULL); mat.cq()=cm.cq(); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -116,18 +116,15 @@ inline int _host_alloc(mat_type &mat, copy_type &cm, const size_t n, } template -inline int _host_view(mat_type &mat, copy_type &cm, const size_t n) { +inline int _host_view(mat_type &mat, copy_type &cm, const size_t o, + const size_t n) { cl_int error_flag; - cl_context context; - CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_CONTEXT,sizeof(context), - &context,nullptr)); - cl_mem_flags orig_flags; - CL_SAFE_CALL(clGetMemObjectInfo(cm.cbegin(),CL_MEM_FLAGS,sizeof(orig_flags), - &orig_flags,nullptr)); - orig_flags=orig_flags & ~CL_MEM_ALLOC_HOST_PTR; - - mat.cbegin()=clCreateBuffer(context, CL_MEM_USE_HOST_PTR | orig_flags, n, - *mat.host_ptr(), &error_flag); + cl_buffer_region subbuffer; + subbuffer.origin = o; + subbuffer.size = n; + mat.cbegin()=clCreateSubBuffer(cm.cbegin(), 0, + CL_BUFFER_CREATE_TYPE_REGION, &subbuffer, + &error_flag); CL_CHECK_ERR(error_flag); CL_SAFE_CALL(clRetainCommandQueue(mat.cq())); @@ -470,6 +467,9 @@ inline void _device_zero(mat_type &mat, const size_t n, command_queue &cq) { size_t kn=n/sizeof(typename mat_type::data_type); CL_SAFE_CALL(clEnqueueNDRangeKernel(cq,kzero,1,0,&kn,0,0,0,0)); #endif + #ifdef GERYON_OCL_FLUSH + ucl_flush(cq); + #endif } // -------------------------------------------------------------------------- @@ -585,7 +585,10 @@ template <> struct _ucl_memcpy<1,0> { std::cerr << "UCL_COPY 1NS\n"; #endif CL_SAFE_CALL(clEnqueueReadBuffer(cq,src.cbegin(),block,src_offset,n, - dst.begin(),0,nullptr,nullptr)); + dst.begin(),0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -617,6 +620,9 @@ template <> struct _ucl_memcpy<1,0> { src_offset+=spitch; 
dst_offset+=dpitch; } + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } }; @@ -637,7 +643,10 @@ template <> struct _ucl_memcpy<0,1> { std::cerr << "UCL_COPY 3NS\n"; #endif CL_SAFE_CALL(clEnqueueWriteBuffer(cq,dst.cbegin(),block,dst_offset,n, - src.begin(),0,nullptr,nullptr)); + src.begin(),0,NULL,NULL)); + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -669,6 +678,9 @@ template <> struct _ucl_memcpy<0,1> { src_offset+=spitch; dst_offset+=dpitch; } + #ifdef GERYON_OCL_FLUSH + if (block==CL_FALSE) ucl_flush(cq); + #endif } }; @@ -690,6 +702,9 @@ template struct _ucl_memcpy { #endif if (block==CL_TRUE) ucl_sync(cq); + #ifdef GERYON_OCL_FLUSH + else ucl_flush(cq); + #endif } template static inline void mc(p1 &dst, const size_t dpitch, const p2 &src, @@ -720,6 +735,9 @@ template struct _ucl_memcpy { #endif if (block==CL_TRUE) ucl_sync(cq); + #ifdef GERYON_OCL_FLUSH + else ucl_flush(cq); + #endif } }; diff --git a/lib/gpu/geryon/ocl_texture.h b/lib/gpu/geryon/ocl_texture.h index 0e60045f55..43de4b258c 100644 --- a/lib/gpu/geryon/ocl_texture.h +++ b/lib/gpu/geryon/ocl_texture.h @@ -53,6 +53,59 @@ class UCL_Texture { friend class UCL_Kernel; }; +/// Class storing a const global memory reference +class UCL_Const { + public: + UCL_Const() : _global_bytes(0), _active(false) {} + ~UCL_Const() { clear(); } + /// Construct with a specified global reference + inline UCL_Const(UCL_Program &prog, const char *global_name) + { get_global(prog,global_name); } + /// Set the global reference for this object + inline void get_global(UCL_Program &prog, const char *global_name) { + if (_active) { + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); + } + _active = true; + _context = prog._context; + _cq = prog._cq; + CL_SAFE_CALL(clRetainContext(_context)); + CL_SAFE_CALL(clRetainCommandQueue(_cq)); + } + /// Copy from array on host to const memory + template + inline void update_device(UCL_H_Vec &src, const int numel) { + const int bytes=numel*sizeof(numtyp); + if (_global_bytes < bytes) { + if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global)); + cl_int e; + _global = clCreateBuffer(_context, CL_MEM_READ_ONLY, bytes, NULL, &e); + CL_SAFE_CALL(e); + } + CL_SAFE_CALL(clEnqueueWriteBuffer(_cq, _global, CL_FALSE, 0, bytes, + (void *)src.begin(), 0, NULL, NULL)); + } + /// Get device ptr associated with object + inline const cl_mem * begin() const { return &_global; } + inline void clear() { + if (_global_bytes) CL_SAFE_CALL(clReleaseMemObject(_global)); + if (_active) { + CL_DESTRUCT_CALL(clReleaseContext(_context)); + CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq)); + } + _global_bytes=0; + _active=false; + } + + private: + cl_mem _global; + size_t _global_bytes; + cl_context _context; + cl_command_queue _cq; + bool _active; +}; + } // namespace #endif diff --git a/lib/gpu/geryon/ocl_timer.h b/lib/gpu/geryon/ocl_timer.h index 8e8ffa929e..ca74312d51 100644 --- a/lib/gpu/geryon/ocl_timer.h +++ b/lib/gpu/geryon/ocl_timer.h @@ -61,7 +61,6 @@ class UCL_Timer { /// Initialize command queue for timing inline void init(UCL_Device &dev, command_queue &cq) { clear(); - t_factor=dev.timer_resolution()/1000000000.0; _cq=cq; clRetainCommandQueue(_cq); _initialized=true; @@ -124,17 +123,17 @@ class UCL_Timer { clReleaseEvent(start_event); clReleaseEvent(stop_event); has_measured_time = false; - return (tend-tstart)*t_factor; + return (tend-tstart)*1e-6; } 
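A note on the arithmetic in time() above: with t_factor gone, the timer no longer queries dev.timer_resolution(); OpenCL event profiling timestamps are defined in nanoseconds, so scaling the difference by 1e-6 yields milliseconds directly. A minimal sketch of the computation the method wraps (the helper name elapsed_ms is hypothetical; it assumes the command queue was created with CL_QUEUE_PROFILING_ENABLE):

    // Elapsed milliseconds between two profiled OpenCL events.
    // CL_PROFILING_COMMAND_START/END return cl_ulong nanosecond timestamps.
    static double elapsed_ms(cl_event start_event, cl_event stop_event) {
      cl_ulong tstart, tend;
      clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START,
                              sizeof(cl_ulong), &tstart, NULL);
      clGetEventProfilingInfo(stop_event, CL_PROFILING_COMMAND_END,
                              sizeof(cl_ulong), &tend, NULL);
      return (tend - tstart) * 1e-6;  // ns -> ms
    }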
/// Return the time (s) of last start to stop - Forces synchronization - inline double seconds() { return time()/1000.0; } + inline double seconds() { return time()*1e-3; } /// Return the total time in ms inline double total_time() { return _total_time; } /// Return the total time in seconds - inline double total_seconds() { return _total_time/1000.0; } + inline double total_seconds() { return _total_time*1e-3; } private: cl_event start_event, stop_event; diff --git a/lib/gpu/geryon/ucl_basemat.h b/lib/gpu/geryon/ucl_basemat.h index 07e23aebe7..51fd33d623 100644 --- a/lib/gpu/geryon/ucl_basemat.h +++ b/lib/gpu/geryon/ucl_basemat.h @@ -69,17 +69,17 @@ class UCL_BaseMat { + /// Ensure that any pending ops in the associated command queue have been issued to the device + inline void flush() { ucl_flush(_cq); } + /// Return the type/permissions of memory allocation /** Returns UCL_READ_WRITE, UCL_WRITE_ONLY, UCL_READ_ONLY, UCL_NOT_PINNED * or UCL_VIEW **/ inline enum UCL_MEMOPT kind() const { return _kind; } inline bool shared_mem_device() { #ifdef _OCL_MAT cl_device_id device; CL_SAFE_CALL(clGetCommandQueueInfo(_cq,CL_QUEUE_DEVICE, - sizeof(cl_device_id),&device,nullptr)); - cl_device_type device_type; - CL_SAFE_CALL(clGetDeviceInfo(device,CL_DEVICE_TYPE, - sizeof(device_type),&device_type,nullptr)); - return _shared_mem_device(device_type); + sizeof(cl_device_id),&device,NULL)); + return _shared_mem_device(device); #else return false; #endif diff --git a/lib/gpu/geryon/ucl_d_vec.h b/lib/gpu/geryon/ucl_d_vec.h index cd2a90fe2d..e791f18f29 100644 --- a/lib/gpu/geryon/ucl_d_vec.h +++ b/lib/gpu/geryon/ucl_d_vec.h @@ -39,7 +39,7 @@ class UCL_D_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_D_Vec() : _cols(0) {} + UCL_D_Vec() : _cols(0), _row_bytes(0) {} ~UCL_D_Vec() { _device_free(*this); } /// Construct with n columns diff --git a/lib/gpu/geryon/ucl_get_devices.cpp b/lib/gpu/geryon/ucl_get_devices.cpp index b8dfc6f7b1..5654bb40bd 100644 --- a/lib/gpu/geryon/ucl_get_devices.cpp +++ b/lib/gpu/geryon/ucl_get_devices.cpp @@ -44,10 +44,8 @@ using namespace ucl_hip; int main(int argc, char** argv) { UCL_Device cop; std::cout << "Found " << cop.num_platforms() << " platform(s).\n"; - if (cop.num_platforms()>0) { - std::cout << "Using platform: " << cop.platform_name() << std::endl; + if (cop.num_platforms()>0) cop.print_all(std::cout); - } return 0; } diff --git a/lib/gpu/geryon/ucl_h_mat.h b/lib/gpu/geryon/ucl_h_mat.h index 1df3c2de4b..41dad2b285 100644 --- a/lib/gpu/geryon/ucl_h_mat.h +++ b/lib/gpu/geryon/ucl_h_mat.h @@ -241,7 +241,7 @@ class UCL_H_Mat : public UCL_BaseMat { _array=input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_view(*this,input,_row_bytes*_rows); + _host_view(*this,input,offset*sizeof(numtyp),_row_bytes*_rows); #endif } diff --git a/lib/gpu/geryon/ucl_h_vec.h b/lib/gpu/geryon/ucl_h_vec.h index a9d64349d9..5de0c312b0 100644 --- a/lib/gpu/geryon/ucl_h_vec.h +++ b/lib/gpu/geryon/ucl_h_vec.h @@ -39,7 +39,7 @@ class UCL_H_Vec : public UCL_BaseMat { }; typedef numtyp data_type; - UCL_H_Vec() : _cols(0) { + UCL_H_Vec() : _cols(0), _row_bytes(0) { #ifdef _OCL_MAT _carray=(cl_mem)(0); #endif @@ -135,7 +135,7 @@ class UCL_H_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); - _array=input.begin(); + _array=(numtyp *)input.begin(); _end=_array+_cols; #ifdef _OCL_MAT _carray=input.cbegin(); @@ -240,10 +240,10 @@ class UCL_H_Vec : public UCL_BaseMat { _cols=cols; _row_bytes=_cols*sizeof(numtyp); this->_cq=input.cq(); -
_array=input.begin()+offset; + _array=(numtyp *)input.begin()+offset; _end=_array+_cols; #ifdef _OCL_MAT - _host_view(*this,input,_row_bytes); + _host_view(*this,input,offset*sizeof(numtyp),_row_bytes); #endif } diff --git a/lib/gpu/geryon/ucl_vector.h b/lib/gpu/geryon/ucl_vector.h index 7fe2604de6..c03fd31fce 100644 --- a/lib/gpu/geryon/ucl_vector.h +++ b/lib/gpu/geryon/ucl_vector.h @@ -162,7 +162,9 @@ class UCL_Vector { inline void cq(command_queue &cq_in) { host.cq(cq_in); device.cq(cq_in); } /// Block until command_queue associated with matrix is complete inline void sync() { host.sync(); } - + /// Ensure that any pending ops in the associated command queue have been issued to the device + inline void flush() { ucl_flush(host.cq()); } + ///Get the size of a row on the host (including any padding) in elements inline size_t row_size() const { return host.row_size(); } /// Get the size of a row on the host(including any padding) in bytes diff --git a/lib/gpu/lal_answer.cpp b/lib/gpu/lal_answer.cpp index 803b781286..4a68466d05 100644 --- a/lib/gpu/lal_answer.cpp +++ b/lib/gpu/lal_answer.cpp @@ -14,6 +14,9 @@ ***************************************************************************/ #include "lal_answer.h" +#if (LAL_USE_OMP == 1) +#include <omp.h> +#endif namespace LAMMPS_AL { #define AnswerT Answer<numtyp,acctyp> @@ -56,7 +59,7 @@ bool AnswerT::alloc(const int inum) { template <class numtyp, class acctyp> bool AnswerT::init(const int inum, const bool charge, const bool rot, - UCL_Device &devi) { + UCL_Device &devi) { clear(); bool success=true; @@ -81,6 +84,10 @@ bool AnswerT::init(const int inum, const bool charge, const bool rot, _time_cast=0.0; _time_cpu_idle=0.0; + success=success && (error_flag.alloc(1,*dev,UCL_READ_WRITE, + UCL_WRITE_ONLY)==UCL_SUCCESS); + if (success) error_flag.zero(); + return success && alloc(ef_inum); } @@ -111,6 +118,7 @@ bool AnswerT::add_fields(const bool charge, const bool rot) { template <class numtyp, class acctyp> void AnswerT::clear() { _gpu_bytes=0; + error_flag.clear(); if (!_allocated) return; _allocated=false; @@ -138,12 +146,21 @@ double AnswerT::host_memory_usage() const { template <class numtyp, class acctyp> void AnswerT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom) { + const bool ef_atom, const bool vf_atom, + const int red_blocks) { time_answer.start(); _eflag=eflag; _vflag=vflag; _ef_atom=ef_atom; _vf_atom=vf_atom; + #ifdef LAL_NO_BLOCK_REDUCE + _ev_stride=_inum; + #else + if (ef_atom || vf_atom) + _ev_stride=_inum; + else + _ev_stride=red_blocks; + #endif int csize=_ev_fields; if (!eflag) csize-=6; if (csize>0) - engv.update_host(_inum*csize,true); + engv.update_host(_ev_stride*csize,true); if (_rot) force.update_host(_inum*4*2,true); else force.update_host(_inum*4,true); time_answer.stop(); + + #ifndef GERYON_OCL_FLUSH + force.flush(); + #endif } template <class numtyp, class acctyp> void AnswerT::copy_answers(const bool eflag, const bool vflag, - const bool ef_atom, const bool vf_atom, - int *ilist) { + const bool ef_atom, const bool vf_atom, + int *ilist, const int red_blocks) { _ilist=ilist; - copy_answers(eflag,vflag,ef_atom,vf_atom); + copy_answers(eflag,vflag,ef_atom,vf_atom,red_blocks); } template <class numtyp, class acctyp> @@ -177,21 +198,24 @@ double AnswerT::energy_virial(double *eatom, double **vatom, double evdwl=0.0; int vstart=0; if (_eflag) { - for (int i=0; i<_inum; i++) + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd reduction(+:evdwl) + #endif + for (int i=0; i<_ev_stride; i++) evdwl+=engv[i]; if (_ef_atom) { if (_ilist==nullptr) { - for (int i=0; i<_inum; i++) + for (int
i=0; i<_ev_stride; i++) eatom[i]+=engv[i]; } else { - for (int i=0; i<_inum; i++) + for (int i=0; i<_ev_stride; i++) eatom[_ilist[i]]+=engv[i]; } } - vstart=_inum; + vstart=_ev_stride; } if (_vflag) { - int iend=vstart+_inum; + int iend=vstart+_ev_stride; for (int j=0; j<6; j++) { for (int i=vstart; i void AnswerT::get_answers(double **f, double **tor) { - int fl=0; if (_ilist==nullptr) { - for (int i=0; i<_inum; i++) { - f[i][0]+=force[fl]; - f[i][1]+=force[fl+1]; - f[i][2]+=force[fl+2]; - fl+=4; - } - if (_rot) { - for (int i=0; i<_inum; i++) { - tor[i][0]+=force[fl]; - tor[i][1]+=force[fl+1]; - tor[i][2]+=force[fl+2]; - fl+=4; + typedef struct { double x,y,z; } vec3d; + typedef struct { acctyp x,y,z,w; } vec4d_t; + vec3d *fp=reinterpret_cast(&(f[0][0])); + vec4d_t *forcep=reinterpret_cast(&(force[0])); + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = omp_get_num_threads(); + const int tid = omp_get_thread_num(); + const int idelta = _inum / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = std::min(ifrom + idelta, _inum); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = _inum; + #endif + + for (int i=ifrom; i(&(tor[0][0])); + vec4d_t *torquep=reinterpret_cast(&(force[_inum*4])); + for (int i=ifrom; i force; /// Energy and virial per-atom storage UCL_Vector engv; + /// Error flag + UCL_Vector error_flag; /// Device timers UCL_Timer time_answer; @@ -162,7 +166,7 @@ class Answer { bool alloc(const int inum); bool _allocated, _eflag, _vflag, _ef_atom, _vf_atom, _rot, _charge, _other; - int _max_local, _inum, _e_fields, _ev_fields, _ans_fields; + int _max_local, _inum, _e_fields, _ev_fields, _ans_fields, _ev_stride; int *_ilist; double _time_cast, _time_cpu_idle; diff --git a/lib/gpu/lal_atom.cpp b/lib/gpu/lal_atom.cpp index 7ce3e3e7ff..cda4d383b5 100644 --- a/lib/gpu/lal_atom.cpp +++ b/lib/gpu/lal_atom.cpp @@ -414,9 +414,9 @@ const char *atom=0; template void AtomT::compile_kernels(UCL_Device &dev) { - std::string flags = "-D"+std::string(OCL_VENDOR); + std::string flags = ""; atom_program=new UCL_Program(dev); - atom_program->load_string(atom,flags); + atom_program->load_string(atom,flags,nullptr,screen); k_cast_x.set_function(*atom_program,"kernel_cast_x"); _compiled=true; } diff --git a/lib/gpu/lal_atom.h b/lib/gpu/lal_atom.h index e39740d6c8..3cf97d94a0 100644 --- a/lib/gpu/lal_atom.h +++ b/lib/gpu/lal_atom.h @@ -24,6 +24,9 @@ #include "geryon/ocl_mat.h" #include "geryon/ocl_kernel.h" using namespace ucl_opencl; +#ifndef LAL_NO_OCL_EV_JIT +#define LAL_OCL_EV_JIT +#endif #elif defined(USE_CUDART) #include "geryon/nvc_timer.h" #include "geryon/nvc_mat.h" @@ -178,7 +181,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -197,7 +200,26 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); + ucl_copy(dev_v,view,false); + } + + /// Pack LAMMPS atom type constants into 2 vectors and copy to device + template + inline void type_pack2(const int n, UCL_D_Vec &dev_v, + UCL_H_Vec &buffer, t1 ***one, t2 ***two) { + int ii=0; + for (int i=0; i(one[i][j][k]); + buffer[ii*2+1]=static_cast(two[i][j][k]); + ii++; + } + } + } + UCL_H_Vec view; + view.view_offset(0,buffer,n*n*n); ucl_copy(dev_v,view,false); } @@ -217,7 +239,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - 
view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -238,7 +260,7 @@ class Atom { ii+=m_size-n; } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),m_size*m_size,*dev); + view.view_offset(0,buffer,m_size*m_size); ucl_copy(dev_v,view,false); } @@ -251,7 +273,7 @@ class Atom { buffer[i*2+1]=static_cast(two[i][i]); } UCL_H_Vec view; - view.view((dev_typ*)buffer.begin(),n,*dev); + view.view_offset(0,buffer,n); ucl_copy(dev_v,view,false); } @@ -261,6 +283,9 @@ class Atom { inline void data_unavail() { _x_avail=false; _q_avail=false; _quat_avail=false; _v_avail=false; _resized=false; } + typedef struct { double x,y,z; } vec3d; + typedef struct { numtyp x,y,z,w; } vec4d_t; + /// Cast positions and types to write buffer inline void cast_x_data(double **host_ptr, const int *host_type) { if (_x_avail==false) { @@ -269,13 +294,16 @@ class Atom { memcpy(host_x_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_type_cast.begin(),host_type,_nall*sizeof(int)); #else - int wl=0; + vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); + vec4d_t *xp=reinterpret_cast(&(x[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif for (int i=0; i<_nall; i++) { - x[wl]=host_ptr[i][0]; - x[wl+1]=host_ptr[i][1]; - x[wl+2]=host_ptr[i][2]; - x[wl+3]=host_type[i]; - wl+=4; + xp[i].x=host_p[i].x; + xp[i].y=host_p[i].y; + xp[i].z=host_p[i].z; + xp[i].w=host_type[i]; } #endif _time_cast+=MPI_Wtime()-t; @@ -320,6 +348,11 @@ class Atom { } else if (sizeof(numtyp)==sizeof(double)) memcpy(q.host.begin(),host_ptr,_nall*sizeof(numtyp)); else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif for (int i=0; i<_nall; i++) q[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } @@ -346,6 +379,11 @@ class Atom { } else if (sizeof(numtyp)==sizeof(double)) memcpy(quat.host.begin(),host_ptr,_nall*4*sizeof(numtyp)); else + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif for (int i=0; i<_nall*4; i++) quat[i]=host_ptr[i]; _time_cast+=MPI_Wtime()-t; } @@ -370,13 +408,16 @@ class Atom { memcpy(host_v_cast.begin(),host_ptr[0],_nall*3*sizeof(double)); memcpy(host_tag_cast.begin(),host_tag,_nall*sizeof(int)); #else - int wl=0; + vec3d *host_p=reinterpret_cast(&(host_ptr[0][0])); + vec4d_t *vp=reinterpret_cast(&(v[0])); + #if (LAL_USE_OMP == 1) + #pragma omp parallel for schedule(static) + #endif for (int i=0; i<_nall; i++) { - v[wl]=host_ptr[i][0]; - v[wl+1]=host_ptr[i][1]; - v[wl+2]=host_ptr[i][2]; - v[wl+3]=host_tag[i]; - wl+=4; + vp[i].x=host_p[i].x; + vp[i].y=host_p[i].y; + vp[i].z=host_p[i].z; + vp[i].w=host_tag[i]; } #endif _time_cast+=MPI_Wtime()-t; diff --git a/lib/gpu/lal_aux_fun1.h b/lib/gpu/lal_aux_fun1.h index 5b7150d950..be00abbcef 100644 --- a/lib/gpu/lal_aux_fun1.h +++ b/lib/gpu/lal_aux_fun1.h @@ -40,170 +40,521 @@ nbor_begin+=offset; \ } -#if (ARCH < 300) +#define nbor_info_p(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride+ii*(t_per_atom-1); \ + stride=fast_mul(t_per_atom,nbor_stride); \ + nbor_end=nbor_begin+fast_mul(numj/t_per_atom,stride)+(numj & \ + (t_per_atom-1)); \ + nbor_begin+=offset; -#define store_answers(f, energy, virial, ii, inum, tid, 
t_per_atom, offset, \ - eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ +#if (SHUFFLE_AVAIL == 0) + +#define simd_reduce_add1(width, local, offset, tid, one) \ + local[0][tid]=one; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (offset==0) one=local[0][tid]; + +#define simd_reduce_add2(width, local, offset, tid, one, two) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ } \ } \ if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ + one=local[0][tid]; \ + two=local[1][tid]; \ + } + +#define simd_reduce_add3(width, local, offset, tid, one, two, three) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + local[2][tid]=three; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + local[2][tid] += local[2][tid+s]; \ } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ + } \ + if (offset==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + three=local[2][tid]; \ + } + +#define simd_reduce_add6(width, local, offset, tid, one, two, three, \ + four, five, six) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + local[2][tid]=three; \ + local[3][tid]=four; \ + local[4][tid]=five; \ + local[5][tid]=six; \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + local[2][tid] += local[2][tid+s]; \ + local[3][tid] += local[3][tid+s]; \ + local[4][tid] += local[4][tid+s]; \ + local[5][tid] += local[5][tid+s]; \ + } \ + } \ + if (offset==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + three=local[2][tid]; \ + four=local[3][tid]; \ + five=local[4][tid]; \ + six=local[5][tid]; \ + } + +#define simd_reduce_arr(trip, width, local, offset, tid, arr) \ + for (int r=0; r0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; rwidth/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (tid0; s>>=1) { \ + simdsync(); \ + if (tid < s) local[0][tid] += local[0][tid+s]; \ + } \ + if (tid==0) one=local[0][tid]; \ + } + +#define block_reduce_add2(width, local, tid, one, two) \ + local[0][tid]=one; \ + local[1][tid]=two; \ + for (unsigned int s=BLOCK_SIZE_X/2; s>width/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += local[1][tid+s]; \ + } \ + } \ + if (tid0; s>>=1) { \ + simdsync(); \ + if (tid < s) { \ + local[0][tid] += local[0][tid+s]; \ + local[1][tid] += 
local[1][tid+s]; \ + } \ + } \ + if (tid==0) { \ + one=local[0][tid]; \ + two=local[1][tid]; \ + } \ + } + +#define block_reduce_arr(trip, width, local, tid, arr) \ + for (int r=0; rwidth/2; s>>=1) { \ + __syncthreads(); \ + if (tid < s) { \ + for (int r=0; r0; s>>=1) { \ + simdsync(); \ + if (tid < s) { \ + for (int r=0; r1) { \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ + } \ + } \ + } \ + if (offset==0 && ii1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - red_acc[4][tid]=e_coul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<5; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - e_coul=red_acc[4][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii0; s>>=1) one += shfl_down(one, s, width); + +#define simd_reduce_add2(width, one, two) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + } + +#define simd_reduce_add3(width, one, two, three) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + three += shfl_down(three, s, width); \ + } + +#define simd_reduce_add6(width, one, two, three, four, five, six) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + one += shfl_down(one, s, width); \ + two += shfl_down(two, s, width); \ + three += shfl_down(three, s, width); \ + four += shfl_down(four, s, width); \ + five += shfl_down(five, s, width); \ + six += shfl_down(six, s, width); \ + } + +#define simd_reduce_arr(trip, width, arr) \ + for (unsigned int s=width/2; s>0; s>>=1) { \ + for (int r=0; r1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int 
ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if 
(offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii global_device; template -BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0) { +BaseAtomicT::BaseAtomic() : _compiled(false), _max_bytes(0), _onetype(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseAtomicT::~BaseAtomic() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -49,7 +56,7 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, - const char *k_name) { + const char *k_name, const int onetype) { screen=_screen; int gpu_nbor=0; @@ -64,28 +71,29 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,false,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -102,8 +110,8 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall, } template -void BaseAtomicT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseAtomicT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -164,8 +172,8 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -177,13 +185,27 @@ inline void BaseAtomicT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseAtomicT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success) { + const int nall, double **host_x, int *host_type, + int 
*ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -207,8 +229,8 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -218,14 +240,28 @@ void BaseAtomicT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int ** BaseAtomicT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -254,8 +290,8 @@ int ** BaseAtomicT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -270,19 +306,46 @@ double BaseAtomicT::host_memory_usage_atomic() const { template void BaseAtomicT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { - if (_compiled) + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) return; + _onetype=onetype; std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if 
(pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseAtomic; diff --git a/lib/gpu/lal_base_atomic.h b/lib/gpu/lal_base_atomic.h index c97f42c50e..701675390f 100644 --- a/lib/gpu/lal_base_atomic.h +++ b/lib/gpu/lal_base_atomic.h @@ -53,10 +53,11 @@ class BaseAtomic { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int onetype=0); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -100,7 +101,7 @@ class BaseAtomic { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -179,23 +180,31 @@ class BaseAtomic { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; protected: bool _compiled; - int _block_size, _threads_per_atom; + int _block_size, _threads_per_atom, _onetype; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); + void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k, + const int onetype); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_charge.cpp b/lib/gpu/lal_base_charge.cpp index d5a6e06222..b0d08e4df7 100644 --- a/lib/gpu/lal_base_charge.cpp +++ b/lib/gpu/lal_base_charge.cpp @@ -27,6 +27,9 @@ BaseChargeT::BaseCharge() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseChargeT::~BaseCharge() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + 
#endif } template @@ -64,21 +71,11 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,true,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -88,6 +85,17 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -104,8 +112,8 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall, } template -void BaseChargeT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseChargeT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(1+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -166,8 +174,8 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -179,14 +187,28 @@ inline void BaseChargeT::build_nbor_list(const int inum, const int host_inum, // --------------------------------------------------------------------------- template void BaseChargeT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -215,8 +237,8 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int 
red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -226,15 +248,29 @@ void BaseChargeT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int** BaseChargeT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -269,8 +305,8 @@ int** BaseChargeT::compute(const int ago, const int inum_full, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -292,13 +328,37 @@ void BaseChargeT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseCharge; diff --git a/lib/gpu/lal_base_charge.h b/lib/gpu/lal_base_charge.h index b6d3e9e3f8..6b8761092a 100644 --- a/lib/gpu/lal_base_charge.h +++ b/lib/gpu/lal_base_charge.h @@ -57,7 +57,7 @@ class BaseCharge { const 
void *pair_program, const char *k_name); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** \param success set to false if insufficient memory **/ @@ -103,7 +103,7 @@ class BaseCharge { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -177,9 +177,15 @@ class BaseCharge { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -194,7 +200,7 @@ class BaseCharge { void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_dipole.cpp b/lib/gpu/lal_base_dipole.cpp index 57773a3b80..9781065b13 100644 --- a/lib/gpu/lal_base_dipole.cpp +++ b/lib/gpu/lal_base_dipole.cpp @@ -27,6 +27,9 @@ BaseDipoleT::BaseDipole() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseDipoleT::~BaseDipole() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -65,30 +72,30 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall, _gpu_host=1; _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); int success=device->init(*ans,true,true,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); compile_kernels(*ucl_device,pair_program,k_name); + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -168,8 +175,8 @@ inline void BaseDipoleT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, 
*atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -183,12 +190,26 @@ template void BaseDipoleT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double *host_q, double **host_mu, const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -219,8 +240,8 @@ void BaseDipoleT::compute(const int f_ago, const int inum_full, device->precompute(f_ago,nlocal,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -232,14 +253,28 @@ template int** BaseDipoleT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -277,8 +312,8 @@ int** BaseDipoleT::compute(const int ago, const int inum_full, device->precompute(ago,inum_full,nall,host_x,host_type,success,host_q, boxlo, prd); - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -300,14 +335,38 @@ void BaseDipoleT::compile_kernels(UCL_Device &dev, const void *pair_str, std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); q_tex.get_texture(*pair_program,"q_tex"); mu_tex.get_texture(*pair_program,"mu_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (pair_program_noev) delete 
pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseDipole; diff --git a/lib/gpu/lal_base_dipole.h b/lib/gpu/lal_base_dipole.h index 856b69b56b..f7cefd9066 100644 --- a/lib/gpu/lal_base_dipole.h +++ b/lib/gpu/lal_base_dipole.h @@ -102,7 +102,7 @@ class BaseDipole { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -176,9 +176,16 @@ class BaseDipole { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -187,14 +194,14 @@ class BaseDipole { protected: bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; + int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_dpd.cpp b/lib/gpu/lal_base_dpd.cpp index e4fd80fcc3..4b6a964bfb 100644 --- a/lib/gpu/lal_base_dpd.cpp +++ b/lib/gpu/lal_base_dpd.cpp @@ -27,6 +27,9 @@ BaseDPDT::BaseDPD() : _compiled(false), _max_bytes(0) { nbor=new Neighbor(); pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -36,6 +39,10 @@ BaseDPDT::~BaseDPD() { k_pair_fast.clear(); k_pair.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template @@ -47,9 +54,9 @@ int BaseDPDT::bytes_per_atom_atomic(const int max_nbors) const { template int BaseDPDT::init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, - const double gpu_split, FILE *_screen, - const void *pair_program, const char *k_name) { + const double cell_size, const double gpu_split, + FILE *_screen, const void *pair_program, + const char *k_name, const int onetype) { screen=_screen; int gpu_nbor=0; @@ -63,31 +70,30 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_charge(); - if (_threads_per_atom>1 && gpu_nbor==0) { - 
nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else - _nbor_data=&(nbor->dev_nbor); + _threads_per_atom=device->threads_per_atom(); int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; atom=&device->atom; _block_size=device->pair_block_size(); - _block_bio_size=device->block_bio_pair(); - compile_kernels(*ucl_device,pair_program,k_name); + compile_kernels(*ucl_device,pair_program,k_name,onetype); + + if (_threads_per_atom>1 && gpu_nbor==0) { + nbor->packing(true); + _nbor_data=&(nbor->dev_packed); + } else + _nbor_data=&(nbor->dev_nbor); + + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,false,_threads_per_atom); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -167,8 +173,8 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); if (bytes>_max_an_bytes) @@ -179,16 +185,30 @@ inline void BaseDPDT::build_nbor_list(const int inum, const int host_inum, // Copy nbor list from host if necessary and then calculate forces, virials,.. // --------------------------------------------------------------------------- template -void BaseDPDT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, tagint *tag, double **host_v, - const double dtinvsqrt, const int seed, const int timestep, +void BaseDPDT::compute(const int f_ago, const int inum_full, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag_in, + const bool vflag_in, const bool eatom, + const bool vatom, int &host_start, + const double cpu_time, bool &success, tagint *tag, + double **host_v, const double dtinvsqrt, + const int seed, const int timestep, const int nlocal, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -218,8 +238,8 @@ void BaseDPDT::compute(const int f_ago, const int inum_full, _seed = seed; _timestep = timestep; - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); } @@ -231,8 +251,8 @@ template int** BaseDPDT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double 
*subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, + int **nspecial, tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, @@ -240,6 +260,20 @@ int** BaseDPDT::compute(const int ago, const int inum_full, const int seed, const int timestep, double *boxlo, double *prd) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -275,8 +309,8 @@ int** BaseDPDT::compute(const int ago, const int inum_full, _seed = seed; _timestep = timestep; - loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -291,20 +325,48 @@ double BaseDPDT::host_memory_usage_atomic() const { template void BaseDPDT::compile_kernels(UCL_Device &dev, const void *pair_str, - const char *kname) { - if (_compiled) + const char *kname, const int onetype) { + if (_compiled && _onetype==onetype) return; + _onetype=onetype; + std::string s_fast=std::string(kname)+"_fast"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_pair_fast.set_function(*pair_program,s_fast.c_str()); k_pair.set_function(*pair_program,kname); pos_tex.get_texture(*pair_program,"pos_tex"); vel_tex.get_texture(*pair_program,"vel_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype) oclstring+=" -DONETYPE="+device->toa(_onetype); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_pair_noev.set_function(*pair_program_noev,s_fast.c_str()); + #else + k_pair_sel = &k_pair_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair_fast.max_subgroup_size(_block_size); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseDPD; diff --git a/lib/gpu/lal_base_dpd.h b/lib/gpu/lal_base_dpd.h index 5d1573c1a9..9eb56993af 100644 --- a/lib/gpu/lal_base_dpd.h +++ b/lib/gpu/lal_base_dpd.h @@ -52,7 +52,8 @@ class BaseDPD { int init_atomic(const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *screen, - const void *pair_program, const char *k_name); + const void *pair_program, const char *k_name, + const int onetype=0); /// Estimate the overhead for GPU context changes and CPU driver void 
estimate_gpu_overhead(); @@ -101,7 +102,7 @@ class BaseDPD { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_pair.add_to_total(); atom->acc_timers(); ans->acc_timers(); @@ -177,9 +178,16 @@ class BaseDPD { Neighbor *nbor; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Program *pair_program; - UCL_Kernel k_pair_fast, k_pair; + UCL_Program *pair_program, *pair_program_noev; + UCL_Kernel k_pair_fast, k_pair, k_pair_noev, *k_pair_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_sel = &k_pair_fast; + else k_pair_sel = &k_pair_noev; + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex; @@ -191,13 +199,14 @@ class BaseDPD { protected: bool _compiled; - int _block_size, _block_bio_size, _threads_per_atom; + int _block_size, _threads_per_atom, _onetype; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; UCL_D_Vec *_nbor_data; - void compile_kernels(UCL_Device &dev, const void *pair_string, const char *k); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + void compile_kernels(UCL_Device &dev, const void *pair_string, + const char *k, const int onetype); + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_ellipsoid.cpp b/lib/gpu/lal_base_ellipsoid.cpp index 524705ed41..98411a8033 100644 --- a/lib/gpu/lal_base_ellipsoid.cpp +++ b/lib/gpu/lal_base_ellipsoid.cpp @@ -29,7 +29,8 @@ const char *ellipsoid_nbor=0; extern Device global_device; template -BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { +BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0), + host_olist_size(0) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -37,6 +38,10 @@ BaseEllipsoidT::BaseEllipsoid() : _compiled(false), _max_bytes(0) { ellipsoid_program=nullptr; lj_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + ellipsoid_program_noev=nullptr; + lj_program_noev=nullptr; + #endif } template @@ -53,6 +58,14 @@ BaseEllipsoidT::~BaseEllipsoid() { if (nbor_program) delete nbor_program; if (ellipsoid_program) delete ellipsoid_program; if (lj_program) delete lj_program; + #if defined(LAL_OCL_EV_JIT) + k_ellipsoid_noev.clear(); + k_ellipsoid_sphere_noev.clear(); + k_sphere_ellipsoid_noev.clear(); + k_lj_fast.clear(); + if (ellipsoid_program_noev) delete ellipsoid_program_noev; + if (lj_program_noev) delete lj_program_noev; + #endif } template @@ -89,11 +102,6 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,true,1); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -102,6 +110,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, _block_size=device->block_ellipse(); compile_kernels(*ucl_device,ellipsoid_program,lj_program,k_name,ellip_sphere); + success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, + max_nbors,cell_size,true,1); + if (success!=0) + return success; + // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -133,12 +146,11 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall, if (_multiple_forms && 
gpu_nbor!=0) return -9; - if (_multiple_forms) + if (_multiple_forms) { ans->force.zero(); - - // Memory for ilist ordered by particle type - if (host_olist.alloc(nbor->max_atoms(),*ucl_device)!=UCL_SUCCESS) - return -3; + host_olist_size = nbor->max_atoms(); + host_olist = new int[nbor->max_atoms()]; + } _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); @@ -160,7 +172,10 @@ template void BaseEllipsoidT::clear_base() { // Output any timing information output_times(); - host_olist.clear(); + if (host_olist_size) { + host_olist_size = 0; + delete []host_olist; + } time_nbor1.clear(); time_ellipsoid.clear(); @@ -206,10 +221,14 @@ void BaseEllipsoidT::output_times() { MPI_Reduce(&_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0, device->replica()); double max_mb=mpi_max_bytes/(1024*1024); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; + + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif if (device->replica_me()==0) - if (screen && times[5]>0.0) { + if (screen && times[7]>0.0) { int replica_size=device->replica_size(); fprintf(screen,"\n\n-------------------------------------"); @@ -218,9 +237,8 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (device->procs_per_gpu()==1 && t_time>0) { + if (device->procs_per_gpu()==1 && times[3]>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/replica_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/replica_size); if (nbor->gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/replica_size); @@ -229,13 +247,15 @@ void BaseEllipsoidT::output_times() { fprintf(screen,"Force calc: %.4f s.\n",times[3]/replica_size); fprintf(screen,"LJ calc: %.4f s.\n",times[4]/replica_size); } - if (nbor->gpu_nbor()==2) - fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[9]/replica_size); if (times[6]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[6]/replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Lanes / atom: %d.\n",_threads_per_atom); + fprintf(screen,"Vector width: %d.\n", device->simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + if (nbor->gpu_nbor()==2) + fprintf(screen,"CPU Neighbor: %.4f s.\n",times[9]/replica_size); + fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[5]/replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[7]/replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[8]/replica_size); fprintf(screen,"-------------------------------------"); @@ -256,11 +276,13 @@ void BaseEllipsoidT::pack_nbors(const int GX, const int BX, const int start, if (shared_types) { k_nbor_fast.set_size(GX,BX); k_nbor_fast.run(&atom->x, &cut_form, &nbor->dev_nbor, &stride, &start, - &inum, &nbor->dev_packed, &form_low, &form_high); + &inum, &nbor->dev_packed, &form_low, &form_high, + &_threads_per_atom); } else { k_nbor.set_size(GX,BX); k_nbor.run(&atom->x, &cut_form, &ntypes, &nbor->dev_nbor, &stride, - &start, &inum, &nbor->dev_packed, &form_low, &form_high); + &start, &inum, &nbor->dev_packed, &form_low, &form_high, + &_threads_per_atom); } } @@ -298,7 +320,7 @@ void BaseEllipsoidT::reset_nbors(const int nall, const int inum, p++; } } - nbor->get_host(inum,host_olist.begin(),numj,firstneigh,block_size()); + 
nbor->get_host(inum,host_olist,numj,firstneigh,block_size()); nbor->copy_unpacked(inum,mn); return; } @@ -330,8 +352,8 @@ inline void BaseEllipsoidT::build_nbor_list(const int inum, const int host_inum, atom->cast_copy_x(host_x,host_type); int mn; - nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, inum, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); nbor->copy_unpacked(inum,mn); _last_ellipse=inum; _max_last_ellipse=inum; @@ -348,11 +370,18 @@ template int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, double **host_quat) { acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; zero_timers(); @@ -373,7 +402,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, } int *list; if (_multiple_forms) - list=host_olist.begin(); + list=host_olist; else list=ilist; @@ -384,7 +413,7 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, atom->add_quat_data(); loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom,list); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,list,inum); device->add_ans_object(ans); hd_balancer.stop_timer(); return list; @@ -394,15 +423,23 @@ int* BaseEllipsoidT::compute(const int f_ago, const int inum_full, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, +int** BaseEllipsoidT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat) { acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; zero_timers(); @@ -435,7 +472,7 @@ int** BaseEllipsoidT::compute(const int ago, const int inum_full, const int nall *jnum=nbor->host_acc.begin(); loop(eflag,vflag); - ans->copy_answers(eflag,vflag,eatom,vatom); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum); device->add_ans_object(ans); hd_balancer.stop_timer(); @@ -462,25 +499,26 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, std::string s_lj=kns+"_lj"; std::string s_lj_fast=kns+"_lj_fast"; - std::string flags=device->compile_string(); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; if (nbor_program) delete nbor_program; nbor_program=new UCL_Program(dev); - nbor_program->load_string(ellipsoid_nbor,flags.c_str()); + nbor_program->load_string(ellipsoid_nbor,oclstring.c_str(),nullptr,screen); k_nbor_fast.set_function(*nbor_program,"kernel_nbor_fast"); 
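// All three programs here (nbor, ellipsoid, lj) are JIT-built with -DEVFLAG=1,
// i.e. with energy/virial accumulation compiled in; when LAL_OCL_EV_JIT is
// defined, a second -DEVFLAG=0 build follows below and set_kernel() selects
// between the two variants at run time.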
k_nbor.set_function(*nbor_program,"kernel_nbor"); neigh_tex.get_texture(*nbor_program,"pos_tex"); if (ellipsoid_program) delete ellipsoid_program; ellipsoid_program=new UCL_Program(dev); - ellipsoid_program->load_string(ellipsoid_string,flags.c_str()); + ellipsoid_program->load_string(ellipsoid_string,oclstring.c_str(), + nullptr,screen); k_ellipsoid.set_function(*ellipsoid_program,kname); pos_tex.get_texture(*ellipsoid_program,"pos_tex"); quat_tex.get_texture(*ellipsoid_program,"quat_tex"); if (lj_program) delete lj_program; lj_program=new UCL_Program(dev); - lj_program->load_string(lj_string,flags.c_str()); + lj_program->load_string(lj_string,oclstring.c_str(),nullptr,screen); k_sphere_ellipsoid.set_function(*lj_program,s_sphere_ellipsoid.c_str()); k_lj_fast.set_function(*lj_program,s_lj_fast.c_str()); k_lj.set_function(*lj_program,s_lj.c_str()); @@ -489,7 +527,52 @@ void BaseEllipsoidT::compile_kernels(UCL_Device &dev, lj_pos_tex.get_texture(*lj_program,"pos_tex"); lj_quat_tex.get_texture(*lj_program,"quat_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (ellipsoid_program_noev) delete ellipsoid_program_noev; + ellipsoid_program_noev=new UCL_Program(dev); + ellipsoid_program_noev->load_string(ellipsoid_string,oclstring.c_str(), + nullptr,screen); + k_ellipsoid_noev.set_function(*ellipsoid_program_noev,kname); + + if (lj_program_noev) delete lj_program_noev; + lj_program_noev=new UCL_Program(dev); + lj_program_noev->load_string(lj_string,oclstring.c_str(),nullptr,screen); + k_sphere_ellipsoid_noev.set_function(*lj_program_noev, + s_sphere_ellipsoid.c_str()); + k_lj_fast_noev.set_function(*lj_program_noev,s_lj_fast.c_str()); + if (e_s) + k_ellipsoid_sphere_noev.set_function(*lj_program_noev, + s_ellipsoid_sphere.c_str()); + #else + k_elps_sel = &k_ellipsoid; + k_elps_sphere_sel = &k_ellipsoid_sphere; + k_sphere_elps_sel = &k_sphere_ellipsoid; + k_lj_sel = &k_lj_fast; + #endif + _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_lj_fast.max_subgroup_size(_block_size); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid.max_subgroup_size(_block_size)); + if (e_s) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere.max_subgroup_size(_block_size)); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_lj_fast_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_sphere_ellipsoid_noev.max_subgroup_size(_block_size)); + if (e_s) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_ellipsoid_sphere_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseEllipsoid; diff --git a/lib/gpu/lal_base_ellipsoid.h b/lib/gpu/lal_base_ellipsoid.h index dc1e624a2f..f30a0062d2 100644 --- a/lib/gpu/lal_base_ellipsoid.h +++ b/lib/gpu/lal_base_ellipsoid.h @@ -88,10 +88,10 @@ class BaseEllipsoid { ans->resize(nlocal, success); if (_multiple_forms) ans->force.zero(); - if (olist_size>static_cast(host_olist.numel())) { - host_olist.clear(); - int new_size=static_cast(static_cast(olist_size)*1.10); - success=success && 
(host_olist.alloc(new_size,*ucl_device)==UCL_SUCCESS); + if (olist_size>host_olist_size) { + if (host_olist_size) delete []host_olist; + host_olist_size=static_cast(static_cast(olist_size)*1.10); + host_olist = new int[host_olist_size]; } nbor->resize(nlocal,host_inum,max_nbors,success); @@ -116,7 +116,7 @@ class BaseEllipsoid { /// Accumulate timers inline void acc_timers() { if (device->time_device()) { - nbor->acc_timers(); + nbor->acc_timers(screen); time_nbor1.add_to_total(); time_ellipsoid.add_to_total(); if (_multiple_forms) { @@ -223,14 +223,40 @@ class BaseEllipsoid { /// Neighbor data Neighbor *nbor; /// ilist with particles sorted by type - UCL_H_Vec host_olist; + int *host_olist; + int host_olist_size; // ------------------------- DEVICE KERNELS ------------------------- UCL_Program *nbor_program, *ellipsoid_program, *lj_program; + UCL_Program *ellipsoid_program_noev, *lj_program_noev; UCL_Kernel k_nbor_fast, k_nbor; UCL_Kernel k_ellipsoid, k_ellipsoid_sphere, k_sphere_ellipsoid; UCL_Kernel k_lj_fast, k_lj; + UCL_Kernel k_ellipsoid_noev, k_ellipsoid_sphere_noev; + UCL_Kernel k_sphere_ellipsoid_noev, k_lj_fast_noev; + UCL_Kernel *k_elps_sel, *k_elps_sphere_sel, *k_sphere_elps_sel, *k_lj_sel; inline int block_size() { return _block_size; } + inline void set_kernel(const int eflag, const int vflag) { + #if defined(LAL_OCL_EV_JIT) + if (_multiple_forms == false) { + if (eflag || vflag) k_elps_sel = &k_ellipsoid; + else k_elps_sel = &k_ellipsoid_noev; + } else { + if (eflag || vflag) { + k_elps_sel = &k_ellipsoid; + k_elps_sphere_sel = &k_ellipsoid_sphere; + k_sphere_elps_sel = &k_sphere_ellipsoid; + k_lj_sel = &k_lj_fast; + } else { + k_elps_sel = &k_ellipsoid_noev; + k_elps_sphere_sel = &k_ellipsoid_sphere_noev; + k_sphere_elps_sel = &k_sphere_ellipsoid_noev; + k_lj_sel = &k_lj_fast_noev; + } + } + #endif + } + // --------------------------- TEXTURES ----------------------------- UCL_Texture pos_tex, quat_tex, lj_pos_tex, lj_quat_tex, neigh_tex; @@ -240,7 +266,6 @@ class BaseEllipsoid { int _block_size, _threads_per_atom; double _max_bytes, _max_an_bytes; double _gpu_overhead, _driver_overhead; - UCL_D_Vec *_nbor_data; // True if we want to use fast GB-sphere or sphere-sphere calculations bool _multiple_forms; @@ -250,7 +275,7 @@ class BaseEllipsoid { void compile_kernels(UCL_Device &dev, const void *ellipsoid_string, const void *lj_string, const char *kname,const bool e_s); - virtual void loop(const bool _eflag, const bool _vflag) = 0; + virtual int loop(const int eflag, const int vflag) = 0; }; } diff --git a/lib/gpu/lal_base_three.cpp b/lib/gpu/lal_base_three.cpp index cfc138aea2..660385eb56 100644 --- a/lib/gpu/lal_base_three.cpp +++ b/lib/gpu/lal_base_three.cpp @@ -20,7 +20,7 @@ namespace LAMMPS_AL { extern Device global_device; template -BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { +BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0), _onetype(-1) { device=&global_device; ans=new Answer(); nbor=new Neighbor(); @@ -29,6 +29,9 @@ BaseThreeT::BaseThree() : _compiled(false), _max_bytes(0) { #endif pair_program=nullptr; ucl_device=nullptr; + #if defined(LAL_OCL_EV_JIT) + pair_program_noev=nullptr; + #endif } template @@ -44,12 +47,18 @@ BaseThreeT::~BaseThree() { k_pair.clear(); k_short_nbor.clear(); if (pair_program) delete pair_program; + #if defined(LAL_OCL_EV_JIT) + k_three_center_noev.clear(); + k_three_end_noev.clear(); + k_pair_noev.clear(); + if (pair_program_noev) delete pair_program_noev; + #endif } template int 
BaseThreeT::bytes_per_atom_atomic(const int max_nbors) const { int b=device->atom.bytes_per_atom()+ans->bytes_per_atom()+ - nbor->bytes_per_atom(max_nbors); + nbor->bytes_per_atom(max_nbors); #ifdef THREE_CONCURRENT b+=ans2->bytes_per_atom(); #endif @@ -62,7 +71,9 @@ int BaseThreeT::init_three(const int nlocal, const int nall, const double cell_size, const double gpu_split, FILE *_screen, const void *pair_program, const char *two, const char *three_center, - const char *three_end, const char *short_nbor) { + const char *three_end, const char *short_nbor, + const int onetype, const int onetype3, + const int spq, const int tpa_override) { screen=_screen; int gpu_nbor=0; @@ -77,24 +88,16 @@ int BaseThreeT::init_three(const int nlocal, const int nall, if (host_nlocal>0) _gpu_host=1; - _threads_per_atom=device->threads_per_atom(); - if (_threads_per_atom>1 && gpu_nbor==0) { // neigh no and tpa > 1 - nbor->packing(true); - _nbor_data=&(nbor->dev_packed); - } else // neigh yes or tpa == 1 - _nbor_data=&(nbor->dev_nbor); - if (_threads_per_atom*_threads_per_atom>device->warp_size()) - return -10; + // Allow forcing threads per atom to 1 for tersoff due to subg sync issue + if (tpa_override) + _threads_per_atom=tpa_override; + else + _threads_per_atom=device->threads_per_three(); int success=device->init(*ans,false,false,nlocal,nall,maxspecial); if (success!=0) return success; - success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host, - max_nbors,cell_size,false,_threads_per_atom); - if (success!=0) - return success; - if (ucl_device!=device->gpu) _compiled=false; ucl_device=device->gpu; @@ -110,7 +113,19 @@ int BaseThreeT::init_three(const int nlocal, const int nall, _block_pair=device->pair_block_size(); _block_size=device->block_ellipse(); - compile_kernels(*ucl_device,pair_program,two,three_center,three_end,short_nbor); + compile_kernels(*ucl_device,pair_program,two,three_center,three_end, + short_nbor,onetype,onetype3,spq); + + while (_threads_per_atom*_threads_per_atom>device->simd_size()) + _threads_per_atom = _threads_per_atom / 2; + + if (_threads_per_atom*_threads_per_atom>device->simd_size()) + return -10; + + success = device->init_nbor(nbor,nall,host_nlocal,nall,maxspecial, + _gpu_host,max_nbors,cell_size,true,1,true); + if (success!=0) + return success; // Initialize host-device load balancer hd_balancer.init(device,gpu_nbor,gpu_split); @@ -121,22 +136,21 @@ int BaseThreeT::init_three(const int nlocal, const int nall, pos_tex.bind_float(atom->x,4); + int ef_nall=nall; + if (ef_nall==0) + ef_nall=2000; + _max_an_bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT _max_an_bytes+=ans2->gpu_bytes(); #endif - int ef_nall=nall; - if (ef_nall==0) - ef_nall=2000; - dev_short_nbor.alloc(ef_nall*(2+max_nbors),*(this->ucl_device),UCL_READ_WRITE); - return 0; } template -void BaseThreeT::estimate_gpu_overhead() { - device->estimate_gpu_overhead(1,_gpu_overhead,_driver_overhead); +void BaseThreeT::estimate_gpu_overhead(const int add_kernels) { + device->estimate_gpu_overhead(4+add_kernels,_gpu_overhead,_driver_overhead); } template @@ -152,7 +166,6 @@ void BaseThreeT::clear_atomic() { time_pair.clear(); hd_balancer.clear(); - dev_short_nbor.clear(); nbor->clear(); ans->clear(); #ifdef THREE_CONCURRENT @@ -186,6 +199,7 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, // now the requirement is removed, allowing to work within pair hybrid nbor->get_host(nlist,ilist,numj,firstneigh,block_size()); + nbor->copy_unpacked(nlist,mn); 
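// Keep the unpacked neighbor list resident on the device; since this patch
// drops the dedicated dev_short_nbor buffer, the short-range list is
// presumably rebuilt from this unpacked copy inside the pair kernels.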
double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT @@ -201,24 +215,32 @@ int * BaseThreeT::reset_nbors(const int nall, const int inum, const int nlist, // Build neighbor list on device // --------------------------------------------------------------------------- template -inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, - const int nall, double **host_x, - int *host_type, double *sublo, - double *subhi, tagint *tag, - int **nspecial, tagint **special, - bool &success) { +inline void BaseThreeT::build_nbor_list(const int inum, const int host_inum, + const int nall, double **host_x, + int *host_type, double *sublo, + double *subhi, tagint *tag, + int **nspecial, tagint **special, + bool &success) { success=true; resize_atom(inum,nall,success); resize_local(nall,host_inum,nbor->max_nbors(),success); if (!success) - return 0; + return; atom->cast_copy_x(host_x,host_type); _nall = nall; + // Increase the effective sub-domain size for neighbors of ghosts + // This is still inefficient because we are calculating neighbors for more + // ghosts than necessary due to increased ghost cutoff + const double ncut=nbor->cutoff()*2.0; + for (int i=0; i<3; i++) sublo[i]-=ncut; + for (int i=0; i<3; i++) subhi[i]+=ncut; + int mn; - nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, tag, - nspecial, special, success, mn); + nbor->build_nbor_list(host_x, nall, host_inum, nall, *atom, sublo, subhi, + tag, nspecial, special, success, mn, ans->error_flag); + nbor->copy_unpacked(nall,mn); double bytes=ans->gpu_bytes()+nbor->gpu_bytes(); #ifdef THREE_CONCURRENT @@ -226,7 +248,6 @@ inline int BaseThreeT::build_nbor_list(const int inum, const int host_inum, #endif if (bytes>_max_an_bytes) _max_an_bytes=bytes; - return mn; } // --------------------------------------------------------------------------- @@ -236,10 +257,24 @@ template void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, const int nlist, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -260,19 +295,12 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, reset_nbors(nall, inum, nlist, ilist, numj, firstneigh, success); if (!success) return; - _max_nbors = nbor->max_nbor_loop(nlist,numj,ilist); } atom->cast_x_data(host_x,host_type); hd_balancer.start_timer(); atom->add_x_data(host_x,host_type); - // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - dev_short_nbor.resize((2+_max_nbors)*_nmax); - } - // _ainum to be used in loop() for short neighbor list build _ainum = nlist; @@ -282,11 +310,11 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, #ifdef THREE_CONCURRENT ucl_device->sync(); #endif - loop(eflag,vflag,evatom); - ans->copy_answers(eflag,vflag,eatom,vatom,ilist); 
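// loop() now returns the grid size it launched; copy_answers() receives it
// as red_blocks, apparently to bound the reduction over the per-block
// energy/virial partials (see the LAL_NO_BLOCK_REDUCE fallback above).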
+ const int red_blocks=loop(eflag,vflag,evatom,success); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans); #ifdef THREE_CONCURRENT - ans2->copy_answers(eflag,vflag,eatom,vatom,ilist); + ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); device->add_ans_object(ans2); #endif hd_balancer.stop_timer(); @@ -296,15 +324,29 @@ void BaseThreeT::compute(const int f_ago, const int inum_full, const int nall, // Reneighbor on GPU if necessary and then compute forces, virials, energies // --------------------------------------------------------------------------- template -int ** BaseThreeT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success) { +int ** BaseThreeT::compute(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, + const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success) { acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -323,7 +365,7 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, // Build neighbor list on GPU if necessary if (ago==0) { - _max_nbors = build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, + build_nbor_list(inum, inum_full-inum, nall, host_x, host_type, sublo, subhi, tag, nspecial, special, success); if (!success) return nullptr; @@ -336,12 +378,6 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, *ilist=nbor->host_ilist.begin(); *jnum=nbor->host_acc.begin(); - // re-allocate dev_short_nbor if necessary - if (nall*(2+_max_nbors) > dev_short_nbor.cols()) { - int _nmax=static_cast(static_cast(nall)*1.10); - dev_short_nbor.resize((2+_max_nbors)*_nmax); - } - // _ainum to be used in loop() for short neighbor list build _ainum = nall; @@ -351,11 +387,11 @@ int ** BaseThreeT::compute(const int ago, const int inum_full, #ifdef THREE_CONCURRENT ucl_device->sync(); #endif - loop(eflag,vflag,evatom); - ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=loop(eflag,vflag,evatom,success); + ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans); #ifdef THREE_CONCURRENT - ans2->copy_answers(eflag,vflag,eatom,vatom); + ans2->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); device->add_ans_object(ans2); #endif hd_balancer.stop_timer(); @@ -372,14 +408,24 @@ double BaseThreeT::host_memory_usage_atomic() const { template void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, const char *two, const char *three_center, - const char *three_end, const char* short_nbor) { - if (_compiled) + const char *three_end, const char* short_nbor, + const int onetype, const int onetype3, + const int spq) { + if (_compiled && _onetype==onetype && _onetype3==onetype3 && _spq==spq) return; + 
_onetype=onetype; + _onetype3=onetype3; + _spq=spq; + std::string vatom_name=std::string(three_end)+"_vatom"; if (pair_program) delete pair_program; pair_program=new UCL_Program(dev); - pair_program->load_string(pair_str,device->compile_string().c_str()); + std::string oclstring = device->compile_string()+" -DEVFLAG=1"; + if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+ + " -DONETYPE3="+device->toa(_onetype3); + if (_spq) oclstring+=" -DSPQ="+device->toa(_spq); + pair_program->load_string(pair_str,oclstring.c_str(),nullptr,screen); k_three_center.set_function(*pair_program,three_center); k_three_end.set_function(*pair_program,three_end); k_three_end_vatom.set_function(*pair_program,vatom_name.c_str()); @@ -387,12 +433,50 @@ void BaseThreeT::compile_kernels(UCL_Device &dev, const void *pair_str, k_short_nbor.set_function(*pair_program,short_nbor); pos_tex.get_texture(*pair_program,"pos_tex"); + #if defined(LAL_OCL_EV_JIT) + oclstring = device->compile_string()+" -DEVFLAG=0"; + if (_onetype>=0) oclstring+=" -DONETYPE="+device->toa(_onetype)+ + " -DONETYPE3="+device->toa(_onetype3); + if (_spq) oclstring+=" -DSPQ="+device->toa(_spq); + if (pair_program_noev) delete pair_program_noev; + pair_program_noev=new UCL_Program(dev); + pair_program_noev->load_string(pair_str,oclstring.c_str(),nullptr,screen); + k_three_center_noev.set_function(*pair_program_noev,three_center); + k_three_end_noev.set_function(*pair_program_noev,three_end); + k_pair_noev.set_function(*pair_program_noev,two); + #else + k_sel = &k_pair; + k_3center_sel = &k_three_center; + k_3end_sel = &k_three_end; + #endif + #ifdef THREE_CONCURRENT k_three_end.cq(ucl_device->cq(_end_command_queue)); k_three_end_vatom.cq(ucl_device->cq(_end_command_queue)); + #if defined(LAL_OCL_EV_JIT) + k_three_end_noev.cq(ucl_device->cq(_end_command_queue)); + #endif #endif _compiled=true; + + #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)) + if (dev.cl_device_version() >= 210) { + size_t mx_subgroup_sz = k_pair.max_subgroup_size(_block_size); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_vatom.max_subgroup_size(_block_size)); + #if defined(LAL_OCL_EV_JIT) + mx_subgroup_sz = std::min(mx_subgroup_sz, k_pair_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_center_noev.max_subgroup_size(_block_size)); + mx_subgroup_sz = std::min(mx_subgroup_sz, k_three_end_noev.max_subgroup_size(_block_size)); + #endif + if (_threads_per_atom > mx_subgroup_sz) + _threads_per_atom = mx_subgroup_sz; + device->set_simd_size(mx_subgroup_sz); + } + #endif + } template class BaseThree; diff --git a/lib/gpu/lal_base_three.h b/lib/gpu/lal_base_three.h index 36129e6168..3e830d4217 100644 --- a/lib/gpu/lal_base_three.h +++ b/lib/gpu/lal_base_three.h @@ -59,10 +59,12 @@ class BaseThree { const double gpu_split, FILE *screen, const void *pair_program, const char *k_two, const char *k_three_center, const char *k_three_end, - const char *k_short_nbor=nullptr); + const char *k_short_nbor=nullptr, const int onetype=-1, + const int onetype3=-1, const int spq=0, + const int tpa_override=0); /// Estimate the overhead for GPU context changes and CPU driver - void estimate_gpu_overhead(); + void estimate_gpu_overhead(const int add_kernels=0); /// Check if there is enough storage for atom arrays and realloc if not /** 
\param success set to false if insufficient memory **/
@@ -109,7 +111,7 @@ class BaseThree {
  /// Accumulate timers
  inline void acc_timers() {
    if (device->time_device()) {
-      nbor->acc_timers();
+      nbor->acc_timers(screen);
      time_pair.add_to_total();
      atom->acc_timers();
      ans->acc_timers();
@@ -134,9 +136,9 @@ class BaseThree {
                int *numj, int **firstneigh, bool &success);

  /// Build neighbor list on device
-  int build_nbor_list(const int inum, const int host_inum,
-                      const int nall, double **host_x, int *host_type,
-                      double *sublo, double *subhi, tagint *tag, int **nspecial,
+  void build_nbor_list(const int inum, const int host_inum, const int nall,
+                       double **host_x, int *host_type, double *sublo,
+                       double *subhi, tagint *tag, int **nspecial,
                       tagint **special, bool &success);

  /// Pair loop with host neighboring
@@ -147,12 +149,12 @@ class BaseThree {
                int &host_start, const double cpu_time, bool &success);

  /// Pair loop with device neighboring
-  int ** compute(const int ago, const int inum_full,
-                 const int nall, double **host_x, int *host_type, double *sublo,
-                 double *subhi, tagint *tag, int **nspecial,
-                 tagint **special, const bool eflag, const bool vflag,
-                 const bool eatom, const bool vatom, int &host_start,
-                 int **ilist, int **numj, const double cpu_time, bool &success);
+  int ** compute(const int ago, const int inum_full, const int nall,
+                 double **host_x, int *host_type, double *sublo,
+                 double *subhi, tagint *tag, int **nspecial, tagint **special,
+                 const bool eflag, const bool vflag, const bool eatom,
+                 const bool vatom, int &host_start, int **ilist,
+                 int **numj, const double cpu_time, bool &success);

  // -------------------------- DEVICE DATA -------------------------
@@ -188,14 +190,29 @@ class BaseThree {
  /// Neighbor data
  Neighbor *nbor;

-  UCL_D_Vec dev_short_nbor;
  UCL_Kernel k_short_nbor;

  // ------------------------- DEVICE KERNELS -------------------------
-  UCL_Program *pair_program;
+  UCL_Program *pair_program, *pair_program_noev;
  UCL_Kernel k_pair, k_three_center, k_three_end, k_three_end_vatom;
+  UCL_Kernel k_pair_noev, k_three_center_noev, k_three_end_noev;
+  UCL_Kernel *k_sel, *k_3center_sel, *k_3end_sel;
  inline int block_pair() { return _block_pair; }
  inline int block_size() { return _block_size; }
+  inline void set_kernel(const int eflag, const int vflag) {
+    #if defined(LAL_OCL_EV_JIT)
+    if (eflag || vflag) {
+      k_sel = &k_pair;
+      k_3center_sel = &k_three_center;
+      k_3end_sel = &k_three_end;
+    } else {
+      k_sel = &k_pair_noev;
+      k_3center_sel = &k_three_center_noev;
+      k_3end_sel = &k_three_end_noev;
+    }
+    #endif
+  }
+
  // --------------------------- TEXTURES -----------------------------
  UCL_Texture pos_tex;
@@ -203,18 +220,19 @@ class BaseThree {
 protected:
  bool _compiled;
  int _block_pair, _block_size, _threads_per_atom, _end_command_queue;
-  int _gpu_nbor;
+  int _gpu_nbor, _onetype, _onetype3, _spq;
  double _max_bytes, _max_an_bytes;
-  int _max_nbors, _ainum, _nall;
+  int _ainum, _nall;
  double _gpu_overhead, _driver_overhead;
-  UCL_D_Vec *_nbor_data;

  void compile_kernels(UCL_Device &dev, const void *pair_string,
                       const char *two, const char *three_center,
-                      const char *three_end, const char* short_nbor);
+                      const char *three_end, const char* short_nbor,
+                      const int onetype, const int onetype3,
+                      const int spq);

-  virtual void loop(const bool _eflag, const bool _vflag,
-                    const int evatom) = 0;
+  virtual int loop(const int eflag, const int vflag, const int evatom,
+                   bool &success) = 0;
};
}
diff --git a/lib/gpu/lal_beck.cpp b/lib/gpu/lal_beck.cpp
index be1722c32c..57551d9787
100644 --- a/lib/gpu/lal_beck.cpp +++ b/lib/gpu/lal_beck.cpp @@ -113,20 +113,9 @@ double BeckT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BeckT::loop(const bool _eflag, const bool _vflag) { +int BeckT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &beck1, &beck2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &beck1, &beck2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -147,6 +136,7 @@ void BeckT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Beck; diff --git a/lib/gpu/lal_beck.cu b/lib/gpu/lal_beck.cu index f24132b9a2..a2a15e4d21 100644 --- a/lib/gpu/lal_beck.cu +++ b/lib/gpu/lal_beck.cu @@ -39,22 +39,25 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp term6 = pow(term1,(numtyp)-3); numtyp term1inv = ucl_recip(term1); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -116,9 +119,9 @@ __kernel void k_beck(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_beck_fast(const __global numtyp4 *restrict x_, @@ -137,6 +140,9 @@ __kernel void k_beck_fast(const __global numtyp4 *restrict x_, __local numtyp4 beck1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 beck2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp term6 = pow(term1,(numtyp)-3); numtyp term1inv = ucl_recip(term1); numtyp e = beck2[mtype].x*ucl_exp((numtyp)-1.0*r*term4); e -= beck2[mtype].y*term6*((numtyp)1.0+((numtyp)2.709+(numtyp)3.0*aaij*aaij)*term1inv); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -218,8 +224,8 @@ 
__kernel void k_beck_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_beck.h b/lib/gpu/lal_beck.h index 638f1bf626..c6413ed766 100644 --- a/lib/gpu/lal_beck.h +++ b/lib/gpu/lal_beck.h @@ -72,7 +72,7 @@ class Beck : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_beck_ext.cpp b/lib/gpu/lal_beck_ext.cpp index dcba4e4f40..ab65237e27 100644 --- a/lib/gpu/lal_beck_ext.cpp +++ b/lib/gpu/lal_beck_ext.cpp @@ -55,7 +55,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, int init_ok=0; if (world_me==0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, - AA, BB, special_lj, inum, nall, 300, + AA, BB, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **aa, } if (gpu_rank==i && world_me!=0) init_ok=BLMF.init(ntypes, cutsq, aa, alpha, beta, AA, BB, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_born.cpp b/lib/gpu/lal_born.cpp index 4a6b789687..c4796b3450 100644 --- a/lib/gpu/lal_born.cpp +++ b/lib/gpu/lal_born.cpp @@ -138,20 +138,9 @@ double BornT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornT::loop(const bool _eflag, const bool _vflag) { +int BornT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -159,8 +148,8 @@ void BornT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1,&coeff2, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1,&coeff2, &cutsq_sigma, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), @@ -176,6 +165,7 @@ void BornT::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Born; diff --git a/lib/gpu/lal_born.cu b/lib/gpu/lal_born.cu index f9fea6d618..825175af8f 100644 --- a/lib/gpu/lal_born.cu +++ b/lib/gpu/lal_born.cu @@ -40,22 +40,25 @@ __kernel void k_born(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; 
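// EVFLAG is a compile-time constant (-DEVFLAG=0/1 in the JIT build string),
// so the no-EV program variant compiles these accumulation branches away
// entirely; at run time eflag/vflag are 0 (off), 1 (global), or 2 (per-atom).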
energy+=factor_lj*(e-coeff2[mtype].w); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -108,9 +111,9 @@ __kernel void k_born(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_born_fast(const __global numtyp4 *restrict x_, @@ -130,27 +133,30 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv + coeff2[mtype].z*r2inv*r6inv; energy+=factor_lj*(e-coeff2[mtype].w); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -203,8 +209,8 @@ __kernel void k_born_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_born.h b/lib/gpu/lal_born.h index 2a7f355d69..3f5277b682 100644 --- a/lib/gpu/lal_born.h +++ b/lib/gpu/lal_born.h @@ -82,7 +82,7 @@ class Born : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_long.cpp b/lib/gpu/lal_born_coul_long.cpp index 1b147395f6..8c7084f4a4 100644 --- a/lib/gpu/lal_born_coul_long.cpp +++ b/lib/gpu/lal_born_coul_long.cpp @@ -129,20 +129,9 @@ double BornCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { +int BornCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -150,8 +139,8 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, @@ -170,6 +159,7 @@ void BornCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BornCoulLong; diff --git 
a/lib/gpu/lal_born_coul_long.cu b/lib/gpu/lal_born_coul_long.cu index 14e644b45a..d38a101c30 100644 --- a/lib/gpu/lal_born_coul_long.cu +++ b/lib/gpu/lal_born_coul_long.cu @@ -48,6 +48,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cutsq_sigma[mtype].y) { @@ -133,7 +136,7 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -144,9 +147,9 @@ __kernel void k_born_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, @@ -169,28 +172,31 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cutsq_sigma[mtype].y) { @@ -255,7 +261,7 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -266,8 +272,8 @@ __kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_long.h b/lib/gpu/lal_born_coul_long.h index e383d18e0c..a33b8f436a 100644 --- a/lib/gpu/lal_born_coul_long.h +++ b/lib/gpu/lal_born_coul_long.h @@ -80,7 +80,7 @@ class BornCoulLong : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_long_cs.cu 
b/lib/gpu/lal_born_coul_long_cs.cu index 6f04fcea94..077ec2f74f 100644 --- a/lib/gpu/lal_born_coul_long_cs.cu +++ b/lib/gpu/lal_born_coul_long_cs.cu @@ -63,6 +63,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -72,18 +75,18 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -167,7 +170,7 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -178,9 +181,9 @@ __kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, @@ -203,28 +206,31 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -308,7 +314,7 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -319,8 +325,8 @@ __kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_long_cs_ext.cpp b/lib/gpu/lal_born_coul_long_cs_ext.cpp index badc8b0808..fc6b89692f 100644 --- a/lib/gpu/lal_born_coul_long_cs_ext.cpp +++ b/lib/gpu/lal_born_coul_long_cs_ext.cpp @@ -60,7 +60,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, 
host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -80,7 +80,7 @@ int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_born_coul_long_ext.cpp b/lib/gpu/lal_born_coul_long_ext.cpp index d0825529b1..9d17f2fa7d 100644 --- a/lib/gpu/lal_born_coul_long_ext.cpp +++ b/lib/gpu/lal_born_coul_long_ext.cpp @@ -60,7 +60,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -80,7 +80,7 @@ int borncl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNCLMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, offset, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_born_coul_wolf.cpp b/lib/gpu/lal_born_coul_wolf.cpp index 1624dd9d50..e6caebbab8 100644 --- a/lib/gpu/lal_born_coul_wolf.cpp +++ b/lib/gpu/lal_born_coul_wolf.cpp @@ -131,20 +131,9 @@ double BornCoulWolfT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { +int BornCoulWolfT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -171,6 +160,7 @@ void BornCoulWolfT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BornCoulWolf; diff --git a/lib/gpu/lal_born_coul_wolf.cu b/lib/gpu/lal_born_coul_wolf.cu index 0eeda48ec0..aefcac8127 100644 --- a/lib/gpu/lal_born_coul_wolf.cu +++ b/lib/gpu/lal_born_coul_wolf.cu @@ -51,6 +51,9 @@ __kernel void 
k_born_coul_wolf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -60,18 +63,18 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -137,7 +140,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -149,7 +152,7 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -160,9 +163,9 @@ __kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, @@ -186,28 +189,31 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -273,7 +279,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -285,7 +291,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -296,8 +302,7 @@ __kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git 
a/lib/gpu/lal_born_coul_wolf.h b/lib/gpu/lal_born_coul_wolf.h index fa53f48939..0aad07dfa5 100644 --- a/lib/gpu/lal_born_coul_wolf.h +++ b/lib/gpu/lal_born_coul_wolf.h @@ -81,7 +81,7 @@ class BornCoulWolf : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_born_coul_wolf_cs.cu b/lib/gpu/lal_born_coul_wolf_cs.cu index b957b8be69..866d256f33 100644 --- a/lib/gpu/lal_born_coul_wolf_cs.cu +++ b/lib/gpu/lal_born_coul_wolf_cs.cu @@ -52,6 +52,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -61,18 +64,18 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -139,7 +142,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { acctyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -151,7 +154,7 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -162,9 +165,9 @@ __kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, @@ -188,28 +191,31 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -276,7 +282,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { acctyp e=v_sh; if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; @@ -288,7 
+294,7 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].w); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -299,8 +305,8 @@ __kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp index e2211644af..ae162a7c52 100644 --- a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp @@ -60,7 +60,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); @@ -81,7 +81,7 @@ int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); diff --git a/lib/gpu/lal_born_coul_wolf_ext.cpp b/lib/gpu/lal_born_coul_wolf_ext.cpp index d664f30212..bc38db1b9c 100644 --- a/lib/gpu/lal_born_coul_wolf_ext.cpp +++ b/lib/gpu/lal_born_coul_wolf_ext.cpp @@ -60,7 +60,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); @@ -81,7 +81,7 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNCWMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, alf, e_shift, f_shift); diff --git a/lib/gpu/lal_born_ext.cpp b/lib/gpu/lal_born_ext.cpp index 63991889d9..2321a1264d 100644 --- a/lib/gpu/lal_born_ext.cpp +++ b/lib/gpu/lal_born_ext.cpp @@ -58,7 +58,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (world_me==0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BORNMF.device->world_barrier(); @@ -77,7 +77,7 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, if (gpu_rank==i && world_me!=0) init_ok=BORNMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2, host_born3, host_a, host_c, 
host_d, sigma, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BORNMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_buck.cpp b/lib/gpu/lal_buck.cpp index 5a335a1e51..01411775e1 100644 --- a/lib/gpu/lal_buck.cpp +++ b/lib/gpu/lal_buck.cpp @@ -130,20 +130,9 @@ double BuckT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckT::loop(const bool _eflag, const bool _vflag) { +int BuckT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,8 +140,8 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -165,6 +154,7 @@ void BuckT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Buck; diff --git a/lib/gpu/lal_buck.cu b/lib/gpu/lal_buck.cu index 0f9044cefc..958c7bdd4d 100644 --- a/lib/gpu/lal_buck.cu +++ b/lib/gpu/lal_buck.cu @@ -39,22 +39,25 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_buck(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_buck_fast(const __global numtyp4 *restrict x_, @@ -127,27 +130,30 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } 
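// The EVFLAG-guarded accumulator setup above is per-thread, but the
// __syncthreads() below must be reached before the neighbor loop so the
// shared coeff1/coeff2 tables written by the first threads of the block
// are visible to all of them. EVFLAG itself is a compile-time constant;
// the idiom in miniature (hypothetical names, not the library's source):
//
//   template <int EVFLAG>
//   __global__ void pair_sketch(int eflag, int vflag /*, ... */) {
//     float energy, virial[6];
//     if (EVFLAG) { energy = 0.f; for (int k = 0; k < 6; k++) virial[k] = 0.f; }
//     // force loop: every (EVFLAG && eflag) / (EVFLAG && vflag) branch
//     // is eliminated entirely in the EVFLAG==0 instantiation, saving
//     // registers and work in pure-force timesteps.
//   }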
__syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -199,8 +205,8 @@ __kernel void k_buck_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_buck.h b/lib/gpu/lal_buck.h index 7a09fae5dd..5755dea230 100644 --- a/lib/gpu/lal_buck.h +++ b/lib/gpu/lal_buck.h @@ -77,7 +77,7 @@ class Buck : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul.cpp b/lib/gpu/lal_buck_coul.cpp index 25607eae17..c3c70e6d4d 100644 --- a/lib/gpu/lal_buck_coul.cpp +++ b/lib/gpu/lal_buck_coul.cpp @@ -122,20 +122,9 @@ double BuckCoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckCoulT::loop(const bool _eflag, const bool _vflag) { +int BuckCoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -143,8 +132,8 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -158,6 +147,7 @@ void BuckCoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BuckCoul; diff --git a/lib/gpu/lal_buck_coul.cu b/lib/gpu/lal_buck_coul.cu index 163c8e4362..2aaa9c9b3d 100644 --- a/lib/gpu/lal_buck_coul.cu +++ b/lib/gpu/lal_buck_coul.cu @@ -47,6 +47,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += 
delz*delz*force; @@ -137,9 +140,9 @@ __kernel void k_buck_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, @@ -162,29 +165,32 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < cutsq[mtype].y) { numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv; energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -254,8 +260,8 @@ __kernel void k_buck_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_buck_coul.h b/lib/gpu/lal_buck_coul.h index eebba78eb0..bd2afcf9d8 100644 --- a/lib/gpu/lal_buck_coul.h +++ b/lib/gpu/lal_buck_coul.h @@ -78,7 +78,7 @@ class BuckCoul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul_ext.cpp b/lib/gpu/lal_buck_coul_ext.cpp index 2a089e2040..9cf8f9b00e 100644 --- a/lib/gpu/lal_buck_coul_ext.cpp +++ b/lib/gpu/lal_buck_coul_ext.cpp @@ -58,7 +58,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -78,7 +78,7 @@ int buckc_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKCMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_buck_coul_long.cpp b/lib/gpu/lal_buck_coul_long.cpp index 1c0288c2d8..60205a2ad6 100644 --- a/lib/gpu/lal_buck_coul_long.cpp +++ b/lib/gpu/lal_buck_coul_long.cpp @@ -126,20 +126,9 @@ double BuckCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { 
+int BuckCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +136,8 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -163,6 +152,7 @@ void BuckCoulLongT::loop(const bool _eflag, const bool _vflag) { &_cut_coulsq, &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class BuckCoulLong; diff --git a/lib/gpu/lal_buck_coul_long.cu b/lib/gpu/lal_buck_coul_long.cu index b1bbf67bc2..f5ce3a7d11 100644 --- a/lib/gpu/lal_buck_coul_long.cu +++ b/lib/gpu/lal_buck_coul_long.cu @@ -48,6 +48,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { @@ -134,7 +137,7 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -145,9 +148,9 @@ __kernel void k_buck_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, @@ -171,28 +174,31 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) coeff2[tid]=coeff2_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) 
e_coul += prefactor*(_erfc-factor_coul); if (rsq < coeff1[mtype].w) { @@ -258,7 +264,7 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-coeff2[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -269,8 +275,8 @@ __kernel void k_buck_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_buck_coul_long.h b/lib/gpu/lal_buck_coul_long.h index e2d69475cf..fa978a70be 100644 --- a/lib/gpu/lal_buck_coul_long.h +++ b/lib/gpu/lal_buck_coul_long.h @@ -78,7 +78,7 @@ class BuckCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_buck_coul_long_ext.cpp b/lib/gpu/lal_buck_coul_long_ext.cpp index c7e1cd1e35..393ccc3feb 100644 --- a/lib/gpu/lal_buck_coul_long_ext.cpp +++ b/lib/gpu/lal_buck_coul_long_ext.cpp @@ -59,7 +59,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -78,7 +78,7 @@ int buckcl_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKCLMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_buck_ext.cpp b/lib/gpu/lal_buck_ext.cpp index cc8b77c0a9..738b33337d 100644 --- a/lib/gpu/lal_buck_ext.cpp +++ b/lib/gpu/lal_buck_ext.cpp @@ -56,7 +56,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, int init_ok=0; if (world_me==0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->world_barrier(); @@ -74,7 +74,7 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, } if (gpu_rank==i && world_me!=0) init_ok=BUCKMF.init(ntypes, cutsq, host_rhoinv, host_buck1, host_buck2, - host_a, host_c, offset, special_lj, inum, nall, 300, + host_a, host_c, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); BUCKMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_charmm.cpp b/lib/gpu/lal_charmm.cpp new file mode 100644 index 0000000000..811a431cc7 --- /dev/null +++ b/lib/gpu/lal_charmm.cpp @@ -0,0 +1,166 @@ +/*************************************************************************** + charmm.cpp + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul pair style. 
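  Unlike the charmm/coul/long variant below in this patch, both the LJ
  and the Coulomb terms are smoothly switched to zero between an inner
  and an outer cutoff, so no k-space companion is required; the
  switching-polynomial denominators (denom_lj, denom_coul) are
  precomputed on the host and handed to init().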
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin :
+    email : brownw@ornl.gov
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "charmm_cl.h"
+#elif defined(USE_CUDART)
+const char *charmm=0;
+#else
+#include "charmm_cubin.h"
+#endif
+
+#include "lal_charmm.h"
+#include <cassert>
+namespace LAMMPS_AL {
+#define CHARMMT CHARMM<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+CHARMMT::CHARMM() : BaseCharge<numtyp,acctyp>(),
+                    _allocated(false) {
+}
+
+template <class numtyp, class acctyp>
+CHARMMT::~CHARMM() {
+  clear();
+}
+
+template <class numtyp, class acctyp>
+int CHARMMT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+template <class numtyp, class acctyp>
+int CHARMMT::init(const int ntypes, double host_cut_bothsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double *host_special_lj, const int nlocal, const int nall,
+                  const int max_nbors, const int maxspecial,
+                  const double cell_size, const double gpu_split,
+                  FILE *_screen, double host_cut_ljsq,
+                  const double host_cut_coulsq, double *host_special_coul,
+                  const double qqrd2e, const double cut_lj_innersq,
+                  const double cut_coul_innersq, const double denom_lj,
+                  const double denom_coul, double **epsilon,
+                  double **sigma, const bool mix_arithmetic) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,charmm,"k_charmm");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_bio_shared_types=this->device->max_bio_shared_types();
+  if (this->_block_bio_size>=64 && mix_arithmetic &&
+      lj_types<=max_bio_shared_types)
+    shared_types=true;
+  _lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  int h_size=lj_types*lj_types;
+  if (h_size<max_bio_shared_types)
+    h_size=max_bio_shared_types;
+  UCL_H_Vec<numtyp> host_write(h_size*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int i=0; i<h_size*32; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,
+                         host_lj3,host_lj4);
+
+  if (shared_types) {
+    ljd.alloc(max_bio_shared_types,*(this->ucl_device),UCL_READ_ONLY);
+    this->atom->self_pack2(ntypes,ljd,host_write,epsilon,sigma);
+  }
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_bothsq = host_cut_bothsq;
+  _cut_coulsq = host_cut_coulsq;
+  _cut_ljsq = host_cut_ljsq;
+  _cut_lj_innersq = cut_lj_innersq;
+  _cut_coul_innersq = cut_coul_innersq;
+  _qqrd2e=qqrd2e;
+  _denom_lj=denom_lj;
+  _denom_coul=denom_coul;
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+ljd.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void CHARMMT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  lj1.clear();
+  ljd.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+template <class numtyp, class acctyp>
+double CHARMMT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(CHARMM<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int CHARMMT::loop(const int eflag, const int vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int
BX=this->_block_bio_size; + int GX=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + + int ainum=this->ans->inum(); + int nbor_pitch=this->nbor->nbor_pitch(); + this->time_pair.start(); + if (shared_types) { + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, this->_nbor_data, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_coul_innersq, &this->_threads_per_atom); + } else { + this->k_pair.set_size(GX,BX); + this->k_pair.run(&this->atom->x, &ljd, &sp_lj, + &this->nbor->dev_nbor, this->_nbor_data, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, &this->atom->q, + &_cut_coulsq, &_qqrd2e, &_denom_lj, &_denom_coul, + &_cut_bothsq, &_cut_ljsq, &_cut_lj_innersq, + &_cut_coul_innersq, &this->_threads_per_atom); + } + this->time_pair.stop(); + return GX; +} + +template class CHARMM; +} diff --git a/lib/gpu/lal_charmm.cu b/lib/gpu/lal_charmm.cu new file mode 100644 index 0000000000..42fb810796 --- /dev/null +++ b/lib/gpu/lal_charmm.cu @@ -0,0 +1,303 @@ +// ************************************************************************** +// charmm.cu +// ------------------- +// W. Michael Brown (ORNL) +// +// Device code for acceleration of the charmm/coul pair style +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +#ifdef NV_KERNEL + +#include "lal_aux_fun1.h" +#ifndef _DOUBLE_DOUBLE +texture pos_tex; +texture q_tex; +#else +texture pos_tex; +texture q_tex; +#endif + +#else +#define pos_tex x_ +#define q_tex q_ +#endif + +__kernel void k_charmm(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict ljd, + const __global numtyp *restrict sp_lj, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp denom_lj, + const numtyp denom_coul, + const numtyp cut_bothsq, + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const numtyp cut_coul_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + int n_stride; + local_allocate_store_bio(); + + acctyp4 f; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } + + if (ii cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)* + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)* + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp rinv = ucl_rsqrt(rsq); + fetch(forcecoul,j,q_tex); + forcecoul *= factor_coul * qqrd2e * qtmp * rinv; + if (rsq > cut_coul_innersq) { + numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 
(numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) * + denom_coul; + forcecoul *= switch3; + } + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + e_coul += forcecoul; + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} + +__kernel void k_charmm_fast(const __global numtyp4 *restrict x_, + const __global numtyp2 *restrict ljd_in, + const __global numtyp *restrict sp_lj_in, + const __global int *dev_nbor, + const __global int *dev_packed, + __global acctyp4 *restrict ans, + __global acctyp *restrict engv, + const int eflag, const int vflag, + const int inum, const int nbor_pitch, + const __global numtyp *restrict q_, + const numtyp cut_coulsq, const numtyp qqrd2e, + const numtyp denom_lj, + const numtyp denom_coul, + const numtyp cut_bothsq, + const numtyp cut_ljsq, + const numtyp cut_lj_innersq, + const numtyp cut_coul_innersq, + const int t_per_atom) { + int tid, ii, offset; + atom_info(t_per_atom,ii,tid,offset); + + __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; + __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_bio(); + + if (tid<8) + sp_lj[tid]=sp_lj_in[tid]; + if (tid cut_lj_innersq) { + switch1 = (cut_ljsq-rsq); + numtyp switch2 = (numtyp)12.0*rsq*switch1*(rsq-cut_lj_innersq)* + denom_lj; + switch1 *= switch1; + switch1 *= (cut_ljsq+(numtyp)2.0*rsq-(numtyp)3.0*cut_lj_innersq)* + denom_lj; + switch2 *= lj3-lj4; + force_lj = force_lj*switch1+switch2; + } + } else + force_lj = (numtyp)0.0; + + if (rsq < cut_coulsq) { + numtyp rinv = ucl_rsqrt(rsq); + fetch(forcecoul,j,q_tex); + forcecoul *= factor_coul * qqrd2e * qtmp * rinv; + if (rsq > cut_coul_innersq) { + numtyp switch3 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + (numtyp)2.0*rsq - (numtyp)3.0*cut_coul_innersq) * + denom_coul; + forcecoul *= switch3; + } + } else + forcecoul = (numtyp)0.0; + + force = (force_lj + forcecoul) * r2inv; + + f.x+=delx*force; + f.y+=dely*force; + f.z+=delz*force; + + if (EVFLAG && eflag) { + e_coul += forcecoul; + if (rsq < cut_ljsq) { + numtyp e=lj3-lj4; + if (rsq > cut_lj_innersq) + e *= switch1; + energy+=factor_lj*e; + } + } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; + } + } + + } // for nbor + } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); +} diff --git a/lib/gpu/lal_charmm.h b/lib/gpu/lal_charmm.h new file mode 100644 index 0000000000..0793d7ca0f --- /dev/null +++ b/lib/gpu/lal_charmm.h @@ -0,0 +1,89 @@ +/*************************************************************************** + charmm.h + ------------------- + W. Michael Brown (ORNL) + + Class for acceleration of the charmm/coul pair style. 
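  The interface follows the other pair classes in this library: init()
  uploads the packed lj1 coefficients plus the epsilon/sigma pairs used
  for geometric/arithmetic mixing, and loop() launches k_charmm_fast when
  the per-type constants fit in shared memory, falling back to k_charmm.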
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin :
+    email : brownw@ornl.gov
+ ***************************************************************************/
+
+#ifndef LAL_CHARMM_
+#define LAL_CHARMM_
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class CHARMM : public BaseCharge<numtyp, acctyp> {
+ public:
+  CHARMM();
+  ~CHARMM();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double host_cut_bothsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double cut_lj_innersq,
+           const double cut_coul_innersq, const double denom_lj,
+           const double denom_coul, double **epsilon, double **sigma,
+           const bool mix_arithmetic);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// x = lj1, y = lj2, z = lj3, w = lj4
+  UCL_D_Vec<numtyp4> lj1;
+  /// x = epsilon, y = sigma
+  UCL_D_Vec<numtyp2> ljd;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  numtyp _qqrd2e, _denom_lj, _denom_coul;
+
+  numtyp _cut_coulsq, _cut_bothsq, _cut_ljsq, _cut_lj_innersq;
+  numtyp _cut_coul_innersq;
+
+ private:
+  bool _allocated;
+  int loop(const int eflag, const int vflag);
+};
+
+}
+
+#endif
diff --git a/lib/gpu/lal_charmm_ext.cpp b/lib/gpu/lal_charmm_ext.cpp
new file mode 100644
index 0000000000..bed2f21933
--- /dev/null
+++ b/lib/gpu/lal_charmm_ext.cpp
@@ -0,0 +1,137 @@
+/***************************************************************************
+                                charmm_ext.cpp
+                             -------------------
+                            W. Michael Brown (ORNL)
+
+  Functions for LAMMPS access to charmm/coul/charmm acceleration routines.
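  The crm_gpu_* functions below are the C bindings the GPU package's
  pair style calls: crm_gpu_init() copies constants to the device,
  crm_gpu_compute_n()/crm_gpu_compute() run a step with device- or
  host-built neighbor lists respectively, and crm_gpu_clear() and
  crm_gpu_bytes() handle teardown and memory reporting.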
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin :
+    email : brownw@ornl.gov
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_charmm.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static CHARMM<PRECISION,ACC_PRECISION> CRMMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double *special_lj, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen,
+                 double host_cut_ljsq, double host_cut_coulsq,
+                 double *host_special_coul, const double qqrd2e,
+                 const double cut_lj_innersq, const double cut_coul_innersq,
+                 const double denom_lj, const double denom_coul,
+                 double **epsilon, double **sigma,
+                 const bool mix_arithmetic) {
+  CRMMF.clear();
+  gpu_mode=CRMMF.device->gpu_mode();
+  double gpu_split=CRMMF.device->particle_split();
+  int first_gpu=CRMMF.device->first_device();
+  int last_gpu=CRMMF.device->last_device();
+  int world_me=CRMMF.device->world_me();
+  int gpu_rank=CRMMF.device->gpu_rank();
+  int procs_per_gpu=CRMMF.device->procs_per_gpu();
+
+  CRMMF.device->init_message(screen,"lj/charmm/coul/charmm",first_gpu,
+                             last_gpu);
+
+  bool message=false;
+  if (CRMMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                       host_lj4, special_lj, inum, nall, max_nbors,
+                       maxspecial, cell_size, gpu_split, screen,
+                       host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                       qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
+                       denom_coul, epsilon, sigma, mix_arithmetic);
+
+  CRMMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=CRMMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3,
+                         host_lj4, special_lj, inum, nall, max_nbors,
+                         maxspecial, cell_size, gpu_split, screen,
+                         host_cut_ljsq, host_cut_coulsq, host_special_coul,
+                         qqrd2e, cut_lj_innersq, cut_coul_innersq, denom_lj,
+                         denom_coul, epsilon, sigma, mix_arithmetic);
+    CRMMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    CRMMF.estimate_gpu_overhead();
+
+  return init_ok;
+}
+
+void crm_gpu_clear() {
+  CRMMF.clear();
+}
+
+int** crm_gpu_compute_n(const int ago, const int inum_full, const int nall,
+                        double **host_x, int *host_type, double *sublo,
+                        double *subhi, tagint *tag, int **nspecial,
+                        tagint **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success, double *host_q, double *boxlo,
+                        double *prd) {
+  return CRMMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success,
+                       host_q, boxlo, prd);
+}
+
+void crm_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double *host_q,
+                     const int nlocal, double *boxlo, double *prd) {
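  // Host-built neighbor list path: LAMMPS supplies ilist/numj/firstneigh
  // here, unlike crm_gpu_compute_n() above, where the list is built on
  // the device from positions alone.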
CRMMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh, + eflag,vflag,eatom,vatom,host_start,cpu_time,success,host_q, + nlocal,boxlo,prd); +} + +double crm_gpu_bytes() { + return CRMMF.host_memory_usage(); +} + + diff --git a/lib/gpu/lal_charmm_long.cpp b/lib/gpu/lal_charmm_long.cpp index a78996a7d5..8008b1fbb3 100644 --- a/lib/gpu/lal_charmm_long.cpp +++ b/lib/gpu/lal_charmm_long.cpp @@ -131,20 +131,9 @@ double CHARMMLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { +int CHARMMLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->_block_bio_size; - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &ljd, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &ljd, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -171,6 +160,7 @@ void CHARMMLongT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CHARMMLong; diff --git a/lib/gpu/lal_charmm_long.cu b/lib/gpu/lal_charmm_long.cu index 4e9802f368..77793d0e83 100644 --- a/lib/gpu/lal_charmm_long.cu +++ b/lib/gpu/lal_charmm_long.cu @@ -47,18 +47,21 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; + int n_stride; + local_allocate_store_bio(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { @@ -132,7 +135,7 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -143,9 +146,9 @@ __kernel void k_charmm_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, @@ -168,6 +171,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, __local numtyp2 ljd[MAX_BIO_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_bio(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < cut_ljsq) { @@ -268,7 +274,7 @@ __kernel void k_charmm_long_fast(const __global 
numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -277,10 +283,9 @@ __kernel void k_charmm_long_fast(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } - } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_charmm_long.h b/lib/gpu/lal_charmm_long.h index 5d9d9ea50b..69f1a0734a 100644 --- a/lib/gpu/lal_charmm_long.h +++ b/lib/gpu/lal_charmm_long.h @@ -79,7 +79,7 @@ class CHARMMLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_charmm_long_ext.cpp b/lib/gpu/lal_charmm_long_ext.cpp index 743b510825..13565f5682 100644 --- a/lib/gpu/lal_charmm_long_ext.cpp +++ b/lib/gpu/lal_charmm_long_ext.cpp @@ -60,7 +60,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, int init_ok=0; if (world_me==0) CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, cell_size, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon,sigma,mix_arithmetic); @@ -80,7 +80,7 @@ int crml_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=CRMLMF.init(ntypes, cut_bothsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald, cut_lj_innersq, denom_lj, epsilon, diff --git a/lib/gpu/lal_colloid.cpp b/lib/gpu/lal_colloid.cpp index c441d50968..fec7a3ad5f 100644 --- a/lib/gpu/lal_colloid.cpp +++ b/lib/gpu/lal_colloid.cpp @@ -140,20 +140,9 @@ double ColloidT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void ColloidT::loop(const bool _eflag, const bool _vflag) { +int ColloidT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &colloid1, &colloid2, &form, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -176,6 +165,7 @@ void ColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Colloid; diff --git a/lib/gpu/lal_colloid.cu b/lib/gpu/lal_colloid.cu index 4983142aa0..8a20f0c400 100644 --- 
a/lib/gpu/lal_colloid.cu +++ b/lib/gpu/lal_colloid.cu @@ -42,22 +42,25 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=(numtyp)0.0; if (form[mtype]==0) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -160,7 +163,7 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -171,9 +174,9 @@ __kernel void k_colloid(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, @@ -198,6 +201,9 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, __local numtyp4 colloid2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local int form[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=(numtyp)0.0; if (form[mtype]==0) { e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -325,7 +331,7 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, } energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -336,8 +342,8 @@ __kernel void k_colloid_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_colloid.h b/lib/gpu/lal_colloid.h index 35426007d8..43f14cd354 100644 --- a/lib/gpu/lal_colloid.h +++ b/lib/gpu/lal_colloid.h @@ -81,7 +81,7 @@ class Colloid : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_colloid_ext.cpp b/lib/gpu/lal_colloid_ext.cpp index 961ad75925..dcfd1a6d34 100644 --- a/lib/gpu/lal_colloid_ext.cpp +++ b/lib/gpu/lal_colloid_ext.cpp @@ -60,7 +60,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, - host_sigma6, host_form, inum, nall, 300, + host_sigma6, host_form, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); 
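
The kernels above all move to the same compile-time pattern: an EVFLAG preprocessor constant guards every energy/virial accumulator, so a kernel built with EVFLAG set to 0 drops the accumulation code entirely instead of testing eflag/vflag per pair, and loop() now forwards eflag/vflag straight through as ints rather than converting from bool in each wrapper. A minimal standalone sketch of the idiom (hypothetical function name; the real kernels receive eflag/vflag as kernel arguments exactly as shown above):

    #define EVFLAG 1   // set per kernel build; 0 compiles the force-only path
    void pair_accumulate(float force, float delx, float dely, float delz,
                         int eflag, int vflag, float e_pair,
                         float &energy, float (&virial)[6]) {
      if (EVFLAG && eflag)
        energy += e_pair;            // dead code when EVFLAG == 0
      if (EVFLAG && vflag) {         // same six terms as in the kernels above
        virial[0] += delx*delx*force;
        virial[1] += dely*dely*force;
        virial[2] += delz*delz*force;
        virial[3] += delx*dely*force;
        virial[4] += delx*delz*force;
        virial[5] += dely*delz*force;
      }
    }

Note also that the store_answers* calls move outside the "if (ii" guard in each kernel, so every work-item reaches the final reduction rather than only those assigned an atom.
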
COLLMF.device->world_barrier(); @@ -80,7 +80,7 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, init_ok=COLLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, offset, special_lj, host_a12, host_a1, host_a2, host_d1, host_d2, host_sigma3, host_sigma6, host_form, - inum, nall, 300, maxspecial, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); COLLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul.cpp b/lib/gpu/lal_coul.cpp index 3e29215c91..df9eeae667 100644 --- a/lib/gpu/lal_coul.cpp +++ b/lib/gpu/lal_coul.cpp @@ -125,20 +125,9 @@ double CoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulT::loop(const bool _eflag, const bool _vflag) { +int CoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void CoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Coul; diff --git a/lib/gpu/lal_coul.cu b/lib/gpu/lal_coul.cu index 03fc568c77..c4da81a3a2 100644 --- a/lib/gpu/lal_coul.cu +++ b/lib/gpu/lal_coul.cu @@ -46,22 +46,25 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[8]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -112,9 +115,9 @@ __kernel void k_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_fast(const __global numtyp4 *restrict x_, @@ -134,25 +137,28 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul += forcecoul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += 
dely*dely*force; virial[2] += delz*delz*force; @@ -203,8 +209,8 @@ __kernel void k_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_coul.h b/lib/gpu/lal_coul.h index 38472375fb..7298536dea 100644 --- a/lib/gpu/lal_coul.h +++ b/lib/gpu/lal_coul.h @@ -75,7 +75,7 @@ class Coul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_debye.cpp b/lib/gpu/lal_coul_debye.cpp index 08ceb99300..1107708ca8 100644 --- a/lib/gpu/lal_coul_debye.cpp +++ b/lib/gpu/lal_coul_debye.cpp @@ -126,20 +126,9 @@ double CoulDebyeT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { +int CoulDebyeT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -147,8 +136,8 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, @@ -162,6 +151,7 @@ void CoulDebyeT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CoulDebye; diff --git a/lib/gpu/lal_coul_debye.cu b/lib/gpu/lal_coul_debye.cu index e7f0b97e23..ba922f04a6 100644 --- a/lib/gpu/lal_coul_debye.cu +++ b/lib/gpu/lal_coul_debye.cu @@ -47,22 +47,25 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -116,9 +119,9 @@ __kernel void k_coul_debye(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, @@ -140,6 +143,9 @@ 
__kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul+=qqrd2e*scale[mtype]*qtmp*rinv*screening*factor_coul; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -213,8 +219,7 @@ __kernel void k_coul_debye_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_debye.h b/lib/gpu/lal_coul_debye.h index 13e4c5b0c6..9054df1995 100644 --- a/lib/gpu/lal_coul_debye.h +++ b/lib/gpu/lal_coul_debye.h @@ -76,7 +76,7 @@ class CoulDebye : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_debye_ext.cpp b/lib/gpu/lal_coul_debye_ext.cpp index af54746def..516dca5df8 100644 --- a/lib/gpu/lal_coul_debye_ext.cpp +++ b/lib/gpu/lal_coul_debye_ext.cpp @@ -54,7 +54,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, int init_ok=0; if (world_me==0) - init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, + init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->world_barrier(); @@ -71,7 +71,7 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, 300, + init_ok=CDEMF.init(ntypes, host_scale, cutsq, host_special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e, kappa); CDEMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul_dsf.cpp b/lib/gpu/lal_coul_dsf.cpp index fe1fbfede7..1a56e84b52 100644 --- a/lib/gpu/lal_coul_dsf.cpp +++ b/lib/gpu/lal_coul_dsf.cpp @@ -110,20 +110,9 @@ double CoulDSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulDSFT::loop(const bool _eflag, const bool _vflag) { +int CoulDSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -131,8 +120,8 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -148,6 +137,7 @@ void CoulDSFT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + 
return GX; } template class CoulDSF; diff --git a/lib/gpu/lal_coul_dsf.cu b/lib/gpu/lal_coul_dsf.cu index 190fb5b7fd..5241cb5097 100644 --- a/lib/gpu/lal_coul_dsf.cu +++ b/lib/gpu/lal_coul_dsf.cu @@ -48,30 +48,33 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -111,11 +114,11 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -126,9 +129,9 @@ __kernel void k_coul_dsf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, @@ -147,30 +150,33 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -210,11 +216,11 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -225,8 +231,7 @@ __kernel void k_coul_dsf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_dsf.h b/lib/gpu/lal_coul_dsf.h index 3d57898f81..a33e98f836 100644 --- a/lib/gpu/lal_coul_dsf.h +++ b/lib/gpu/lal_coul_dsf.h @@ -70,7 +70,7 @@ class CoulDSF : public BaseCharge { private: bool _allocated; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_dsf_ext.cpp 
b/lib/gpu/lal_coul_dsf_ext.cpp index 2d18f9f94d..e21c70ae4b 100644 --- a/lib/gpu/lal_coul_dsf_ext.cpp +++ b/lib/gpu/lal_coul_dsf_ext.cpp @@ -55,7 +55,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); @@ -73,7 +73,7 @@ int cdsf_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CDMF.init(ntypes, inum, nall, 300, maxspecial, cell_size, + init_ok=CDMF.init(ntypes, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); diff --git a/lib/gpu/lal_coul_ext.cpp b/lib/gpu/lal_coul_ext.cpp index 9779526d62..370c186123 100644 --- a/lib/gpu/lal_coul_ext.cpp +++ b/lib/gpu/lal_coul_ext.cpp @@ -54,7 +54,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, 300, + init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->world_barrier(); @@ -71,7 +71,7 @@ int coul_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, 300, + init_ok=COULMF.init(ntypes, host_scale, cutsq, special_coul, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, qqrd2e); COULMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_coul_long.cpp b/lib/gpu/lal_coul_long.cpp index 02097a2c61..36c1cd751f 100644 --- a/lib/gpu/lal_coul_long.cpp +++ b/lib/gpu/lal_coul_long.cpp @@ -116,20 +116,9 @@ double CoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CoulLongT::loop(const bool _eflag, const bool _vflag) { +int CoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -137,8 +126,8 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &scale, &sp_cl, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &scale, &sp_cl, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -153,6 +142,7 @@ void CoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CoulLong; diff --git a/lib/gpu/lal_coul_long.cu b/lib/gpu/lal_coul_long.cu index 7adcdbbabc..f8a33e90a2 100644 --- a/lib/gpu/lal_coul_long.cu +++ b/lib/gpu/lal_coul_long.cu @@ -29,100 +29,6 @@ _texture( q_tex,int2); #define q_tex q_ #endif -#if (ARCH < 300) - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) 
\ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=e_coul; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - e_coul=red_acc[3][tid]; \ - \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#else - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#endif - __kernel void k_coul_long(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -140,22 +46,25 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -211,9 +120,11 @@ __kernel void k_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, @@ -233,24 +144,27 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { 
+ if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -306,8 +220,10 @@ __kernel void k_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_coul_long.h b/lib/gpu/lal_coul_long.h index 0668e0fd02..a89b8e447c 100644 --- a/lib/gpu/lal_coul_long.h +++ b/lib/gpu/lal_coul_long.h @@ -74,7 +74,7 @@ class CoulLong : public BaseCharge { protected: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_coul_long_cs.cu b/lib/gpu/lal_coul_long_cs.cu index 85c9d84bdb..dfbc771adc 100644 --- a/lib/gpu/lal_coul_long_cs.cu +++ b/lib/gpu/lal_coul_long_cs.cu @@ -43,100 +43,6 @@ _texture( q_tex,int2); #define EPS_EWALD (acctyp)(1.0e-6) #define EPS_EWALD_SQR (acctyp)(1.0e-12) -#if (ARCH < 300) - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=e_coul; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - e_coul=red_acc[3][tid]; \ - \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#else - -#define store_answers_lq(f, e_coul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - __global acctyp *ap1=engv+ii; \ - if (eflag>0) { \ - *ap1=(acctyp)0; \ - ap1+=inum; \ - *ap1=e_coul*(acctyp)0.5; \ - ap1+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - *ap1=virial[i]*(acctyp)0.5; \ - ap1+=inum; \ - } \ - } \ - ans[ii]=f; \ - } - -#endif - __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, const __global numtyp *restrict scale, const int lj_types, @@ -154,22 +60,25 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + sp_cl[0]=sp_cl_in[0]; sp_cl[1]=sp_cl_in[1]; sp_cl[2]=sp_cl_in[2]; sp_cl[3]=sp_cl_in[3]; - acctyp 
e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp e_coul, virial[6]; + if (EVFLAG) { + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -245,9 +154,11 @@ __kernel void k_coul_long_cs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, @@ -267,24 +178,27 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_cl[4]; + int n_stride; + local_allocate_store_charge(); + if (tid<4) sp_cl[tid]=sp_cl_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = prefactor*_erfc; if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor; e_coul += e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -360,8 +274,9 @@ __kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + acctyp energy; + if (EVFLAG) energy=(acctyp)0.0; + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_coul_long_cs_ext.cpp b/lib/gpu/lal_coul_long_cs_ext.cpp index ae57eb2038..df92619f2f 100644 --- a/lib/gpu/lal_coul_long_cs_ext.cpp +++ b/lib/gpu/lal_coul_long_cs_ext.cpp @@ -54,7 +54,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -72,7 +72,7 @@ int clcs_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLCSMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_coul_long_ext.cpp b/lib/gpu/lal_coul_long_ext.cpp index 653b4be4f3..1d9dcfdeca 100644 --- a/lib/gpu/lal_coul_long_ext.cpp +++ b/lib/gpu/lal_coul_long_ext.cpp @@ -54,7 +54,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, int init_ok=0; if (world_me==0) - init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -72,7 +72,7 @@ int cl_gpu_init(const int ntypes, double **host_scale, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=CLMF.init(ntypes, host_scale, inum, nall, 300, maxspecial, + init_ok=CLMF.init(ntypes, host_scale, inum, nall, max_nbors, maxspecial, 
cell_size, gpu_split, screen, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_device.cpp b/lib/gpu/lal_device.cpp index 911cdda383..a65c3d8810 100644 --- a/lib/gpu/lal_device.cpp +++ b/lib/gpu/lal_device.cpp @@ -18,12 +18,18 @@ #include #include #include -#ifdef _OPENMP +#if (LAL_USE_OMP == 1) #include #endif #if defined(USE_OPENCL) #include "device_cl.h" + +#ifdef LAL_OCL_EXTRA_ARGS +#define LAL_DM_STRINGIFY(x) #x +#define LAL_PRE_STRINGIFY(x) LAL_DM_STRINGIFY(x) +#endif + #elif defined(USE_CUDART) const char *device=0; #else @@ -45,40 +51,44 @@ DeviceT::~Device() { } template -int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, - const int last_gpu, const int gpu_mode, - const double p_split, const int nthreads, - const int t_per_atom, const double cell_size, - char *ocl_vendor, const int block_pair) { - _nthreads=nthreads; - #ifdef _OPENMP - omp_set_num_threads(nthreads); - #endif +int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, + const double p_split, const int t_per_atom, + const double user_cell_size, char *ocl_args, + const int ocl_platform, char *device_type_flags, + const int block_pair) { _threads_per_atom=t_per_atom; _threads_per_charge=t_per_atom; + _threads_per_three=t_per_atom; if (_device_init) return 0; _device_init=true; _comm_world=replica; //world; _comm_replica=replica; - _first_device=first_gpu; - _last_device=last_gpu; + int ndevices=ngpu; + _first_device=first_gpu_id; _gpu_mode=gpu_mode; _particle_split=p_split; - _cell_size=cell_size; + _user_cell_size=user_cell_size; _block_pair=block_pair; - // support selecting platform though "package device" keyword. - // "0:generic" will select platform 0 and tune for generic device - // "1:fermi" will select platform 1 and tune for Nvidia Fermi gpu - if (ocl_vendor) { - char *sep = nullptr; - if ((sep = strstr(ocl_vendor,":"))) { - *sep = '\0'; - _platform_id = atoi(ocl_vendor); - ocl_vendor = sep+1; - } - } + + // support selecting OpenCL platform id with "package platform" keyword + if (ocl_platform >= 0) + _platform_id = ocl_platform; + + gpu=new UCL_Device(); + + // ---------------------- OpenCL Compiler Args ------------------------- + std::string extra_args=""; + if (ocl_args) extra_args+=":"+std::string(ocl_args); + #ifdef LAL_OCL_EXTRA_ARGS + extra_args+=":" LAL_PRE_STRINGIFY(LAL_OCL_EXTRA_ARGS); + #endif + for (int i=0; i procs_per_node) + ndevices = procs_per_node; + + // --------------------- OCL Platform Selection ----------------------- + + // Setup OpenCL platform and parameters based on platform + // and device type specifications + std::string ocl_vstring=""; + if (device_type_flags != nullptr) ocl_vstring=device_type_flags; + + // Setup the OpenCL platform + // If multiple platforms and no user platform specified, + // try to match platform from config matching any user specified + // device type. Give preference to platforms with GPUs. + // Priority under these conditions to platform with device with + // highest compute unit count. 
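
The comment block above states the policy when no platform id is given: match any user-specified device type, prefer platforms exposing GPUs, and break ties by the largest compute-unit count. A simplified sketch of that priority order (a hypothetical free function; the real selection happens inside gpu->auto_set_platform with the vendor matching shown below):

    #include <vector>

    struct PlatformInfo { bool has_gpu; int max_cus; };

    // Pick the index of the preferred OpenCL platform: GPU platforms
    // first, then the one with the most compute units.
    int pick_platform(const std::vector<PlatformInfo> &p) {
      int best = 0;
      for (int i = 1; i < (int)p.size(); i++) {
        if (p[i].has_gpu != p[best].has_gpu) {
          if (p[i].has_gpu) best = i;            // GPU platforms take priority
        } else if (p[i].max_cus > p[best].max_cus) {
          best = i;                              // then compute-unit count
        }
      }
      return best;
    }
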
+ int pres; + enum UCL_DEVICE_TYPE type=UCL_GPU; + #ifndef USE_OPENCL + pres=gpu->set_platform(0); + #else + if (_platform_id>=0) + pres=gpu->set_platform(_platform_id); + else { + std::string vendor=""; + if (device_type_flags!=nullptr) { + if (ocl_vstring=="intelgpu") + vendor="intel"; + else if (ocl_vstring=="intelcpu") { + vendor="intel"; + type=UCL_CPU; + } else if (ocl_vstring=="nvidiagpu") + vendor="nvidia"; + else if (ocl_vstring=="amdgpu") + vendor="amd"; + else if (ocl_vstring=="applegpu") + vendor="apple"; + } + pres=gpu->auto_set_platform(type,vendor,ndevices,_first_device); + } + #endif + if (pres != UCL_SUCCESS) + return -12; + + // ------------------------ Device Selection --------------------------- + if (_first_device > -1 && _first_device >= gpu->num_devices()) + return -2; + if (ndevices > gpu->num_devices()) + return -2; + if (_first_device + ndevices > gpu->num_devices()) + return -2; + if (gpu->num_devices()==0) + return -2; + + // Fully specified deviceIDs + if (_first_device > -1 && ndevices > 0) + _last_device = _first_device + ndevices - 1; + + // Find deviceID with most CUs (priority given to the accelerator type) + if (_first_device < 0) { + int best_device = 0; + int best_cus = gpu->cus(0); + bool type_match = (gpu->device_type(0) == type); + for (int i = 1; i < gpu->num_devices(); i++) { + if (type_match==true && gpu->device_type(i)!=type) + continue; + if (type_match == false && gpu->device_type(i) == type) { + type_match = true; + best_cus = gpu->cus(i); + best_device = i; + } + if (gpu->cus(i) > best_cus) { + best_cus = gpu->cus(i); + best_device = i; + } + } + _first_device = _last_device = best_device; + type = gpu->device_type(_first_device); + + if (ndevices > 0) { + // Expand range to meet specified number of devices + while (_last_device - _first_device < ndevices - 1) { + if (_last_device + 1 == gpu->num_devices()) + _first_device--; + else if (_first_device == 0) + _last_device++; + else { + if (gpu->device_type(_last_device+1)==type && + gpu->device_type(_first_device-1)!=type) + _last_device++; + else if (gpu->device_type(_last_device+1)!=type && + gpu->device_type(_first_device-1)==type) + _first_device--; + else if (gpu->cus(_last_device+1) > gpu->cus(_first_device-1)) + _last_device++; + else + _first_device--; + } + } + } + } + + // If ngpus not specified, expand range to include matching devices + if (ndevices == 0) { + for (int i = _first_device; i < gpu->num_devices(); i++) { + if (gpu->device_type(i)==gpu->device_type(_first_device) && + gpu->cus(i)==gpu->cus(_first_device)) + _last_device = i; + else + break; + } + ndevices = _last_device - _first_device + 1; + if (ndevices > procs_per_node) { + ndevices = procs_per_node; + _last_device=_first_device + ndevices - 1; + } + } + + // ------------------------ MPI Device ID Setup ----------------------- + // set the device ID _procs_per_gpu=static_cast(ceil(static_cast(procs_per_node)/ - (last_gpu-first_gpu+1))); - int my_gpu=node_rank/_procs_per_gpu+first_gpu; + ndevices)); + int my_gpu=node_rank/_procs_per_gpu+_first_device; // Time on the device only if 1 proc per gpu _time_device=true; @@ -146,27 +278,51 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, MPI_Comm_split(node_comm,my_gpu,0,&_comm_gpu); MPI_Comm_rank(_comm_gpu,&_gpu_rank); - gpu=new UCL_Device(); - if (my_gpu>=gpu->num_devices()) - return -2; - - #ifndef CUDA_PROXY + #if !defined(CUDA_PROXY) && !defined(CUDA_MPS_SUPPORT) if (_procs_per_gpu>1 && gpu->sharing_supported(my_gpu)==false) return -7; 
#endif - if (gpu->set_platform_accelerator(_platform_id)!=UCL_SUCCESS) - return -12; + // --------------- Device Configuration and Setup ------------------------- if (gpu->set(my_gpu)!=UCL_SUCCESS) return -6; - gpu->push_command_queue(); - gpu->set_command_queue(1); + #if !defined(USE_OPENCL) && !defined(USE_HIP) + if (gpu->arch()<7.0) { + gpu->push_command_queue(); + gpu->set_command_queue(1); + } + #endif _long_range_precompute=0; - if (set_ocl_params(ocl_vendor)!=0) + // If OpenCL parameters not specified by user, try to auto detect + // best option from the platform config + #ifdef USE_OPENCL + if (device_type_flags==nullptr) { + std::string pname = gpu->platform_name(); + for (int i=0; i='a') + pname[i]=toupper(pname[i]); + if (pname.find("NVIDIA")!=std::string::npos) + ocl_vstring="nvidiagpu"; + else if (pname.find("INTEL")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="intelgpu"; + else if (gpu->device_type()==UCL_CPU) + ocl_vstring="intelcpu"; + } else if (pname.find("AMD")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="amdgpu"; + } else if (pname.find("APPLE")!=std::string::npos) { + if (gpu->device_type()==UCL_GPU) + ocl_vstring="applegpu"; + } + } + #endif + + if (set_ocl_params(ocl_vstring, extra_args)!=0) return -11; int flag=0; @@ -175,71 +331,90 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu, flag=compile_kernels(); gpu_barrier(); } + + // Setup auto bin size calculation for calls from atom::sort + // - This is repeated in neighbor init with additional info + if (_user_cell_size<0.0) { + #ifndef LAL_USE_OLD_NEIGHBOR + _neighbor_shared.setup_auto_cell_size(true,0,_simd_size); + #else + _neighbor_shared.setup_auto_cell_size(false,0,_simd_size); + #endif + } else + _neighbor_shared.setup_auto_cell_size(false,_user_cell_size,_simd_size); + return flag; } template -int DeviceT::set_ocl_params(char *ocl_vendor) { +int DeviceT::set_ocl_params(std::string s_config, std::string extra_args) { #ifdef USE_OPENCL - std::string s_vendor=OCL_DEFAULT_VENDOR; - if (ocl_vendor!=nullptr) - s_vendor=ocl_vendor; - if (s_vendor=="none") - s_vendor="generic"; - if (s_vendor=="kepler") { - _ocl_vendor_name="NVIDIA Kepler"; - #if defined (__APPLE__) || defined(MACOSX) - _ocl_vendor_string="-DKEPLER_OCL -DNO_OCL_PTX"; - #else - _ocl_vendor_string="-DKEPLER_OCL"; - #endif - } else if (s_vendor=="fermi") { - _ocl_vendor_name="NVIDIA Fermi"; - _ocl_vendor_string="-DFERMI_OCL"; - } else if (s_vendor=="cypress") { - _ocl_vendor_name="AMD Cypress"; - _ocl_vendor_string="-DCYPRESS_OCL"; - } else if (s_vendor=="phi") { - _ocl_vendor_name="Intel Phi"; - _ocl_vendor_string="-DPHI_OCL"; - } else if (s_vendor=="intel") { - _ocl_vendor_name="Intel CPU"; - _ocl_vendor_string="-DINTEL_OCL"; - } else if (s_vendor=="generic") { - _ocl_vendor_name="GENERIC"; - _ocl_vendor_string="-DGENERIC_OCL"; - } else { - _ocl_vendor_name="CUSTOM"; - _ocl_vendor_string="-DUSE_OPENCL"; - int token_count=0; - std::string params[13]; - char *pch = strtok(ocl_vendor,","); + #include "lal_pre_ocl_config.h" + + if (s_config=="" || s_config=="none") + s_config="generic"; + + int config_index=-1; + for (int i=0; ihas_subgroup_support()) + _ocl_compile_string+=" -DUSE_OPENCL_SUBGROUPS"; + #ifdef LAL_USE_OLD_NEIGHBOR + _ocl_compile_string+=" -DLAL_USE_OLD_NEIGHBOR"; + #endif + + _ocl_compile_string += " -DCONFIG_ID="+params[0]+ + " -DSIMD_SIZE="+params[1]+ + " -DMEM_THREADS="+params[2]; + if (gpu->has_shuffle_support()==false) + _ocl_compile_string+=" 
-DSHUFFLE_AVAIL=0"; + else + _ocl_compile_string+=" -DSHUFFLE_AVAIL="+params[3]; + _ocl_compile_string += " -DFAST_MATH="+params[4]+ + + " -DTHREADS_PER_ATOM="+params[5]+ + " -DTHREADS_PER_CHARGE="+params[6]+ + " -DTHREADS_PER_THREE="+params[7]+ + + " -DBLOCK_PAIR="+params[8]+ + " -DBLOCK_BIO_PAIR="+params[9]+ + " -DBLOCK_ELLIPSE="+params[10]+ + " -DPPPM_BLOCK_1D="+params[11]+ + " -DBLOCK_NBOR_BUILD="+params[12]+ + " -DBLOCK_CELL_2D="+params[13]+ + " -DBLOCK_CELL_ID="+params[14]+ + + " -DMAX_SHARED_TYPES="+params[15]+ + " -DMAX_BIO_SHARED_TYPES="+params[16]+ + " -DPPPM_MAX_SPLINE="+params[17]; + _ocl_compile_string += extra_args; #endif return 0; } @@ -269,8 +444,10 @@ int DeviceT::init(Answer &ans, const bool charge, else if (_gpu_mode==Device::GPU_HYB_NEIGH) gpu_nbor=2; #if !defined(USE_CUDPP) && !defined(USE_HIP_DEVICE_SORT) - if (gpu_nbor==1) - gpu_nbor=2; + if (gpu_nbor==1) gpu_nbor=2; + #endif + #ifndef LAL_USE_OLD_NEIGHBOR + if (gpu_nbor==1) gpu_nbor=2; #endif if (_init_count==0) { @@ -328,14 +505,15 @@ int DeviceT::init(Answer &ans, const int nlocal, template int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, - const int host_nlocal, const int nall, - const int maxspecial, const int gpu_host, - const int max_nbors, const double cell_size, - const bool pre_cut, const int threads_per_atom) { + const int host_nlocal, const int nall, + const int maxspecial, const int gpu_host, + const int max_nbors, const double cutoff, + const bool pre_cut, const int threads_per_atom, + const bool ilist_map) { int ef_nlocal=nlocal; if (_particle_split<1.0 && _particle_split>0.0) ef_nlocal=static_cast(_particle_split*nlocal); - + int gpu_nbor=0; if (_gpu_mode==Device::GPU_NEIGH) gpu_nbor=1; @@ -345,16 +523,27 @@ int DeviceT::init_nbor(Neighbor *nbor, const int nlocal, if (gpu_nbor==1) gpu_nbor=2; #endif + #ifndef LAL_USE_OLD_NEIGHBOR + if (gpu_nbor==1) + gpu_nbor=2; + #endif if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial, *gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d, _block_cell_id, _block_nbor_build, threads_per_atom, - _warp_size, _time_device, compile_string())) + _simd_size, _time_device, compile_string(), ilist_map)) return -3; - if (_cell_size<0.0) - nbor->cell_size(cell_size,cell_size); - else - nbor->cell_size(_cell_size,cell_size); + + if (_user_cell_size<0.0) { + #ifndef LAL_USE_OLD_NEIGHBOR + _neighbor_shared.setup_auto_cell_size(true,cutoff,nbor->simd_size()); + #else + _neighbor_shared.setup_auto_cell_size(false,cutoff,nbor->simd_size()); + #endif + } else + _neighbor_shared.setup_auto_cell_size(false,_user_cell_size, + nbor->simd_size()); + nbor->set_cutoff(cutoff); return 0; } @@ -389,13 +578,21 @@ void DeviceT::init_message(FILE *screen, const char *name, fprintf(screen,"-------------------------------------\n"); fprintf(screen,"- Using acceleration for %s:\n",name); fprintf(screen,"- with %d proc(s) per device.\n",_procs_per_gpu); - #ifdef _OPENMP - fprintf(screen,"- with %d thread(s) per proc.\n",_nthreads); + #if (LAL_USE_OMP == 1) + fprintf(screen,"- with %d thread(s) per proc.\n", omp_get_max_threads()); #endif #ifdef USE_OPENCL - fprintf(screen,"- with OpenCL Parameters for: %s\n", - _ocl_vendor_name.c_str()); + fprintf(screen,"- with OpenCL Parameters for: %s (%d)\n", + _ocl_config_name.c_str(),_config_id); #endif + if (shuffle_avail()) + fprintf(screen,"- Horizontal vector operations: ENABLED\n"); + else + fprintf(screen,"- Horizontal vector operations: DISABLED\n"); + if (gpu->shared_memory(first_gpu)) + fprintf(screen,"- Shared memory 
system: Yes\n"); + else + fprintf(screen,"- Shared memory system: No\n"); fprintf(screen,"-------------------------------------"); fprintf(screen,"-------------------------------------\n"); @@ -431,7 +628,8 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, double &gpu_overhead, double &gpu_driver_overhead) { UCL_H_Vec *host_data_in=nullptr, *host_data_out=nullptr; - UCL_D_Vec *dev_data_in=nullptr, *dev_data_out=nullptr, *kernel_data=nullptr; + UCL_D_Vec *dev_data_in=nullptr, *dev_data_out=nullptr, + *kernel_data=nullptr; UCL_Timer *timers_in=nullptr, *timers_out=nullptr, *timers_kernel=nullptr; UCL_Timer over_timer(*gpu); @@ -472,7 +670,7 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, gpu_overhead=0.0; gpu_driver_overhead=0.0; - for (int i=0; i<10; i++) { + for (int z=0; z<11; z++) { gpu->sync(); gpu_barrier(); over_timer.start(); @@ -486,9 +684,11 @@ void DeviceT::estimate_gpu_overhead(const int kernel_calls, timers_in[i].stop(); } + const int numel=1; for (int i=0; i0) { + gpu_overhead+=mpi_time; + gpu_driver_overhead+=mpi_driver_time; + } } gpu_overhead/=10.0; gpu_driver_overhead/=10.0; @@ -567,19 +773,22 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]; + + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif if (replica_me()==0) - if (screen && times[5]>0.0) { + if (screen && times[6]>0.0) { fprintf(screen,"\n\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); fprintf(screen," Device Time Info (average): "); fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && t_time>0) { + if (time_device() && times[3]>0) { fprintf(screen,"Data Transfer: %.4f s.\n",times[0]/_replica_size); - fprintf(screen,"Data Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"Neighbor copy: %.4f s.\n",times[1]/_replica_size); if (nbor.gpu_nbor()>0) fprintf(screen,"Neighbor build: %.4f s.\n",times[2]/_replica_size); @@ -587,13 +796,15 @@ void DeviceT::output_times(UCL_Timer &time_pair, Answer &ans, fprintf(screen,"Neighbor unpack: %.4f s.\n",times[2]/_replica_size); fprintf(screen,"Force calc: %.4f s.\n",times[3]/_replica_size); } - if (nbor.gpu_nbor()==2) - fprintf(screen,"Neighbor (CPU): %.4f s.\n",times[8]/_replica_size); if (times[5]>0) fprintf(screen,"Device Overhead: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Average split: %.4f.\n",avg_split); - fprintf(screen,"Threads / atom: %d.\n",threads_per_atom); + fprintf(screen,"Lanes / atom: %d.\n",threads_per_atom); + fprintf(screen,"Vector width: %d.\n", simd_size()); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); + if (nbor.gpu_nbor()==2) + fprintf(screen,"CPU Neighbor: %.4f s.\n",times[8]/_replica_size); + fprintf(screen,"CPU Cast/Pack: %.4f s.\n",times[4]/_replica_size); fprintf(screen,"CPU Driver_Time: %.4f s.\n",times[6]/_replica_size); fprintf(screen,"CPU Idle_Time: %.4f s.\n",times[7]/_replica_size); @@ -612,24 +823,29 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, const double max_bytes, const double cpu_time, const double idle_time, FILE *screen) { - double single[8], times[8]; + double single[9], times[9]; single[0]=time_out.total_seconds(); 
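  // Index map for the kspace timing vector filled in below (it grows from
  // 8 to 9 entries so the host-side data cast is reported separately from
  // the force transfer): 0 = data out, 1 = data in + atom cast, 2 = map
  // kernel, 3 = rho kernel, 4 = interp kernel, 5 = force transfer,
  // 6 = CPU Poisson time, 7 = CPU idle time, 8 = answer cast.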
single[1]=time_in.total_seconds()+atom.transfer_time()+atom.cast_time(); single[2]=time_map.total_seconds(); single[3]=time_rho.total_seconds(); single[4]=time_interp.total_seconds(); - single[5]=ans.transfer_time()+ans.cast_time(); + single[5]=ans.transfer_time(); single[6]=cpu_time; single[7]=idle_time; + single[8]=ans.cast_time(); - MPI_Reduce(single,times,8,MPI_DOUBLE,MPI_SUM,0,_comm_replica); + MPI_Reduce(single,times,9,MPI_DOUBLE,MPI_SUM,0,_comm_replica); double my_max_bytes=max_bytes+atom.max_gpu_bytes(); double mpi_max_bytes; MPI_Reduce(&my_max_bytes,&mpi_max_bytes,1,MPI_DOUBLE,MPI_MAX,0,_comm_replica); double max_mb=mpi_max_bytes/(1024.0*1024.0); - double t_time=times[0]+times[1]+times[2]+times[3]+times[4]+times[5]; + #ifdef USE_OPENCL + // Workaround for timing issue on Intel OpenCL + if (times[3] > 80e6) times[3]=0.0; + #endif + if (replica_me()==0) if (screen && times[6]>0.0) { @@ -639,7 +855,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, fprintf(screen,"\n-------------------------------------"); fprintf(screen,"--------------------------------\n"); - if (time_device() && t_time>0) { + if (time_device() && times[3]>0) { fprintf(screen,"Data Out: %.4f s.\n",times[0]/_replica_size); fprintf(screen,"Data In: %.4f s.\n",times[1]/_replica_size); fprintf(screen,"Kernel (map): %.4f s.\n",times[2]/_replica_size); @@ -649,12 +865,13 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in, (times[0]+times[2]+times[3])/_replica_size); fprintf(screen,"Total interp: %.4f s.\n", (times[1]+times[4])/_replica_size); - fprintf(screen,"Force copy/cast: %.4f s.\n",times[5]/_replica_size); + fprintf(screen,"Force copy: %.4f s.\n",times[5]/_replica_size); fprintf(screen,"Total: %.4f s.\n", (times[0]+times[1]+times[2]+times[3]+times[4]+times[5])/ _replica_size); } fprintf(screen,"CPU Poisson: %.4f s.\n",times[6]/_replica_size); + fprintf(screen,"CPU Data Cast: %.4f s.\n",times[8]/_replica_size); fprintf(screen,"CPU Idle Time: %.4f s.\n",times[7]/_replica_size); fprintf(screen,"Max Mem / Proc: %.2f MB.\n",max_mb); @@ -699,14 +916,15 @@ int DeviceT::compile_kernels() { return flag; dev_program=new UCL_Program(*gpu); - int success=dev_program->load_string(device,compile_string().c_str()); + int success=dev_program->load_string(device,compile_string().c_str(), + nullptr,stderr); if (success!=UCL_SUCCESS) return -6; k_zero.set_function(*dev_program,"kernel_zero"); k_info.set_function(*dev_program,"kernel_info"); _compiled=true; - UCL_Vector gpu_lib_data(15,*gpu,UCL_NOT_PINNED); + UCL_Vector gpu_lib_data(19,*gpu,UCL_NOT_PINNED); k_info.set_size(1,1); k_info.run(&gpu_lib_data); gpu_lib_data.update_host(false); @@ -717,39 +935,81 @@ int DeviceT::compile_kernels() { return -4; #endif - _num_mem_threads=gpu_lib_data[1]; - _warp_size=gpu_lib_data[2]; - if (_threads_per_atom<1) - _threads_per_atom=gpu_lib_data[3]; - if (_threads_per_charge<1) - _threads_per_charge=gpu_lib_data[13]; - _pppm_max_spline=gpu_lib_data[4]; - _pppm_block=gpu_lib_data[5]; - if (_block_pair == -1) _block_pair=gpu_lib_data[6]; - _max_shared_types=gpu_lib_data[7]; - _block_cell_2d=gpu_lib_data[8]; - _block_cell_id=gpu_lib_data[9]; - _block_nbor_build=gpu_lib_data[10]; - _block_bio_pair=gpu_lib_data[11]; - _max_bio_shared_types=gpu_lib_data[12]; - _block_ellipse=gpu_lib_data[14]; + _config_id=gpu_lib_data[1]; - if (static_cast(_block_pair)>gpu->group_size()) - _block_pair=gpu->group_size(); - if (static_cast(_block_bio_pair)>gpu->group_size()) - _block_bio_pair=gpu->group_size(); - if (_threads_per_atom>_warp_size) - 
-  if (static_cast<int>(_block_pair)>gpu->group_size())
-    _block_pair=gpu->group_size();
-  if (static_cast<int>(_block_bio_pair)>gpu->group_size())
-    _block_bio_pair=gpu->group_size();
-  if (_threads_per_atom>_warp_size)
-    _threads_per_atom=_warp_size;
-  if (_warp_size%_threads_per_atom!=0)
+  if (sizeof(numtyp)==sizeof(float))
+    _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp32_width());
+  else
+    _simd_size=std::max(gpu_lib_data[2],gpu->preferred_fp64_width());
+
+  _num_mem_threads=gpu_lib_data[3];
+  _shuffle_avail=gpu_lib_data[4];
+  _fast_math=gpu_lib_data[5];
+
+  if (_threads_per_atom<1)
+    _threads_per_atom=gpu_lib_data[6];
+  if (_threads_per_charge<1)
+    _threads_per_charge=gpu_lib_data[7];
+  if (_threads_per_three<1)
+    _threads_per_three=gpu_lib_data[8];
+
+  if (_block_pair == -1) {
+    _block_pair=gpu_lib_data[9];
+    _block_bio_pair=gpu_lib_data[10];
+    _block_ellipse=gpu_lib_data[11];
+  } else {
+    _block_bio_pair=_block_pair;
+    _block_ellipse=_block_pair;
+  }
+  _pppm_block=gpu_lib_data[12];
+  _block_nbor_build=gpu_lib_data[13];
+  _block_cell_2d=gpu_lib_data[14];
+  _block_cell_id=gpu_lib_data[15];
+
+  _max_shared_types=gpu_lib_data[16];
+  _max_bio_shared_types=gpu_lib_data[17];
+  _pppm_max_spline=gpu_lib_data[18];
+
+  if (static_cast<int>(_block_pair)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_bio_pair)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_ellipse)>gpu->group_size_dim(0) ||
+      static_cast<int>(_pppm_block)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_nbor_build)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_cell_2d)>gpu->group_size_dim(0) ||
+      static_cast<int>(_block_cell_2d)>gpu->group_size_dim(1) ||
+      static_cast<int>(_block_cell_id)>gpu->group_size_dim(0) ||
+      static_cast<int>(_max_shared_types*_max_shared_types*
+                       sizeof(numtyp)*17 > gpu->slm_size()) ||
+      static_cast<int>(_max_bio_shared_types*2*sizeof(numtyp) >
+                       gpu->slm_size()))
+    return -13;
+
+  if (_block_pair % _simd_size != 0 || _block_bio_pair % _simd_size != 0 ||
+      _block_ellipse % _simd_size != 0 || _pppm_block % _simd_size != 0 ||
+      _block_nbor_build % _simd_size != 0 ||
+      _block_pair < _max_shared_types * _max_shared_types ||
+      _block_bio_pair * 2 < _max_bio_shared_types ||
+      _pppm_block < _pppm_max_spline * _pppm_max_spline)
+    return -11;
+
+  if (_threads_per_atom>_simd_size)
+    _threads_per_atom=_simd_size;
+  if (_simd_size%_threads_per_atom!=0)
    _threads_per_atom=1;
  if (_threads_per_atom & (_threads_per_atom - 1))
    _threads_per_atom=1;
-  if (_threads_per_charge>_warp_size)
-    _threads_per_charge=_warp_size;
-  if (_warp_size%_threads_per_charge!=0)
+  if (_threads_per_charge>_simd_size)
+    _threads_per_charge=_simd_size;
+  if (_simd_size%_threads_per_charge!=0)
    _threads_per_charge=1;
  if (_threads_per_charge & (_threads_per_charge - 1))
    _threads_per_charge=1;
+  if (_threads_per_three>_simd_size)
+    _threads_per_three=_simd_size;
+  if (_simd_size%_threads_per_three!=0)
+    _threads_per_three=1;
+  if (_threads_per_three & (_threads_per_three - 1))
+    _threads_per_three=1;

  return flag;
}
@@ -765,14 +1025,16 @@ Device<PRECISION,ACC_PRECISION> global_device;
}

using namespace LAMMPS_AL;
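The per-atom thread clamping above applies one rule to all three counts (pair, charge, three-body): the value may not exceed the SIMD width, must divide it evenly, and must be a power of two, with anything else falling back to 1. A standalone restatement of that rule; the function name is ours, not the library's, and it assumes both arguments are at least 1 as the callers guarantee:

    // Clamp a threads-per-atom request against the device SIMD width.
    inline int clamp_threads(int t, int simd_size) {
      if (t > simd_size) t = simd_size;  // no wider than one SIMD subgroup
      if (simd_size % t != 0) t = 1;     // must partition the subgroup evenly
      if (t & (t - 1)) t = 1;            // x & (x-1) == 0 only for powers of two
      return t;
    }
    // clamp_threads(8, 32) -> 8; clamp_threads(12, 32) -> 1;
    // clamp_threads(64, 32) -> 32
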
+{
+  return global_device.init_device(world,replica,ngpu,first_gpu_id,gpu_mode,
+                                   particle_split,t_per_atom,user_cell_size,
+                                   opencl_config,ocl_platform,
+                                   device_type_flags,block_pair);
}

void lmp_clear_device() {
@@ -780,8 +1042,16 @@ void lmp_clear_device() {
}

double lmp_gpu_forces(double **f, double **tor, double *eatom,
-                      double **vatom, double *virial, double &ecoul) {
-  return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul);
+                      double **vatom, double *virial, double &ecoul,
+                      int &error_flag) {
+  return global_device.fix_gpu(f,tor,eatom,vatom,virial,ecoul,error_flag);
+}
+
+double lmp_gpu_update_bin_size(const double subx, const double suby,
+                               const double subz, const int nlocal,
+                               const double cut) {
+  return global_device._neighbor_shared.update_cell_size(subx, suby,
+                                                         subz, nlocal, cut);
}

bool lmp_gpu_config(const std::string &category, const std::string &setting)
diff --git a/lib/gpu/lal_device.cu b/lib/gpu/lal_device.cu
index afc7a0b988..61341964b2 100644
--- a/lib/gpu/lal_device.cu
+++ b/lib/gpu/lal_device.cu
@@ -26,20 +26,30 @@ __kernel void kernel_zero(__global int *restrict mem,
}

__kernel void kernel_info(__global int *info) {
-  info[0]=ARCH;
-  info[1]=MEM_THREADS;
-  info[2]=WARP_SIZE;
-  info[3]=THREADS_PER_ATOM;
-  info[4]=PPPM_MAX_SPLINE;
-  info[5]=PPPM_BLOCK_1D;
-  info[6]=BLOCK_PAIR;
-  info[7]=MAX_SHARED_TYPES;
-  info[8]=BLOCK_CELL_2D;
-  info[9]=BLOCK_CELL_ID;
-  info[10]=BLOCK_NBOR_BUILD;
-  info[11]=BLOCK_BIO_PAIR;
-  info[12]=MAX_BIO_SHARED_TYPES;
-  info[13]=THREADS_PER_CHARGE;
-  info[14]=BLOCK_ELLIPSE;
-}
+  #ifdef __CUDA_ARCH__
+  info[0]=__CUDA_ARCH__;
+  #else
+  info[0]=0;
+  #endif
+  info[1]=CONFIG_ID;
+  info[2]=SIMD_SIZE;
+  info[3]=MEM_THREADS;
+  info[4]=SHUFFLE_AVAIL;
+  info[5]=FAST_MATH;
+  info[6]=THREADS_PER_ATOM;
+  info[7]=THREADS_PER_CHARGE;
+  info[8]=THREADS_PER_THREE;
+
+  info[9]=BLOCK_PAIR;
+  info[10]=BLOCK_BIO_PAIR;
+  info[11]=BLOCK_ELLIPSE;
+  info[12]=PPPM_BLOCK_1D;
+  info[13]=BLOCK_NBOR_BUILD;
+  info[14]=BLOCK_CELL_2D;
+  info[15]=BLOCK_CELL_ID;
+
+  info[16]=MAX_SHARED_TYPES;
+  info[17]=MAX_BIO_SHARED_TYPES;
+  info[18]=PPPM_MAX_SPLINE;
+}
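The header change just below documents a -11 return from init_device() when config_string has the wrong number of parameters, and set_ocl_params() now receives the configuration as strings rather than the old vendor char buffer. A plausible shape for that kind of token-count check, sketched independently of the actual implementation (parse_config is a hypothetical helper):

    #include <sstream>
    #include <string>
    #include <vector>

    // Split a comma-separated tuning string and verify the token count;
    // mirrors the documented -11 "wrong number of parameters" failure mode.
    int parse_config(const std::string &config, int expected,
                     std::vector<std::string> &tokens) {
      tokens.clear();
      std::stringstream ss(config);
      std::string item;
      while (std::getline(ss, item, ',')) tokens.push_back(item);
      if (static_cast<int>(tokens.size()) != expected) return -11;
      return 0;
    }
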
diff --git a/lib/gpu/lal_device.h b/lib/gpu/lal_device.h
index 21bd039c42..1db6ae3127 100644
--- a/lib/gpu/lal_device.h
+++ b/lib/gpu/lal_device.h
@@ -39,22 +39,23 @@ class Device {
  /// Initialize the device for use by this process
  /** Sets up a per-device MPI communicator for load balancing and initializes
-   * the device (>=first_gpu and <=last_gpu) that this proc will be using
+   * the device (ngpu starting at first_gpu_id) that this proc will be using
   * Returns:
   * -  0 if successful
   * - -2 if GPU not found
   * - -4 if GPU library not compiled for GPU
   * - -6 if GPU could not be initialized for use
   * - -7 if accelerator sharing is not currently allowed on system
-   * - -11 if vendor_string has the wrong number of parameters **/
-  int init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
-                  const int last_gpu, const int gpu_mode,
-                  const double particle_split, const int nthreads,
-                  const int t_per_atom, const double cell_size,
-                  char *vendor_string, const int block_pair);
+   * - -11 if config_string has the wrong number of parameters **/
+  int init_device(MPI_Comm world, MPI_Comm replica, const int ngpu,
+                  const int first_gpu_id, const int gpu_mode,
+                  const double particle_split, const int t_per_atom,
+                  const double user_cell_size, char *config_string,
+                  const int ocl_platform, char *device_type_flags,
+                  const int block_pair);

  /// Initialize the device for Atom storage
-  /** \param charge True if charges need to be stored
+  /** \param charge True if charges need to be stored
   * \param rot True if quaternions need to be stored
   * \param nlocal Total number of local particles to allocate memory for
   * \param nall Total number of local+ghost particles
@@ -94,10 +95,11 @@ class Device {
   *        1 if gpu_nbor is true, and host needs a half nbor list,
   *        2 if gpu_nbor is true, and host needs a full nbor list
   * \param max_nbors Initial number of rows in the neighbor matrix
-   * \param cell_size cutoff+skin
+   * \param cutoff cutoff+skin
   * \param pre_cut True if cutoff test will be performed in separate kernel
   *                than the force kernel
   * \param threads_per_atom value to be used by the neighbor list only
+   * \param ilist_map true if ilist mapping data structures are used (3-body)
   *
   * Returns:
   * - 0 if successful
@@ -108,8 +110,9 @@ class Device {
  int init_nbor(Neighbor *nbor, const int nlocal,
                const int host_nlocal, const int nall,
                const int maxspecial, const int gpu_host,
-                const int max_nbors, const double cell_size,
-                const bool pre_cut, const int threads_per_atom);
+                const int max_nbors, const double cutoff,
+                const bool pre_cut, const int threads_per_atom,
+                const bool ilist_map = false);

  /// Output a message for pair_style acceleration with device stats
  void init_message(FILE *screen, const char *name,
@@ -161,13 +164,16 @@ class Device {
  /// Add "answers" (force,energies,etc.) into LAMMPS structures
  inline double fix_gpu(double **f, double **tor, double *eatom,
-                        double **vatom, double *virial, double &ecoul) {
+                        double **vatom, double *virial, double &ecoul,
+                        int &error_flag) {
+    error_flag=0;
    atom.data_unavail();
    if (ans_queue.empty()==false) {
      stop_host_timer();
      double evdw=0.0;
      while (ans_queue.empty()==false) {
-        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul);
+        evdw+=ans_queue.front()->get_answers(f,tor,eatom,vatom,virial,ecoul,
+                                             error_flag);
        ans_queue.pop();
      }
      return evdw;
@@ -195,8 +201,6 @@ class Device {
  /// Return the number of procs sharing a device (size of device communicator)
  inline int procs_per_gpu() const { return _procs_per_gpu; }
-  /// Return the number of threads per proc
-  inline int num_threads() const { return _nthreads; }
  /// My rank within all processes
  inline int world_me() const { return _world_me; }
  /// Total number of processes
@@ -228,45 +232,49 @@ class Device {
  /// True if device is being timed
  inline bool time_device() const { return _time_device; }

+  /// Accelerator device configuration id
+  inline int config_id() const { return _config_id; }
+  /// Number of threads executing concurrently on same multiproc
+  inline int simd_size() const { return _simd_size; }
  /// Return the number of threads accessing memory simultaneously
  inline int num_mem_threads() const { return _num_mem_threads; }
+  /// 1 if horizontal vector operations enabled, 0 otherwise
+  inline int shuffle_avail() const { return _shuffle_avail; }
+  /// For OpenCL, 0 if fast-math options disabled, 1 if enabled
+  inline int fast_math() const { return _fast_math; }
+
  /// Return the number of threads per atom for pair styles
  inline int threads_per_atom() const { return _threads_per_atom; }
  /// Return the number of threads per atom for pair styles using charge
  inline int threads_per_charge() const { return _threads_per_charge; }
+  /// Return the number of threads per atom for 3-body pair styles
+  inline int threads_per_three() const { return _threads_per_three; }
+
  /// Return the min of the pair block size or the device max block size
  inline int pair_block_size() const { return _block_pair; }
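fix_gpu() above now zeroes an error_flag before draining the answer queue, so get_answers() can report problems without aborting inside the library; the caller checks the flag once all queued answers are summed. The essence of that pattern with stand-in types (Answers here is not the library's class, and the queue holds non-owning pointers):

    #include <queue>

    struct Answers { double evdwl; int error; };  // stand-in answer type

    double drain_answers(std::queue<Answers*> &q, int &error_flag) {
      error_flag = 0;
      double evdwl = 0.0;
      while (!q.empty()) {
        Answers *a = q.front();
        evdwl += a->evdwl;
        if (a->error) error_flag = a->error;  // record failure, keep draining
        q.pop();
      }
      return evdwl;  // caller inspects error_flag after accumulation
    }
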
-  /// Return the maximum number of atom types that can be used with shared mem
-  inline int max_shared_types() const { return _max_shared_types; }
-  /// Return the maximum order for PPPM splines
-  inline int pppm_max_spline() const { return _pppm_max_spline; }
-  /// Return the block size for PPPM kernels
-  inline int pppm_block() const { return _pppm_block; }
-  /// Return the block size for neighbor binning
-  inline int block_cell_2d() const { return _block_cell_2d; }
-  /// Return the block size for atom mapping for neighbor builds
-  inline int block_cell_id() const { return _block_cell_id; }
-  /// Return the block size for neighbor build kernel
-  inline int block_nbor_build() const { return _block_nbor_build; }
  /// Return the block size for "bio" pair styles
  inline int block_bio_pair() const { return _block_bio_pair; }
  /// Return the block size for "ellipse" pair styles
  inline int block_ellipse() const { return _block_ellipse; }
+  /// Return the block size for PPPM kernels
+  inline int pppm_block() const { return _pppm_block; }
+  /// Return the block size for neighbor build kernel
+  inline int block_nbor_build() const { return _block_nbor_build; }
+  /// Return the block size for neighbor binning
+  inline int block_cell_2d() const { return _block_cell_2d; }
+  /// Return the block size for atom mapping for neighbor builds
+  inline int block_cell_id() const { return _block_cell_id; }
+
+  /// Return the maximum number of atom types that can be used with shared mem
+  inline int max_shared_types() const { return _max_shared_types; }
  /// Return the maximum number of atom types for shared mem with "bio" styles
  inline int max_bio_shared_types() const { return _max_bio_shared_types; }
+  /// Return the maximum order for PPPM splines
+  inline int pppm_max_spline() const { return _pppm_max_spline; }
+
  /// Architecture gpu code compiled for (returns 0 for OpenCL)
  inline double ptx_arch() const { return _ptx_arch; }

-  /// Number of threads executing concurrently on same multiproc
-  inline int warp_size() const { return _warp_size; }
-
-  // -------------------- SHARED DEVICE ROUTINES --------------------
-  // Perform asynchronous zero of integer array
-  void zero(UCL_D_Vec<int> &mem, const int numel) {
-    int num_blocks=static_cast<int>(ceil(static_cast<double>(numel)/
-                                         _block_pair));
-    k_zero.set_size(num_blocks,_block_pair);
-    k_zero.run(&mem,&numel);
-  }
+  inline void set_simd_size(int simd_sz) { _simd_size = simd_sz; }

  // -------------------------- DEVICE DATA -------------------------
@@ -304,35 +312,7 @@ class Device {
  }

  inline std::string compile_string() { return _ocl_compile_string; }
-
-  private:
-  std::queue<Answer<numtyp,acctyp> *> ans_queue;
-  int _init_count;
-  bool _device_init, _host_timer_started, _time_device;
-  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
-  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
-      _replica_size;
-  int _gpu_mode, _first_device, _last_device, _platform_id, _nthreads;
-  double _particle_split;
-  double _cpu_full;
-  double _ptx_arch;
-  double _cell_size; // -1 if the cutoff is used
-
-  int _num_mem_threads, _warp_size, _threads_per_atom, _threads_per_charge;
-  int _pppm_max_spline, _pppm_block;
-  int _block_pair, _block_ellipse, _max_shared_types;
-  int _block_cell_2d, _block_cell_id, _block_nbor_build;
-  int _block_bio_pair, _max_bio_shared_types;
-
-  UCL_Program *dev_program;
-  UCL_Kernel k_zero, k_info;
-  bool _compiled;
-  int compile_kernels();
-
-  int _data_in_estimate, _data_out_estimate;
-
-  std::string _ocl_vendor_name, _ocl_vendor_string, _ocl_compile_string;
-  int set_ocl_params(char *);
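Both the zero() helper deleted above and the pair-style loop() bodies that follow size their launches the same way: BX work-items per block, t_per_atom of them cooperating on each atom, so one block covers BX/t_per_atom atoms and the grid is rounded up to cover all inum atoms. Restated as a small helper of our own, for illustration only:

    #include <cmath>

    // Number of blocks needed for inum atoms when each block of BX work-items
    // handles BX/t_per_atom atoms (t_per_atom is assumed to divide BX).
    inline int grid_size(int inum, int BX, int t_per_atom) {
      const int atoms_per_block = BX / t_per_atom;
      return static_cast<int>(std::ceil(static_cast<double>(inum) /
                                        atoms_per_block));
    }
    // e.g. grid_size(10000, 256, 4): 64 atoms per block -> 157 blocks
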
+  inline std::string ocl_config_name() { return _ocl_config_name; }

  template <class t>
  inline std::string toa(const t& in) {
@@ -342,6 +322,34 @@ class Device {
    return o.str();
  }

+  private:
+  std::queue<Answer<numtyp,acctyp> *> ans_queue;
+  int _init_count;
+  bool _device_init, _host_timer_started, _time_device;
+  MPI_Comm _comm_world, _comm_replica, _comm_gpu;
+  int _procs_per_gpu, _gpu_rank, _world_me, _world_size, _replica_me,
+      _replica_size;
+  int _gpu_mode, _first_device, _last_device, _platform_id;
+  double _particle_split;
+  double _cpu_full;
+  double _ptx_arch;
+  double _user_cell_size; // -1 if the cutoff is used
+
+  int _config_id, _simd_size, _num_mem_threads, _shuffle_avail, _fast_math;
+  int _threads_per_atom, _threads_per_charge, _threads_per_three;
+  int _block_pair, _block_bio_pair, _block_ellipse;
+  int _pppm_block, _block_nbor_build, _block_cell_2d, _block_cell_id;
+  int _max_shared_types, _max_bio_shared_types, _pppm_max_spline;
+
+  UCL_Program *dev_program;
+  UCL_Kernel k_zero, k_info;
+  bool _compiled;
+  int compile_kernels();
+
+  int _data_in_estimate, _data_out_estimate;
+
+  std::string _ocl_config_name, _ocl_config_string, _ocl_compile_string;
+  int set_ocl_params(std::string, std::string);
};

}
diff --git a/lib/gpu/lal_dipole_lj.cpp b/lib/gpu/lal_dipole_lj.cpp
index b0929e2ffb..ffdeb41ca8 100644
--- a/lib/gpu/lal_dipole_lj.cpp
+++ b/lib/gpu/lal_dipole_lj.cpp
@@ -125,20 +125,9 @@ double DipoleLJT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
-void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
+int DipoleLJT::loop(const int eflag, const int vflag) {
  // Compute the block size and grid size to keep all cores busy
  const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-
  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                               (BX/this->_threads_per_atom)));
@@ -146,8 +135,8 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
  int nbor_pitch=this->nbor->nbor_pitch();
  this->time_pair.start();
  if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj,
                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                          &this->ans->force, &this->ans->engv,
                          &eflag, &vflag,
@@ -165,6 +154,7 @@ void DipoleLJT::loop(const bool _eflag, const bool _vflag) {
                    &_qqrd2e, &this->_threads_per_atom);
  }
  this->time_pair.stop();
+  return GX;
}

template class DipoleLJ<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_dipole_lj.cu b/lib/gpu/lal_dipole_lj.cu
index a3ed0d8d40..cbe68ff692 100644
--- a/lib/gpu/lal_dipole_lj.cu
+++ b/lib/gpu/lal_dipole_lj.cu
@@ -31,106 +31,178 @@ _texture_2d( mu_tex,int4);
#define mu_tex mu_
#endif

-#if (ARCH < 300)
+#if (SHUFFLE_AVAIL == 0)

-#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \
-                         t_per_atom, offset, eflag, vflag, ans, engv) \
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \
+                         t_per_atom, offset, eflag, vflag, ans, engv) \
  if (t_per_atom>1) { \
-    __local acctyp red_acc[8][BLOCK_PAIR]; \
-    red_acc[0][tid]=f.x; \
-    red_acc[1][tid]=f.y; \
-    red_acc[2][tid]=f.z; \
-    red_acc[3][tid]=tor.x; \
-    red_acc[4][tid]=tor.y; \
-    red_acc[5][tid]=tor.z; \
-    for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \
-      if (offset < s) { \
-        for (int r=0; r<6; r++) \
-          red_acc[r][tid] += red_acc[r][tid+s]; \
+
simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - 
engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; @@ -324,7 +396,7 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -335,9 +407,9 @@ __kernel void k_dipole_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, @@ -361,33 +433,33 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0; if (rsq < lj1[mtype].w) { e = qtmp*qj*rinv; @@ -537,7 +609,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -548,8 +620,7 @@ __kernel void k_dipole_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } - diff --git a/lib/gpu/lal_dipole_lj.h b/lib/gpu/lal_dipole_lj.h index bd312324c6..395a7472ba 100644 --- a/lib/gpu/lal_dipole_lj.h +++ b/lib/gpu/lal_dipole_lj.h @@ -77,7 +77,7 @@ class DipoleLJ : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_lj_ext.cpp b/lib/gpu/lal_dipole_lj_ext.cpp index 0a94969c8b..90c9935913 100644 --- a/lib/gpu/lal_dipole_lj_ext.cpp +++ b/lib/gpu/lal_dipole_lj_ext.cpp @@ -57,7 +57,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int dpl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, 
special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_dipole_lj_sf.cpp b/lib/gpu/lal_dipole_lj_sf.cpp index dcf95bb126..6b40ffaa11 100644 --- a/lib/gpu/lal_dipole_lj_sf.cpp +++ b/lib/gpu/lal_dipole_lj_sf.cpp @@ -125,20 +125,9 @@ double DipoleLJSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { +int DipoleLJSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -165,6 +154,7 @@ void DipoleLJSFT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLJSF; diff --git a/lib/gpu/lal_dipole_lj_sf.cu b/lib/gpu/lal_dipole_lj_sf.cu index 8032ae82ed..717d8959ba 100644 --- a/lib/gpu/lal_dipole_lj_sf.cu +++ b/lib/gpu/lal_dipole_lj_sf.cu @@ -32,106 +32,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - 
engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + t_per_atom, offset, eflag, vflag, ans, engv) \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); @@ -357,7 +429,7 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -367,9 +439,9 @@ __kernel void k_dipole_lj_sf(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, @@ -394,33 
+466,33 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < lj1[mtype].w) { numtyp fac = (numtyp)1.0-ucl_sqrt(rsq*rcutcoul2inv); @@ -600,7 +672,7 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*e; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -611,8 +683,8 @@ __kernel void k_dipole_lj_sf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } diff --git a/lib/gpu/lal_dipole_lj_sf.h b/lib/gpu/lal_dipole_lj_sf.h index ae73508065..088d8df03e 100644 --- a/lib/gpu/lal_dipole_lj_sf.h +++ b/lib/gpu/lal_dipole_lj_sf.h @@ -77,7 +77,7 @@ class DipoleLJSF : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_lj_sf_ext.cpp b/lib/gpu/lal_dipole_lj_sf_ext.cpp index 3626e8305e..0879702887 100644 --- a/lib/gpu/lal_dipole_lj_sf_ext.cpp +++ b/lib/gpu/lal_dipole_lj_sf_ext.cpp @@ -57,7 +57,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, special_lj, inum, nall, 300, + host_lj4, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLSFMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_dipole_long_lj.cpp b/lib/gpu/lal_dipole_long_lj.cpp index 9648e9b15e..5531fa0dc9 100644 --- a/lib/gpu/lal_dipole_long_lj.cpp +++ b/lib/gpu/lal_dipole_long_lj.cpp @@ -128,20 +128,9 @@ double DipoleLongLJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { +int DipoleLongLJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - 
vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -149,8 +138,8 @@ void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -168,6 +157,7 @@ void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class DipoleLongLJ; diff --git a/lib/gpu/lal_dipole_long_lj.cu b/lib/gpu/lal_dipole_long_lj.cu index 3aafba43aa..407b63f93e 100644 --- a/lib/gpu/lal_dipole_long_lj.cu +++ b/lib/gpu/lal_dipole_long_lj.cu @@ -31,106 +31,178 @@ _texture_2d( mu_tex,int4); #define mu_tex mu_ #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_tq(f, tor, energy, ecoul, virial, ii, inum, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ +#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[8][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=tor.x; \ - red_acc[4][tid]=tor.y; \ - red_acc[5][tid]=tor.z; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add6(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z, \ + tor.x, tor.y, tor.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add2(t_per_atom, red_acc, offset, tid, energy, e_coul); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - tor.x=red_acc[3][tid]; \ - tor.y=red_acc[4][tid]; \ - tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - red_acc[7][tid]=ecoul; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<8; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ - ecoul=red_acc[7][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add2(t_per_atom,energy,e_coul); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (voffset==0) { \ + red_acc[6][bnum] = energy; \ + red_acc[7][bnum] = e_coul; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for 
(int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) { \ + energy = red_acc[6][tid]; \ + e_coul = red_acc[7][tid]; \ + } \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = e_coul = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + const int ev_stride=NUM_BLOCKS_X; \ + if (eflag) { \ + simd_reduce_add2(vwidth, energy, e_coul); \ + if (tid==0) { \ + engv[ei]=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + engv[ei]=e_coul*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - e_coul += shfl_xor(e_coul, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]=energy*(acctyp)0.5; \ - ei+=inum; \ - engv[ei]=e_coul*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (t_per_atom>1) \ + simd_reduce_add6(t_per_atom, f.x, f.y, f.z, tor.x, tor.y, tor.z); \ + if (offset==0 && ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) { e = qqrd2e*(b0*g0 + b1*g1 + b2*g2); @@ -368,7 +440,7 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -379,9 +451,9 @@ __kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, @@ -406,26 +478,27 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; - acctyp4 f; + acctyp4 f, tor; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); @@ -436,7 +509,6 @@ __kernel 
void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, if (ii0) { + if (EVFLAG && eflag) { acctyp e = (acctyp)0.0; if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) { e = qqrd2e*(b0*g0 + b1*g1 + b2*g2); @@ -622,7 +694,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*force.x; virial[1] += dely*force.y; virial[2] += delz*force.z; @@ -633,8 +705,7 @@ __kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset, + eflag,vflag,ans,engv); } - diff --git a/lib/gpu/lal_dipole_long_lj.h b/lib/gpu/lal_dipole_long_lj.h index 77e22a10a7..c8f37efd2b 100644 --- a/lib/gpu/lal_dipole_long_lj.h +++ b/lib/gpu/lal_dipole_long_lj.h @@ -77,7 +77,7 @@ class DipoleLongLJ : public BaseDipole { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dipole_long_lj_ext.cpp b/lib/gpu/lal_dipole_long_lj_ext.cpp index b2751e8a82..fd61706ba9 100644 --- a/lib/gpu/lal_dipole_long_lj_ext.cpp +++ b/lib/gpu/lal_dipole_long_lj_ext.cpp @@ -58,7 +58,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_dpd.cpp b/lib/gpu/lal_dpd.cpp index c5cbc7eb53..f890fb53a3 100644 --- a/lib/gpu/lal_dpd.cpp +++ b/lib/gpu/lal_dpd.cpp @@ -52,15 +52,31 @@ int DPDT::init(const int ntypes, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,_screen,dpd,"k_dpd"); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,dpd,"k_dpd",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -117,20 +133,9 @@ double DPDT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void DPDT::loop(const bool _eflag, const bool _vflag) { +int 
DPDT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -138,8 +143,8 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->v, &cutsq, @@ -155,6 +160,7 @@ void DPDT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template diff --git a/lib/gpu/lal_dpd.cu b/lib/gpu/lal_dpd.cu index a29e04fc7f..2794110a92 100644 --- a/lib/gpu/lal_dpd.cu +++ b/lib/gpu/lal_dpd.cu @@ -179,16 +179,19 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { // unshifted eng of conservative term: // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted to 0.0 at cutoff numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; energy+=factor_dpd*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -267,9 +270,9 @@ __kernel void k_dpd(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, @@ -289,6 +292,7 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; if (tid<4) @@ -296,25 +300,36 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, if (tid tag2) { @@ -359,24 +382,37 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, // drag force = -gamma * wd^2 * (delx dot delv) / r // random force = sigma * wd * rnd * dtinvsqrt; + #ifndef ONETYPE + const numtyp coeffx=coeff[mtype].x; + const numtyp coeffy=coeff[mtype].y; + const numtyp coeffz=coeff[mtype].z; + #endif numtyp force = (numtyp)0.0; - if (!tstat_only) force = coeff[mtype].x*wd; - force -= coeff[mtype].y*wd*wd*dot*rinv; - force += coeff[mtype].z*wd*randnum*dtinvsqrt; + if (!tstat_only) force = coeffx*wd; + force -= coeffy*wd*wd*dot*rinv; + force += coeffz*wd*randnum*dtinvsqrt; + #ifndef ONETYPE force*=factor_dpd*rinv; + #else + force*=rinv; + #endif f.x+=delx*force; f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { // unshifted eng of conservative term: // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/cut[itype][jtype]); // eng shifted 
to 0.0 at cutoff - numtyp e = (numtyp)0.5*coeff[mtype].x*coeff[mtype].w * wd*wd; + numtyp e = (numtyp)0.5*coeffx*coeffw * wd*wd; + #ifndef ONETYPE energy+=factor_dpd*e; + #else + energy+=e; + #endif } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -387,8 +423,8 @@ __kernel void k_dpd_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_dpd.h b/lib/gpu/lal_dpd.h index 3c36c39e05..be93d988a3 100644 --- a/lib/gpu/lal_dpd.h +++ b/lib/gpu/lal_dpd.h @@ -78,7 +78,7 @@ class DPD : public BaseDPD { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_dpd_ext.cpp b/lib/gpu/lal_dpd_ext.cpp index d727a87319..7637ff03c0 100644 --- a/lib/gpu/lal_dpd_ext.cpp +++ b/lib/gpu/lal_dpd_ext.cpp @@ -55,7 +55,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, int init_ok=0; if (world_me==0) init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, - host_cut, special_lj, false, inum, nall, 300, + host_cut, special_lj, false, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); DPDMF.device->world_barrier(); @@ -73,7 +73,7 @@ int dpd_gpu_init(const int ntypes, double **cutsq, double **host_a0, } if (gpu_rank==i && world_me!=0) init_ok=DPDMF.init(ntypes, cutsq, host_a0, host_gamma, host_sigma, - host_cut, special_lj, false, inum, nall, 300, + host_cut, special_lj, false, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); DPDMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_eam.cpp b/lib/gpu/lal_eam.cpp index 03479cd16a..cdafe72898 100644 --- a/lib/gpu/lal_eam.cpp +++ b/lib/gpu/lal_eam.cpp @@ -52,9 +52,23 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + for (int i=1; i=0 && host_type2frho[i]<=nfrho-1) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+i; + } + if (onetype<0) onetype=0; + #endif + int success; success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, - gpu_split,_screen,eam,"k_eam"); + gpu_split,_screen,eam,"k_eam",onetype); if (success!=0) return success; @@ -72,6 +86,13 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, k_energy_fast.set_function(*(this->pair_program),"k_energy_fast"); fp_tex.get_texture(*(this->pair_program),"fp_tex"); fp_tex.bind_float(_fp,1); + + #if defined(LAL_OCL_EV_JIT) + k_energy_fast_noev.set_function(*(this->pair_program_noev),"k_energy_fast"); + #else + k_energy_sel = &k_energy_fast; + #endif + _compiled_energy = true; // Initialize timers for selected GPU @@ -88,7 +109,6 @@ int EAMT::init(const int ntypes, double host_cutforcesq, int **host_type2rhor, int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -260,6 +280,9 @@ void EAMT::clear() { if (_compiled_energy) { k_energy_fast.clear(); k_energy.clear(); + #if defined(LAL_OCL_EV_JIT) + 
k_energy_fast_noev.clear(); + #endif _compiled_energy=false; } @@ -278,11 +301,18 @@ template void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, + const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success, void **fp_ptr) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (this->device->time_device()) { // Put time from the second part to the total time_pair @@ -346,12 +376,20 @@ void EAMT::compute(const int f_ago, const int inum_full, const int nlocal, template int** EAMT::compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, int &inum, void **fp_ptr) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (this->device->time_device()) { // Put time from the second part to the total time_pair @@ -430,9 +468,9 @@ void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, loop2(eflag,vflag); if (ilist == nullptr) - this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->ans->copy_answers(eflag,vflag,eatom,vatom, this->ans->inum()); else - this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist); + this->ans->copy_answers(eflag,vflag,eatom,vatom, ilist, this->ans->inum()); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -442,20 +480,9 @@ void EAMT::compute2(int *ilist, const bool eflag, const bool vflag, // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template -void EAMT::loop(const bool _eflag, const bool _vflag) { +int EAMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -464,13 +491,18 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { - this->k_energy_fast.set_size(GX,BX); - this->k_energy_fast.run(&this->atom->x, &type2rhor_z2r, &type2frho, - &rhor_spline2, &frho_spline1,&frho_spline2, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &_fp, &this->ans->engv, &eflag, &ainum, - &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, - &_rhomax, &_nrho, &_nr, &this->_threads_per_atom); + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_energy_sel = &k_energy_fast; + else k_energy_sel = &k_energy_fast_noev; + #endif + + k_energy_sel->set_size(GX,BX); + k_energy_sel->run(&this->atom->x, &type2rhor_z2r, &type2frho, + &rhor_spline2, &frho_spline1,&frho_spline2, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &_fp, &this->ans->engv, &eflag, &ainum, + &nbor_pitch, &_ntypes, &_cutforcesq, &_rdr, &_rdrho, + 
&_rhomax, &_nrho, &_nr, &this->_threads_per_atom); } else { this->k_energy.set_size(GX,BX); this->k_energy.run(&this->atom->x, &type2rhor_z2r, &type2frho, @@ -482,6 +514,7 @@ void EAMT::loop(const bool _eflag, const bool _vflag) { } this->time_pair.stop(); + return ainum; } // --------------------------------------------------------------------------- @@ -510,8 +543,8 @@ void EAMT::loop2(const bool _eflag, const bool _vflag) { this->time_pair2.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &_fp, &type2rhor_z2r, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &_fp, &type2rhor_z2r, &rhor_spline1, &z2r_spline1, &z2r_spline2, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, diff --git a/lib/gpu/lal_eam.cu b/lib/gpu/lal_eam.cu index b22ce7b575..3955f3cc8a 100644 --- a/lib/gpu/lal_eam.cu +++ b/lib/gpu/lal_eam.cu @@ -36,6 +36,16 @@ _texture( z2r_sp1_tex,int4); _texture( z2r_sp2_tex,int4); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define fp_tex fp_ +#define rhor_sp1_tex rhor_spline1 +#define rhor_sp2_tex rhor_spline2 +#define frho_sp1_tex frho_spline1 +#define frho_sp2_tex frho_spline2 +#define z2r_sp1_tex z2r_spline1 +#define z2r_sp2_tex z2r_spline2 +#endif + #else #define pos_tex x_ @@ -52,30 +62,33 @@ _texture( z2r_sp2_tex,int4); #define MIN(A,B) ((A) < (B) ? (A) : (B)) #define MAX(A,B) ((A) > (B) ? (A) : (B)) -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) + +#define local_allocate_store_energy_fp() \ + __local acctyp red_acc[BLOCK_PAIR]; #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=rho; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) \ red_acc[tid] += red_acc[tid+s]; \ } \ rho=red_acc[tid]; \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ @@ -83,15 +96,18 @@ _texture( z2r_sp2_tex,int4); } \ } +#define local_allocate_store_answers_eam() \ + __local acctyp red_acc[6][BLOCK_PAIR]; + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, elag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -101,10 +117,12 @@ _texture( z2r_sp2_tex,int4); f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ + simdsync(); \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -114,13 +132,13 @@ _texture( z2r_sp2_tex,int4); virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ engv[ei]=virial[i]*(acctyp)0.5; \ ei+=inum; \ @@ -131,53 +149,57 @@ _texture( z2r_sp2_tex,int4); #else +#define 
local_allocate_store_energy_fp() + #define store_energy_fp(rho,energy,ii,inum,tid,t_per_atom,offset, \ - eflag,vflag,engv,rdrho,nrho,i,rhomax) \ + eflag,vflag,engv,rdrho,nrho,i,rhomax,tfrho) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) \ - rho += shfl_xor(rho, s, t_per_atom); \ + rho += shfl_down(rho, s, t_per_atom); \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ fetch4(coeff,index,frho_sp2_tex); \ energy = ((coeff.x*p + coeff.y)*p + coeff.z)*p + coeff.w; \ if (rho > rhomax) energy += fp*(rho-rhomax); \ - engv[ii]=energy; \ + engv[ii]=energy; \ } \ } +#define local_allocate_store_answers_eam() + #define store_answers_eam(f, energy, virial, ii, inum, tid, t_per_atom, \ offset, eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ engv[ei]+=energy*(acctyp)0.5; \ ei+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ engv[ei]=virial[i]*(acctyp)0.5; \ ei+=inum; \ @@ -203,21 +225,23 @@ __kernel void k_energy(const __global numtyp4 *restrict x_, const numtyp rdr, const numtyp rdrho, const numtyp rhomax, const int nrho, const int nr, const int t_per_atom) { - int tid, ii, offset; + int tid, ii, offset, i, itype; atom_info(t_per_atom,ii,tid,offset); + int n_stride; + local_allocate_store_energy_fp(); + acctyp rho = (acctyp)0; - acctyp energy = (acctyp)0; + acctyp energy; + if (EVFLAG && eflag) energy=(acctyp)0; if (ii0) { + if (EVFLAG && eflag) { energy += phi; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -431,10 +469,9 @@ __kernel void k_eam(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_eam_fast(const __global numtyp4 *x_, @@ -453,40 +490,51 @@ __kernel void k_eam_fast(const __global numtyp4 *x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local int2 type2rhor_z2r[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - if (tid0) { + if (EVFLAG && eflag) { energy += phi; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -547,8 +610,8 @@ __kernel void k_eam_fast(const __global numtyp4 *x_, } } } // for nbor - store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers_eam(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_eam.h b/lib/gpu/lal_eam.h index fa05075883..3cbaeac0b8 100644 --- a/lib/gpu/lal_eam.h +++ b/lib/gpu/lal_eam.h @@ -90,7 +90,7 @@ class 
EAM : public BaseAtomic { const bool eatom, const bool vatom); // ------------------------- DEVICE KERNELS ------------------------- - UCL_Kernel k_energy, k_energy_fast; + UCL_Kernel k_energy, k_energy_fast, k_energy_fast_noev, *k_energy_sel; // --------------------------- TEXTURES ----------------------------- UCL_Texture fp_tex; @@ -133,8 +133,8 @@ class EAM : public BaseAtomic { protected: bool _allocated; int _nlocal; - void loop(const bool _eflag, const bool _vflag); - void loop2(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); + void loop2(const bool eflag, const bool vflag); }; } diff --git a/lib/gpu/lal_eam_alloy_ext.cpp b/lib/gpu/lal_eam_alloy_ext.cpp index e5f1010e76..f7c4986e68 100644 --- a/lib/gpu/lal_eam_alloy_ext.cpp +++ b/lib/gpu/lal_eam_alloy_ext.cpp @@ -67,7 +67,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMALMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMALMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_eam_ext.cpp b/lib/gpu/lal_eam_ext.cpp index 78f2e3c1f8..3010e0ea7f 100644 --- a/lib/gpu/lal_eam_ext.cpp +++ b/lib/gpu/lal_eam_ext.cpp @@ -67,7 +67,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMMF.device->gpu_barrier(); @@ -98,7 +98,7 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, fprintf(screen,"\n"); if (init_ok==0) - EAMMF.estimate_gpu_overhead(); + EAMMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_eam_fs_ext.cpp b/lib/gpu/lal_eam_fs_ext.cpp index 37208e54f8..205b601562 100644 --- a/lib/gpu/lal_eam_fs_ext.cpp +++ b/lib/gpu/lal_eam_fs_ext.cpp @@ -67,7 +67,7 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, - nfrho, nr, nlocal, nall, 300, maxspecial, cell_size, + nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->world_barrier(); @@ -87,7 +87,7 @@ int eam_fs_gpu_init(const int ntypes, double 
host_cutforcesq, init_ok=EAMFSMF.init(ntypes, host_cutforcesq, host_type2rhor, host_type2z2r, host_type2frho, host_rhor_spline, host_z2r_spline, host_frho_spline, rdr, rdrho, rhomax, nrhor, nrho, - nz2r, nfrho, nr, nlocal, nall, 300, maxspecial, + nz2r, nfrho, nr, nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); EAMFSMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_ellipsoid_extra.h b/lib/gpu/lal_ellipsoid_extra.h index e6122c7404..1c549ab6a6 100644 --- a/lib/gpu/lal_ellipsoid_extra.h +++ b/lib/gpu/lal_ellipsoid_extra.h @@ -32,22 +32,21 @@ _texture_2d( quat_tex,int4); #define quat_tex qif #endif -#define nbor_info_e(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ - i, numj, stride, nbor_end, nbor_begin) \ - i=nbor_mem[ii]; \ - nbor_begin=ii+nbor_stride; \ - numj=nbor_mem[nbor_begin]; \ - nbor_begin+=nbor_stride; \ - nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ - nbor_begin+=fast_mul(offset,nbor_stride); \ - stride=fast_mul(t_per_atom,nbor_stride); +#define nbor_info_e_ss(nbor_mem, nbor_stride, t_per_atom, ii, offset, \ + i, numj, stride, nbor_end, nbor_begin) \ + i=nbor_mem[ii]; \ + nbor_begin=ii+nbor_stride; \ + numj=nbor_mem[nbor_begin]; \ + nbor_begin+=nbor_stride; \ + nbor_end=nbor_begin+fast_mul(nbor_stride,numj); \ + nbor_begin+=fast_mul(offset,nbor_stride); \ + stride=fast_mul(t_per_atom,nbor_stride); -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ - __local acctyp red_acc[7][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -55,6 +54,7 @@ _texture_2d( quat_tex,int4); red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -66,28 +66,39 @@ _texture_2d( quat_tex,int4); tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<7; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + if (EVFLAG && (eflag || vflag)) { \ + if (vflag) { \ + simdsync(); \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + if (eflag) { \ + simdsync(); \ + red_acc[0][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ + energy=red_acc[0][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -100,12 +111,12 @@ _texture_2d( quat_tex,int4); #define acc_answers(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; 
\ red_acc[3][tid]=energy; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<4; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -115,10 +126,11 @@ _texture_2d( quat_tex,int4); f.y=red_acc[1][tid]; \ f.z=red_acc[2][tid]; \ energy=red_acc[3][tid]; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int r=0; r<6; r++) \ red_acc[r][tid]=virial[r]; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -128,13 +140,13 @@ _texture_2d( quat_tex,int4); virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ @@ -150,31 +162,31 @@ _texture_2d( quat_tex,int4); #else #define store_answers_t(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -188,25 +200,25 @@ _texture_2d( quat_tex,int4); eflag, vflag, ans, engv) \ if (t_per_atom>1) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + if (EVFLAG) energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + for (int r=0; r<6; r++) \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *engv+=energy*(acctyp)0.5; \ engv+=inum; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *engv+=virial[i]*(acctyp)0.5; \ engv+=inum; \ diff --git a/lib/gpu/lal_ellipsoid_nbor.cu b/lib/gpu/lal_ellipsoid_nbor.cu index 5ad935ba9b..9b9d03914c 100644 --- a/lib/gpu/lal_ellipsoid_nbor.cu +++ b/lib/gpu/lal_ellipsoid_nbor.cu @@ -34,7 
+34,8 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, __global int *dev_nbor, const int nbor_pitch, const int start, const int inum, const __global int *dev_ij, - const int form_low, const int form_high) { + const int form_low, const int form_high, + const int t_per_atom) { // ii indexes the two interacting particles in gi int ii=GLOBAL_ID_X+start; @@ -45,12 +46,15 @@ __kernel void kernel_nbor(const __global numtyp4 *restrict x_, int numj=dev_ij[nbor]; nbor+=nbor_pitch; int nbor_end=nbor+fast_mul(numj,nbor_pitch); - int packed=ii+nbor_pitch+nbor_pitch; numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i]; int iw=ix.w; int itype=fast_mul(iw,ntypes); int newj=0; + + __global int *out_list=dev_nbor+2*nbor_pitch+ii*t_per_atom; + const int out_stride=nbor_pitch*t_per_atom-t_per_atom; + for ( ; nbor -void GaussT::loop(const bool _eflag, const bool _vflag) { +int GaussT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -143,19 +132,20 @@ void GaussT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &gauss1, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &gauss1, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } else { this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &sp_lj, + this->k_pair.run(&this->atom->x, &gauss1, &_lj_types, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Gauss; diff --git a/lib/gpu/lal_gauss.cu b/lib/gpu/lal_gauss.cu index 2192fb39ca..2540b8492f 100644 --- a/lib/gpu/lal_gauss.cu +++ b/lib/gpu/lal_gauss.cu @@ -27,7 +27,6 @@ _texture_2d( pos_tex,int4); __kernel void k_gauss(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1, const int lj_types, - const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -37,23 +36,20 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - __local numtyp sp_lj[4]; - sp_lj[0]=sp_lj_in[0]; - sp_lj[1]=sp_lj_in[1]; - sp_lj[2]=sp_lj_in[2]; - sp_lj[3]=sp_lj_in[3]; + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); energy+=e; //factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -101,14 +97,13 @@ __kernel void k_gauss(const __global numtyp4 *restrict x_, } } // for nbor - 
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict gauss1_in, - const __global numtyp *restrict sp_lj_in, const __global int *dev_nbor, const __global int *dev_packed, __global acctyp4 *restrict ans, @@ -119,26 +114,26 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp4 gauss1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; - __local numtyp sp_lj[4]; - if (tid<4) - sp_lj[tid]=sp_lj_in[tid]; + int n_stride; + local_allocate_store_pair(); + if (tid0) { + if (EVFLAG && eflag) { numtyp e=-(gauss1[mtype].x*ucl_exp(-gauss1[mtype].y*rsq) - gauss1[mtype].w); energy+=e; //factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -186,8 +181,8 @@ __kernel void k_gauss_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_gauss.h b/lib/gpu/lal_gauss.h index 1399b82d03..ecb04c49b2 100644 --- a/lib/gpu/lal_gauss.h +++ b/lib/gpu/lal_gauss.h @@ -73,7 +73,7 @@ class Gauss : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_gauss_ext.cpp b/lib/gpu/lal_gauss_ext.cpp index a2804ce3cf..afec2e86f2 100644 --- a/lib/gpu/lal_gauss_ext.cpp +++ b/lib/gpu/lal_gauss_ext.cpp @@ -55,7 +55,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, } if (gpu_rank==i && world_me!=0) init_ok=GLMF.init(ntypes, cutsq, host_a, host_b, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); GLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_gayberne.cpp b/lib/gpu/lal_gayberne.cpp index f17fc50f5f..2b1a190e5a 100644 --- a/lib/gpu/lal_gayberne.cpp +++ b/lib/gpu/lal_gayberne.cpp @@ -127,7 +127,7 @@ int GayBerneT::init(const int ntypes, const double gamma, host_write[i*4+2]=host_shape[i][2]; } UCL_H_Vec view4; - view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + view4.view(host_write,shape.numel()); ucl_copy(shape,view4,false); well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -136,7 +136,7 @@ int GayBerneT::init(const int ntypes, const double gamma, host_write[i*4+1]=host_well[i][1]; host_write[i*4+2]=host_well[i][2]; } - view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device)); + view4.view(host_write,well.numel()); ucl_copy(well,view4,false); _allocated=true; @@ -184,19 +184,8 @@ double GayBerneT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void GayBerneT::loop(const bool _eflag, const bool _vflag) { +int GayBerneT::loop(const int eflag, 
const int vflag) { const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=0, NGX; int stride=this->nbor->nbor_pitch(); int ainum=this->ans->inum(); @@ -213,8 +202,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, @@ -230,7 +219,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_ellipsoid2.stop(); this->time_lj.start(); this->time_lj.stop(); - return; + return ainum; } // ------------ SPHERE_ELLIPSE --------------- @@ -246,8 +235,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_nbor2.stop(); this->time_ellipsoid2.start(); - this->k_sphere_ellipsoid.set_size(GX,BX); - this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_sphere_elps_sel->set_size(GX,BX); + this->k_sphere_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, @@ -276,8 +265,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { this->time_lj.start(); if (this->_last_ellipseans->inum()) { if (this->_shared_types) { - this->k_lj_fast.set_size(GX,BX); - this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3, + this->k_lj_sel->set_size(GX,BX); + this->k_lj_sel->run(&this->atom->x, &this->lj1, &this->lj3, &this->gamma_upsilon_mu, &stride, &this->nbor->dev_packed, &this->ans->force, &this->ans->engv, &this->dev_error, &eflag, @@ -303,8 +292,8 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { ELLIPSE_ELLIPSE,_shared_types,_lj_types); this->time_nbor1.stop(); this->time_ellipsoid.start(); - this->k_ellipsoid.set_size(GX,BX); - this->k_ellipsoid.run(&this->atom->x, &this->atom->quat, + this->k_elps_sel->set_size(GX,BX); + this->k_elps_sel->run(&this->atom->x, &this->atom->quat, &this->shape, &this->well, &this->gamma_upsilon_mu, &this->sigma_epsilon, &this->_lj_types, &this->lshape, &this->nbor->dev_nbor, &stride, &this->ans->force, @@ -312,6 +301,7 @@ void GayBerneT::loop(const bool _eflag, const bool _vflag) { &eflag, &vflag, &ainum, &this->_threads_per_atom); this->time_ellipsoid.stop(); } + return ainum; } template class GayBerne; diff --git a/lib/gpu/lal_gayberne.cu b/lib/gpu/lal_gayberne.cu index c9d0353ca8..9267dfd85d 100644 --- a/lib/gpu/lal_gayberne.cu +++ b/lib/gpu/lal_gayberne.cu @@ -100,29 +100,27 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy+=u_r*temp2; 
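// --- Sketch (not part of the patch): the EVFLAG guard pattern ---------------
// The k_energy_fast_noev kernel and the LAL_OCL_EV_JIT selection earlier in
// this patch suggest EVFLAG is a compile-time 0/1 constant baked into each
// kernel variant rather than a runtime argument; under that assumption the
// recurring rewrite of "if (eflag>0)" into "if (EVFLAG && eflag)" is free:
//
//   acctyp energy, virial[6];            // deliberately left uninitialized
//   if (EVFLAG) {                        // dead code when EVFLAG == 0
//     energy = (acctyp)0;
//     for (int i = 0; i < 6; i++) virial[i] = (acctyp)0;
//   }
//   /* ... pair loop ... */
//   if (EVFLAG && eflag) energy += u_r*temp2;   // stripped from noev builds
//
// One kernel source thus serves both variants, with all energy/virial
// bookkeeping eliminated by the compiler when EVFLAG is 0.
// -----------------------------------------------------------------------------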
numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { + if (EVFLAG && vflag) { r12[0]*=-r; r12[1]*=-r; r12[2]*=-r; @@ -356,8 +354,8 @@ __kernel void k_gayberne(const __global numtyp4 *restrict x_, tor.z+=temp1*tchi[2]+temp2*teta[2]+temp3*tUr[2]; } // for nbor - store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv,inum); } diff --git a/lib/gpu/lal_gayberne.h b/lib/gpu/lal_gayberne.h index 750c739cec..5cdc6bcd67 100644 --- a/lib/gpu/lal_gayberne.h +++ b/lib/gpu/lal_gayberne.h @@ -86,7 +86,7 @@ class GayBerne : public BaseEllipsoid { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_gayberne_lj.cu b/lib/gpu/lal_gayberne_lj.cu index fdf40720aa..4582f0d411 100644 --- a/lib/gpu/lal_gayberne_lj.cu +++ b/lib/gpu/lal_gayberne_lj.cu @@ -17,6 +17,13 @@ #include "lal_ellipsoid_extra.h" #endif +#if (SHUFFLE_AVAIL == 0) +#define local_allocate_store_ellipse_lj local_allocate_store_ellipse +#else +#define local_allocate_store_ellipse_lj() \ + __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE]; +#endif + __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict q, const __global numtyp4 *restrict shape, @@ -38,25 +45,26 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse_lj(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy+=u_r*temp2; numtyp temp1 = -eta*u_r*factor_lj; - if (vflag>0) { + if (EVFLAG && vflag) { r12[0]*=-1; r12[1]*=-1; r12[2]*=-1; @@ -239,9 +247,9 @@ __kernel void k_gayberne_sphere_ellipsoid(const __global numtyp4 *restrict x_, f.z+=temp1*dchi[2]-temp2*dUr[2]; } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, @@ -261,26 +269,27 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[3]; sp_lj[1]=gum[4]; sp_lj[2]=gum[5]; sp_lj[3]=gum[6]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); energy+=factor_lj*(e-lj3[ii].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -327,9 +336,9 @@ __kernel void k_gayberne_lj(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } 
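// --- Sketch (not part of the patch): why shfl_xor became shfl_down ----------
// Assuming shfl_down(v,s,w) wraps the warp/sub-group shuffle primitives from
// the GPU package preprocessor headers, the reduction over the t_per_atom
// threads sharing one atom now accumulates toward lane 0 only:
//
//   for (unsigned int s = t_per_atom/2; s > 0; s >>= 1) {
//     f.x += shfl_down(f.x, s, t_per_atom);
//     f.y += shfl_down(f.y, s, t_per_atom);
//     f.z += shfl_down(f.z, s, t_per_atom);
//     if (EVFLAG) energy += shfl_down(energy, s, t_per_atom);
//   }
//   if (offset == 0 && ii < inum) { /* lane 0 holds the totals; store them */ }
//
// shfl_xor produced the sum in every lane (a butterfly), which is wasted work
// when only offset==0 writes. Note the store macros also moved outside the
// "if (ii<inum)" block in these hunks -- every lane must reach the shuffle
// and simdsync() calls to avoid divergence -- so the final write is guarded
// with "offset==0 && ii<inum" instead. In the SHUFFLE_AVAIL==0 fallback the
// same loop reads red_acc[tid+s] from __local memory, hence the simdsync()
// added before each step for hardware whose sub-groups are not lockstep.
// -----------------------------------------------------------------------------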
// if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, @@ -351,31 +360,32 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_ellipse(); + if (tid<4) sp_lj[tid]=gum[tid+3]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -421,8 +431,8 @@ __kernel void k_gayberne_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj.cpp b/lib/gpu/lal_lj.cpp index 5bd015e364..40fefe28b3 100644 --- a/lib/gpu/lal_lj.cpp +++ b/lib/gpu/lal_lj.cpp @@ -51,16 +51,31 @@ int LJT::init(const int ntypes, const int nall, const int max_nbors, const int maxspecial, const double cell_size, const double gpu_split, FILE *_screen) { + const int max_shared_types=this->device->max_shared_types(); + + int onetype=0; + #ifdef USE_OPENCL + if (maxspecial==0) + for (int i=1; i0) { + if (onetype>0) + onetype=-1; + else if (onetype==0) + onetype=i*max_shared_types+j; + } + if (onetype<0) onetype=0; + #endif + int success; - success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split, - _screen,lj,"k_lj"); + success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size, + gpu_split,_screen,lj,"k_lj",onetype); if (success!=0) return success; // If atom type constants fit in shared memory use fast kernel int lj_types=ntypes; shared_types=false; - int max_shared_types=this->device->max_shared_types(); if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { lj_types=max_shared_types; shared_types=true; @@ -130,20 +145,9 @@ double LJT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJT::loop(const bool _eflag, const bool _vflag) { +int LJT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -151,8 +155,8 @@ void LJT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), 
&this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -165,6 +169,7 @@ void LJT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJ; diff --git a/lib/gpu/lal_lj.cu b/lib/gpu/lal_lj.cu index 7297a287e6..382cd140d9 100644 --- a/lib/gpu/lal_lj.cu +++ b/lib/gpu/lal_lj.cu @@ -38,16 +38,19 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -96,9 +99,9 @@ __kernel void k_lj(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_fast(const __global numtyp4 *restrict x_, @@ -114,6 +117,7 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); + #ifndef ONETYPE __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; @@ -121,38 +125,58 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } + __syncthreads(); + #else + const numtyp lj1x=lj1_in[ONETYPE].x; + const numtyp lj1y=lj1_in[ONETYPE].y; + const numtyp cutsq=lj1_in[ONETYPE].z; + numtyp lj3x, lj3y, lj3z; + if (EVFLAG && eflag) { + lj3x=lj3_in[ONETYPE].x; + lj3y=lj3_in[ONETYPE].y; + lj3z=lj3_in[ONETYPE].z; + } + #endif + + int n_stride; + local_allocate_store_pair(); - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { - numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); - energy+=factor_lj*(e-lj3[mtype].z); + if (EVFLAG && eflag) { + #ifndef ONETYPE + numtyp lj3x=lj3[mtype].x; + numtyp lj3y=lj3[mtype].y; + numtyp lj3z=lj3[mtype].z; + #endif + numtyp e=r6inv*(lj3x*r6inv-lj3y); + #ifndef ONETYPE + energy+=factor_lj*(e-lj3z); + #else + energy+=(e-lj3z); + #endif } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -182,10 +223,9 @@ __kernel void k_lj_fast(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } - } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj.h b/lib/gpu/lal_lj.h index c6fec0d159..cdf850efd7 100644 --- a/lib/gpu/lal_lj.h +++ b/lib/gpu/lal_lj.h @@ -76,7 +76,7 @@ class LJ : public BaseAtomic { private: bool _allocated; - void 
loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj96.cpp b/lib/gpu/lal_lj96.cpp index 6f74cd0f19..df7dc11558 100644 --- a/lib/gpu/lal_lj96.cpp +++ b/lib/gpu/lal_lj96.cpp @@ -113,20 +113,9 @@ double LJ96T::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJ96T::loop(const bool _eflag, const bool _vflag) { +int LJ96T::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -149,6 +138,7 @@ void LJ96T::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJ96; diff --git a/lib/gpu/lal_lj96.cu b/lib/gpu/lal_lj96.cu index c602e7555e..d1f7e3791f 100644 --- a/lib/gpu/lal_lj96.cu +++ b/lib/gpu/lal_lj96.cu @@ -39,22 +39,25 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -104,9 +107,9 @@ __kernel void k_lj96(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, @@ -125,27 +128,30 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r3inv-lj3[mtype].y); 
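// --- Reconstruction note -----------------------------------------------------
// The repeated "static_cast(ceil(static_cast(" fragments in the loop() bodies
// above lost their template arguments in transit; the intended grid-size
// computation is presumably the usual GPU-package idiom:
//
//   int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
//                                  (BX/this->_threads_per_atom)));
//
// i.e. one thread block covers BX/t_per_atom atoms, rounded up -- and GX is
// now returned from loop() so callers know the launch width actually used.
// -----------------------------------------------------------------------------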
energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -195,8 +201,8 @@ __kernel void k_lj96_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj96.h b/lib/gpu/lal_lj96.h index eef6863f37..535e32a580 100644 --- a/lib/gpu/lal_lj96.h +++ b/lib/gpu/lal_lj96.h @@ -71,7 +71,7 @@ class LJ96 : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj96_ext.cpp b/lib/gpu/lal_lj96_ext.cpp index f68b35de57..be7ffc5a09 100644 --- a/lib/gpu/lal_lj96_ext.cpp +++ b/lib/gpu/lal_lj96_ext.cpp @@ -55,7 +55,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->world_barrier(); @@ -73,7 +73,7 @@ int lj96_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJ96MF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJ96MF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_class2_long.cpp b/lib/gpu/lal_lj_class2_long.cpp index 24b07212ed..31e03a2a82 100644 --- a/lib/gpu/lal_lj_class2_long.cpp +++ b/lib/gpu/lal_lj_class2_long.cpp @@ -123,20 +123,9 @@ double LJClass2LongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { +int LJClass2LongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -144,8 +133,8 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void LJClass2LongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJClass2Long; diff --git a/lib/gpu/lal_lj_class2_long.cu b/lib/gpu/lal_lj_class2_long.cu index 65f0bf993c..5c8a2d46b2 100644 --- a/lib/gpu/lal_lj_class2_long.cu +++ b/lib/gpu/lal_lj_class2_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + 
local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -131,7 +134,7 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -142,9 +145,9 @@ __kernel void k_lj_class2_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, @@ -168,28 +171,31 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -253,7 +259,7 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -264,8 +270,8 @@ __kernel void k_lj_class2_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_class2_long.h b/lib/gpu/lal_lj_class2_long.h index eac6451b2e..84e07bf7cd 100644 --- a/lib/gpu/lal_lj_class2_long.h +++ b/lib/gpu/lal_lj_class2_long.h @@ -75,7 +75,7 @@ class LJClass2Long : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_class2_long_ext.cpp b/lib/gpu/lal_lj_class2_long_ext.cpp index f669a81189..311b027536 100644 --- a/lib/gpu/lal_lj_class2_long_ext.cpp +++ b/lib/gpu/lal_lj_class2_long_ext.cpp @@ -58,7 +58,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, 
maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int c2cl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=C2CLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_coul.cpp b/lib/gpu/lal_lj_coul.cpp index 59ce9c5e61..cd8a411a79 100644 --- a/lib/gpu/lal_lj_coul.cpp +++ b/lib/gpu/lal_lj_coul.cpp @@ -125,20 +125,9 @@ double LJCoulT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulT::loop(const bool _eflag, const bool _vflag) { +int LJCoulT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void LJCoulT::loop(const bool _eflag, const bool _vflag) { &cutsq, &_qqrd2e, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoul; diff --git a/lib/gpu/lal_lj_coul.cu b/lib/gpu/lal_lj_coul.cu index afbb972942..c728967bc5 100644 --- a/lib/gpu/lal_lj_coul.cu +++ b/lib/gpu/lal_lj_coul.cu @@ -47,6 +47,9 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -133,9 +136,9 @@ __kernel void k_lj_coul(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_fast(const __global numtyp4 
*restrict x_, @@ -158,29 +161,32 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { e_coul += forcecoul; if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -246,8 +252,8 @@ __kernel void k_lj_coul_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul.h b/lib/gpu/lal_lj_coul.h index 0e11162aa5..eb490d5820 100644 --- a/lib/gpu/lal_lj_coul.h +++ b/lib/gpu/lal_lj_coul.h @@ -77,7 +77,7 @@ class LJCoul : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_debye.cpp b/lib/gpu/lal_lj_coul_debye.cpp index 556a0a5cd3..78ef1bf3f7 100644 --- a/lib/gpu/lal_lj_coul_debye.cpp +++ b/lib/gpu/lal_lj_coul_debye.cpp @@ -127,20 +127,9 @@ double LJCoulDebyeT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { +int LJCoulDebyeT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -148,8 +137,8 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, &cutsq, @@ -163,6 +152,7 @@ void LJCoulDebyeT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_kappa, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulDebye; diff --git a/lib/gpu/lal_lj_coul_debye.cu b/lib/gpu/lal_lj_coul_debye.cu index 053fbeccc8..1804625649 100644 --- a/lib/gpu/lal_lj_coul_debye.cu +++ b/lib/gpu/lal_lj_coul_debye.cu @@ -48,6 +48,9 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); 
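// --- Sketch (assumption): what local_allocate_store_charge() expands to -----
// By analogy with local_allocate_store_energy_fp() and
// local_allocate_store_answers_eam() defined earlier in this patch, the
// charge-kernel macro presumably hoists the __local reduction scratch out of
// the store_answers_q macro to kernel scope; the dimension here is a guess
// (3 force slots + energy + e_coul, reused for the 6 virial terms):
//
//   #if (SHUFFLE_AVAIL == 0)
//   #define local_allocate_store_charge()                                    \
//     __local acctyp red_acc[6][BLOCK_PAIR];
//   #else
//   #define local_allocate_store_charge()
//   #endif
//
// Declaring the buffer once per kernel, instead of inside the old
// "if (t_per_atom>1)" branch, avoids scoping problems with __local variables
// declared in nested blocks under some OpenCL compilers.
// -----------------------------------------------------------------------------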
+ sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -57,18 +60,18 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -129,7 +132,7 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -140,9 +143,9 @@ __kernel void k_lj_debye(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, @@ -166,29 +169,32 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < lj1[mtype].z) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); @@ -249,7 +255,7 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, e_coul+=qqrd2e*qtmp*rinv*screening*factor_coul; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -260,8 +266,8 @@ __kernel void k_lj_debye_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_debye.h b/lib/gpu/lal_lj_coul_debye.h index 22fcf7234b..19abf32169 100644 --- a/lib/gpu/lal_lj_coul_debye.h +++ b/lib/gpu/lal_lj_coul_debye.h @@ -77,7 +77,7 @@ class LJCoulDebye : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_debye_ext.cpp b/lib/gpu/lal_lj_coul_debye_ext.cpp index 95588eb95a..4f81b01457 100644 --- a/lib/gpu/lal_lj_coul_debye_ext.cpp +++ b/lib/gpu/lal_lj_coul_debye_ext.cpp @@ -58,7 +58,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, 
offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); @@ -77,7 +77,7 @@ int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, kappa); diff --git a/lib/gpu/lal_lj_coul_ext.cpp b/lib/gpu/lal_lj_coul_ext.cpp index 060088a7cb..5b7f97e630 100644 --- a/lib/gpu/lal_lj_coul_ext.cpp +++ b/lib/gpu/lal_lj_coul_ext.cpp @@ -57,7 +57,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); @@ -76,7 +76,7 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e); diff --git a/lib/gpu/lal_lj_coul_long.cpp b/lib/gpu/lal_lj_coul_long.cpp index 66897a4aa7..e6be361abb 100644 --- a/lib/gpu/lal_lj_coul_long.cpp +++ b/lib/gpu/lal_lj_coul_long.cpp @@ -140,20 +140,9 @@ double LJCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { +int LJCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -178,6 +167,7 @@ void LJCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulLong; diff --git a/lib/gpu/lal_lj_coul_long.cu b/lib/gpu/lal_lj_coul_long.cu index ac3479421f..85af3c3433 100644 --- a/lib/gpu/lal_lj_coul_long.cu +++ b/lib/gpu/lal_lj_coul_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, 
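// --- Note on the recurring "300 -> max_nbors" hunks ---------------------------
// Every *_ext.cpp init wrapper in this patch makes the same substitution: the
// hard-coded 300 neighbor slots per atom become the max_nbors estimate LAMMPS
// derives from the cutoff and density, e.g. for lj/cut/coul/long:
//
//   init_ok = LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
//                         host_lj4, offset, special_lj, inum, nall,
//                         max_nbors,   // was the literal 300
//                         maxspecial, cell_size, gpu_split, screen,
//                         host_cut_ljsq, host_cut_coulsq, host_special_coul,
//                         qqrd2e, g_ewald);
//
// Nothing else in the calls changes; dense systems no longer overflow the
// fixed allocation and sparse ones stop over-allocating.
// ------------------------------------------------------------------------------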
sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -129,7 +132,7 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -140,9 +143,9 @@ __kernel void k_lj_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, @@ -164,28 +167,31 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -247,7 +253,7 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -258,8 +264,8 @@ __kernel void k_lj_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_long.h b/lib/gpu/lal_lj_coul_long.h index 8f77671dc0..bc4fce40a5 100644 --- a/lib/gpu/lal_lj_coul_long.h +++ b/lib/gpu/lal_lj_coul_long.h @@ -80,7 +80,7 @@ class LJCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_long_ext.cpp b/lib/gpu/lal_lj_coul_long_ext.cpp index 33771af53c..6a027bdc7e 100644 --- a/lib/gpu/lal_lj_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_coul_long_ext.cpp @@ -58,7 +58,7 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int ljcl_gpu_init(const int ntypes, 
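Same one-line fix in every *_ext.cpp init call of this patch: the hard-coded 300 that capped the initial per-atom neighbor allocation is replaced by the max_nbors value the caller computed. Condensed sketch with a hypothetical signature (the real init() takes the full parameter list shown above):

    // before: pair_init_sketch(inum, nall, /* max_nbors = */ 300);
    // after:  pair_init_sketch(inum, nall, max_nbors);
    static int pair_init_sketch(const int inum, const int nall,
                                const int max_nbors) {
      return (inum >= 0 && nall >= inum && max_nbors > 0) ? 0 : -1;
    }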
double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJCLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_coul_msm.cpp b/lib/gpu/lal_lj_coul_msm.cpp index 9a17d068ec..656736865b 100644 --- a/lib/gpu/lal_lj_coul_msm.cpp +++ b/lib/gpu/lal_lj_coul_msm.cpp @@ -157,20 +157,9 @@ double LJCoulMSMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { +int LJCoulMSMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -178,8 +167,8 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &gcons, &dgcons, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -195,6 +184,7 @@ void LJCoulMSMT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_order, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCoulMSM; diff --git a/lib/gpu/lal_lj_coul_msm.cu b/lib/gpu/lal_lj_coul_msm.cu index a3c36eed85..39fc723736 100644 --- a/lib/gpu/lal_lj_coul_msm.cu +++ b/lib/gpu/lal_lj_coul_msm.cu @@ -28,6 +28,11 @@ _texture( gcons_tex,int2); _texture( dgcons_tex,int2); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define gcons_tex gcons +#define dgcons_tex dgcons +#endif + #else #define pos_tex x_ #define q_tex q_ @@ -100,6 +105,9 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -109,18 +117,18 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(egamma-factor_coul); if (rsq < lj1[mtype].w) { @@ -183,7 +191,7 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -194,9 +202,9 @@ __kernel void k_lj_coul_msm(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - 
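The new __CUDACC_VER_MAJOR__ guard in lal_lj_coul_msm.cu exists because the legacy texture-reference API is deprecated as of CUDA 11, so on nvcc 11 and later the *_tex names are aliased to the plain device pointers and the fetch macros fall through to ordinary global-memory loads. The guard in isolation:

    #if (__CUDACC_VER_MAJOR__ >= 11)
    #define gcons_tex gcons
    #define dgcons_tex dgcons
    #endif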
vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, @@ -220,28 +228,31 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(egamma-factor_coul); if (rsq < lj1[mtype].w) { @@ -304,7 +315,7 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -315,8 +326,8 @@ __kernel void k_lj_coul_msm_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_coul_msm.h b/lib/gpu/lal_lj_coul_msm.h index 6369ce8cb5..a929848aaf 100644 --- a/lib/gpu/lal_lj_coul_msm.h +++ b/lib/gpu/lal_lj_coul_msm.h @@ -80,7 +80,7 @@ class LJCoulMSM : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_coul_msm_ext.cpp b/lib/gpu/lal_lj_coul_msm_ext.cpp index d957cbe376..2d9d77fe77 100644 --- a/lib/gpu/lal_lj_coul_msm_ext.cpp +++ b/lib/gpu/lal_lj_coul_msm_ext.cpp @@ -59,7 +59,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (world_me==0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); @@ -79,7 +79,7 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, if (gpu_rank==i && world_me!=0) init_ok=LJCMLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, host_gcons, host_dgcons, offset, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, order, qqrd2e); diff --git a/lib/gpu/lal_lj_cubic.cpp b/lib/gpu/lal_lj_cubic.cpp index f8200ec037..fa5073d409 100644 --- a/lib/gpu/lal_lj_cubic.cpp +++ b/lib/gpu/lal_lj_cubic.cpp @@ -119,20 +119,9 @@ double LJCubicT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJCubicT::loop(const bool _eflag, const bool _vflag) { +int LJCubicT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int 
eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -140,8 +129,8 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj2, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -154,6 +143,7 @@ void LJCubicT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJCubic; diff --git a/lib/gpu/lal_lj_cubic.cu b/lib/gpu/lal_lj_cubic.cu index f93013fe75..a91326d521 100644 --- a/lib/gpu/lal_lj_cubic.cu +++ b/lib/gpu/lal_lj_cubic.cu @@ -46,16 +46,19 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -106,7 +109,7 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -117,9 +120,9 @@ __kernel void k_lj_cubic(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, @@ -140,27 +143,30 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp2 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e; if (rsq <= lj2[mtype].x) e = r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); @@ -211,7 +217,7 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, e = lj2[mtype].w*(_PHIS + _DPHIDS*t - _A3*t*t*t/6.0); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -222,8 +228,8 @@ __kernel void k_lj_cubic_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + 
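Alongside the flag rework, every loop() now launches through the k_pair_sel pointer instead of calling k_pair_fast directly; judging by the LAL_OCL_EV_JIT hunks later in this patch, set_kernel() aims that pointer at either the EV-enabled or the no-EV compiled variant. A self-contained host-side analogue of the mechanism (names are illustrative, not the library's):

    #include <cstdio>

    static void pair_kernel_ev()   { std::puts("energy/virial variant"); }
    static void pair_kernel_noev() { std::puts("force-only variant"); }

    static void (*k_pair_sel_sketch)(void) = &pair_kernel_ev;

    static void set_kernel_sketch(const int eflag, const int vflag) {
      k_pair_sel_sketch = (eflag || vflag) ? &pair_kernel_ev
                                           : &pair_kernel_noev;
    }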
store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_cubic.h b/lib/gpu/lal_lj_cubic.h index 9578ca27e4..a37044b279 100644 --- a/lib/gpu/lal_lj_cubic.h +++ b/lib/gpu/lal_lj_cubic.h @@ -73,7 +73,7 @@ class LJCubic : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_cubic_ext.cpp b/lib/gpu/lal_lj_cubic_ext.cpp index f02ce0f184..2f8ebac37b 100644 --- a/lib/gpu/lal_lj_cubic_ext.cpp +++ b/lib/gpu/lal_lj_cubic_ext.cpp @@ -58,7 +58,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, if (world_me==0) init_ok=LJCubicLMF.init(ntypes, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJCubicLMF.device->world_barrier(); @@ -77,7 +77,7 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, if (gpu_rank==i && world_me!=0) init_ok=LJCubicLMF.init(ntypes, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJCubicLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_dsf.cpp b/lib/gpu/lal_lj_dsf.cpp index b888f33f00..d41aa13deb 100644 --- a/lib/gpu/lal_lj_dsf.cpp +++ b/lib/gpu/lal_lj_dsf.cpp @@ -125,20 +125,9 @@ double LJDSFT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJDSFT::loop(const bool _eflag, const bool _vflag) { +int LJDSFT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -146,8 +135,8 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -163,6 +152,7 @@ void LJDSFT::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJDSF; diff --git a/lib/gpu/lal_lj_dsf.cu b/lib/gpu/lal_lj_dsf.cu index c1bb197148..5beedb0bbb 100644 --- a/lib/gpu/lal_lj_dsf.cu +++ b/lib/gpu/lal_lj_dsf.cu @@ -50,6 +50,9 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -59,18 +62,18 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, 
virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -130,7 +133,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; @@ -140,7 +143,7 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -151,9 +154,9 @@ __kernel void k_lj_dsf(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, @@ -176,28 +179,31 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { acctyp e_self = -((acctyp)0.5*e_shift + alpha/MY_PIS) * qtmp*qtmp*qqrd2e/(acctyp)t_per_atom; e_coul += (acctyp)2.0*e_self; @@ -257,7 +263,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) { numtyp e=prefactor*(erfcc-r*e_shift-rsq*f_shift-factor_coul); e_coul += e; @@ -267,7 +273,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -278,8 +284,7 @@ __kernel void k_lj_dsf_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_dsf.h b/lib/gpu/lal_lj_dsf.h index b176e087db..b303285e9c 100644 --- a/lib/gpu/lal_lj_dsf.h +++ b/lib/gpu/lal_lj_dsf.h @@ -77,7 +77,7 @@ class LJDSF : public BaseCharge { private: bool _allocated; numtyp _e_shift, _f_shift, _alpha, _cut_coulsq; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_dsf_ext.cpp b/lib/gpu/lal_lj_dsf_ext.cpp index 6d53896a11..e70059261c 100644 --- a/lib/gpu/lal_lj_dsf_ext.cpp +++ b/lib/gpu/lal_lj_dsf_ext.cpp @@ -59,7 +59,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJDMF.init(ntypes, cutsq, 
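Also uniform across these kernels: the closing '} // if ii' brace moves above the store call, so store_answers*() is now reached by every thread in the block, not only those with ii < inum. The likely reason (an inference, not stated in the patch) is that with the new n_stride / local_allocate_store_* preamble the store helpers stride through local memory and synchronize, and a divergent barrier would hang. Skeleton of the new control flow:

    __device__ void kernel_shape_sketch(const int ii, const int inum) {
      if (ii < inum) {
        // neighbor loop: accumulate f, energy, virial ...
      }                        // guard now closes *before* the store
      // store_answers_q(...); // unconditional; may barrier internally
    }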
host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); @@ -79,7 +79,7 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJDMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, e_shift, f_shift, alpha); diff --git a/lib/gpu/lal_lj_expand.cpp b/lib/gpu/lal_lj_expand.cpp index 1c58cecfae..3d9e526d0c 100644 --- a/lib/gpu/lal_lj_expand.cpp +++ b/lib/gpu/lal_lj_expand.cpp @@ -133,20 +133,9 @@ double LJExpandT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJExpandT::loop(const bool _eflag, const bool _vflag) { +int LJExpandT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -154,8 +143,8 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -168,6 +157,7 @@ void LJExpandT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJExpand; diff --git a/lib/gpu/lal_lj_expand.cu b/lib/gpu/lal_lj_expand.cu index 46ed9e2a31..2eff2cd89b 100644 --- a/lib/gpu/lal_lj_expand.cu +++ b/lib/gpu/lal_lj_expand.cu @@ -41,22 +41,25 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -108,9 +111,9 @@ __kernel void k_lj_expand(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, @@ -129,27 +132,30 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, 
__local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(numtyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -201,8 +207,8 @@ __kernel void k_lj_expand_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_lj_expand.h b/lib/gpu/lal_lj_expand.h index 2560d166c7..94448a871d 100644 --- a/lib/gpu/lal_lj_expand.h +++ b/lib/gpu/lal_lj_expand.h @@ -76,7 +76,7 @@ class LJExpand : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_expand_coul_long.cpp b/lib/gpu/lal_lj_expand_coul_long.cpp index 3e5e00ef6a..41c2ff6229 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cpp +++ b/lib/gpu/lal_lj_expand_coul_long.cpp @@ -140,20 +140,9 @@ double LJExpandCoulLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { +int LJExpandCoulLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -161,8 +150,8 @@ void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -178,6 +167,7 @@ void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJExpandCoulLong; diff --git a/lib/gpu/lal_lj_expand_coul_long.cu b/lib/gpu/lal_lj_expand_coul_long.cu index 0f0fe4c2fb..abb3d5ca3f 100644 --- a/lib/gpu/lal_lj_expand_coul_long.cu +++ b/lib/gpu/lal_lj_expand_coul_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ __kernel void k_lj_expand_coul_long(const 
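Buried in the k_lj_expand_fast hunk above is a small type fix riding along with the EVFLAG rework: the virial accumulators were zeroed with (numtyp)0 where every sibling kernel uses (acctyp)0. Harmless when the two types match, but a pointless conversion in mixed-precision builds. The corrected initialization in isolation (the precisions here are only a sketch):

    typedef float numtyp;     // position/parameter precision
    typedef double acctyp;    // accumulator precision

    __device__ void zero_virial_sketch(acctyp virial[6]) {
      for (int i = 0; i < 6; i++)
        virial[i] = (acctyp)0;   // was (numtyp)0 in k_lj_expand_fast
    }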
__global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -133,7 +136,7 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -144,9 +147,9 @@ __kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, @@ -168,6 +171,9 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].w) { @@ -254,7 +260,7 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, energy+=factor_lj*(e-lj3[mtype].z); } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -265,8 +271,8 @@ __kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } diff --git a/lib/gpu/lal_lj_expand_coul_long.h b/lib/gpu/lal_lj_expand_coul_long.h index 404a36e5bc..44f7aff3fe 100644 --- a/lib/gpu/lal_lj_expand_coul_long.h +++ b/lib/gpu/lal_lj_expand_coul_long.h @@ -80,7 +80,7 @@ class LJExpandCoulLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_expand_coul_long_ext.cpp b/lib/gpu/lal_lj_expand_coul_long_ext.cpp index 3ff1bef701..e5506dd7aa 100644 --- a/lib/gpu/lal_lj_expand_coul_long_ext.cpp +++ b/lib/gpu/lal_lj_expand_coul_long_ext.cpp @@ -58,7 +58,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); @@ -77,7 +77,7 @@ int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, 
max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git a/lib/gpu/lal_lj_expand_ext.cpp b/lib/gpu/lal_lj_expand_ext.cpp index 603e425d3f..02decf2712 100644 --- a/lib/gpu/lal_lj_expand_ext.cpp +++ b/lib/gpu/lal_lj_expand_ext.cpp @@ -56,7 +56,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, shift, special_lj, inum, nall, 300, + host_lj4, offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJEMF.device->world_barrier(); @@ -74,7 +74,7 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJEMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, shift, special_lj, inum, nall, 300, maxspecial, + offset, shift, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split,screen); LJEMF.device->world_barrier(); diff --git a/lib/gpu/lal_lj_ext.cpp b/lib/gpu/lal_lj_ext.cpp index 124cf46c8c..fa00fc4f64 100644 --- a/lib/gpu/lal_lj_ext.cpp +++ b/lib/gpu/lal_lj_ext.cpp @@ -55,7 +55,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); LJLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_gromacs.cpp b/lib/gpu/lal_lj_gromacs.cpp index 0563151ddd..8a385ece6b 100644 --- a/lib/gpu/lal_lj_gromacs.cpp +++ b/lib/gpu/lal_lj_gromacs.cpp @@ -121,20 +121,9 @@ double LJGROMACST::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJGROMACST::loop(const bool _eflag, const bool _vflag) { +int LJGROMACST::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -142,8 +131,8 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &ljsw, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &ljsw, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, @@ -159,6 +148,7 @@ void LJGROMACST::loop(const bool _eflag, const bool _vflag) { &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class LJGROMACS; diff --git a/lib/gpu/lal_lj_gromacs.cu b/lib/gpu/lal_lj_gromacs.cu index 21381bef30..4117cc1440 100644 --- a/lib/gpu/lal_lj_gromacs.cu +++ b/lib/gpu/lal_lj_gromacs.cu @@ -42,21 +42,24 @@ __kernel 
void k_lj_gromacs(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e += lj3[mtype].w; if (rsq > lj1[mtype].w) { @@ -108,7 +111,7 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -119,9 +122,9 @@ __kernel void k_lj_gromacs(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, @@ -142,6 +145,9 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 ljsw[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); e += lj3[mtype].w; if (rsq > lj1[mtype].w) { @@ -213,7 +219,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -224,8 +230,7 @@ __kernel void k_lj_gromacs_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_gromacs.h b/lib/gpu/lal_lj_gromacs.h index 3dec13c6d7..8fedaf07a1 100644 --- a/lib/gpu/lal_lj_gromacs.h +++ b/lib/gpu/lal_lj_gromacs.h @@ -76,7 +76,7 @@ class LJGROMACS : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_gromacs_ext.cpp b/lib/gpu/lal_lj_gromacs_ext.cpp index 99d32ab09a..19d1d12513 100644 --- a/lib/gpu/lal_lj_gromacs_ext.cpp +++ b/lib/gpu/lal_lj_gromacs_ext.cpp @@ -58,7 +58,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, int init_ok=0; if (world_me==0) LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); @@ -77,7 +77,7 @@ int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, } if (gpu_rank==i && world_me!=0) init_ok=LJGRMMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_ljsw1, host_ljsw2, host_ljsw3, 
host_ljsw4, host_ljsw5, cut_inner, cut_inner_sq); diff --git a/lib/gpu/lal_lj_sdk.cpp b/lib/gpu/lal_lj_sdk.cpp index c6a282576c..0da094c953 100644 --- a/lib/gpu/lal_lj_sdk.cpp +++ b/lib/gpu/lal_lj_sdk.cpp @@ -113,20 +113,9 @@ double CGCMMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CGCMMT::loop(const bool _eflag, const bool _vflag) { +int CGCMMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -134,8 +123,8 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -149,6 +138,7 @@ void CGCMMT::loop(const bool _eflag, const bool _vflag) { &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CGCMM; diff --git a/lib/gpu/lal_lj_sdk.cu b/lib/gpu/lal_lj_sdk.cu index 249b29a4b2..1bd9a93d5e 100644 --- a/lib/gpu/lal_lj_sdk.cu +++ b/lib/gpu/lal_lj_sdk.cu @@ -39,22 +39,25 @@ __kernel void k_lj_sdk(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && eflag) energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- lj3[mtype].z; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -111,9 +114,9 @@ __kernel void k_lj_sdk(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, @@ -132,27 +135,30 @@ __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) + if (EVFLAG && eflag) energy += factor_lj*inv1*(lj3[mtype].x*inv2-lj3[mtype].y)- lj3[mtype].z; - if (vflag>0) { + if (EVFLAG && vflag) { 
virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -209,8 +215,7 @@ __kernel void k_lj_sdk_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_lj_sdk.h b/lib/gpu/lal_lj_sdk.h index fc50756a3f..043bafdda8 100644 --- a/lib/gpu/lal_lj_sdk.h +++ b/lib/gpu/lal_lj_sdk.h @@ -71,7 +71,7 @@ class CGCMM : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_sdk_ext.cpp b/lib/gpu/lal_lj_sdk_ext.cpp index de0c5fef4f..4497233861 100644 --- a/lib/gpu/lal_lj_sdk_ext.cpp +++ b/lib/gpu/lal_lj_sdk_ext.cpp @@ -56,7 +56,7 @@ int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types, int init_ok=0; if (world_me==0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); CMMMF.device->world_barrier(); @@ -74,7 +74,7 @@ int sdk_gpu_init(const int ntypes, double **cutsq, int **cg_types, } if (gpu_rank==i && world_me!=0) init_ok=CMMMF.init(ntypes,cutsq,cg_types,host_lj1,host_lj2,host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); CMMMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_lj_sdk_long.cpp b/lib/gpu/lal_lj_sdk_long.cpp index 74dbfc40e3..d78e8d84da 100644 --- a/lib/gpu/lal_lj_sdk_long.cpp +++ b/lib/gpu/lal_lj_sdk_long.cpp @@ -124,20 +124,9 @@ double CGCMMLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { +int CGCMMLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -145,8 +134,8 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->atom->q, @@ -161,6 +150,7 @@ void CGCMMLongT::loop(const bool _eflag, const bool _vflag) { &_qqrd2e, &_g_ewald, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class CGCMMLong; diff --git a/lib/gpu/lal_lj_sdk_long.cu b/lib/gpu/lal_lj_sdk_long.cu index 6dd1829c71..3972ed2076 100644 --- a/lib/gpu/lal_lj_sdk_long.cu +++ b/lib/gpu/lal_lj_sdk_long.cu @@ -47,6 +47,9 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; @@ -56,18 +59,18 @@ 
__kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, sp_lj[6]=sp_lj_in[6]; sp_lj[7]=sp_lj_in[7]; - acctyp energy=(acctyp)0; - acctyp e_coul=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, e_coul, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + e_coul=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { @@ -138,7 +141,7 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, lj3[mtype].w; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -149,9 +152,9 @@ __kernel void k_lj_sdk_long(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, @@ -173,6 +176,9 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[8]; + int n_stride; + local_allocate_store_charge(); + if (tid<8) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { if (rsq < cut_coulsq) e_coul += prefactor*(_erfc-factor_coul); if (rsq < lj1[mtype].y) { @@ -264,7 +270,7 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, lj3[mtype].w; } } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -275,8 +281,7 @@ __kernel void k_lj_sdk_long_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag, + vflag,ans,engv); } - diff --git a/lib/gpu/lal_lj_sdk_long.h b/lib/gpu/lal_lj_sdk_long.h index 608488bd30..102b007b59 100644 --- a/lib/gpu/lal_lj_sdk_long.h +++ b/lib/gpu/lal_lj_sdk_long.h @@ -75,7 +75,7 @@ class CGCMMLong : public BaseCharge { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_lj_sdk_long_ext.cpp b/lib/gpu/lal_lj_sdk_long_ext.cpp index f293487282..3170ac8b52 100644 --- a/lib/gpu/lal_lj_sdk_long_ext.cpp +++ b/lib/gpu/lal_lj_sdk_long_ext.cpp @@ -58,7 +58,7 @@ int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type, int init_ok=0; if (world_me==0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e,g_ewald); @@ -77,7 +77,7 @@ int sdkl_gpu_init(const int ntypes, double **cutsq, int **cg_type, } if (gpu_rank==i && world_me!=0) init_ok=CMMLMF.init(ntypes, cutsq, cg_type, host_lj1, host_lj2, host_lj3, - host_lj4, offset, special_lj, inum, nall, 300, + host_lj4, offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, host_cut_ljsq, host_cut_coulsq, host_special_coul, qqrd2e, g_ewald); diff --git 
a/lib/gpu/lal_lj_tip4p_long.cpp b/lib/gpu/lal_lj_tip4p_long.cpp index 1f3b32248c..66477d1fb4 100644 --- a/lib/gpu/lal_lj_tip4p_long.cpp +++ b/lib/gpu/lal_lj_tip4p_long.cpp @@ -65,6 +65,12 @@ int LJTIP4PLongT::init(const int ntypes, k_pair_distrib.set_function(*this->pair_program,"k_lj_tip4p_long_distrib"); k_pair_reneigh.set_function(*this->pair_program,"k_lj_tip4p_reneigh"); k_pair_newsite.set_function(*this->pair_program,"k_lj_tip4p_newsite"); + #if defined(LAL_OCL_EV_JIT) + k_pair_distrib_noev.set_function(*this->pair_program_noev, + "k_lj_tip4p_long_distrib"); + #else + k_pair_dt_sel = &k_pair_distrib; + #endif TypeH = tH; TypeO = tO; @@ -151,6 +157,9 @@ void LJTIP4PLongT::clear() { k_pair_distrib.clear(); k_pair_reneigh.clear(); k_pair_newsite.clear(); + #if defined(LAL_OCL_EV_JIT) + k_pair_distrib_noev.clear(); + #endif this->clear_atomic(); } @@ -164,19 +173,9 @@ double LJTIP4PLongT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { +int LJTIP4PLongT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; int ainum=this->ans->inum(); const int nall = this->atom->nall(); @@ -210,8 +209,8 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { this->ansO.zero(); this->device->gpu->sync(); if(shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &lj1, &lj3, &_lj_types, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, @@ -228,12 +227,19 @@ void LJTIP4PLongT::loop(const bool _eflag, const bool _vflag) { &this->atom->q, &cutsq, &_qqrd2e, &_g_ewald, &cut_coulsq, &cut_coulsqplus, &this->ansO); } + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_pair_dt_sel = &k_pair_distrib; + else k_pair_dt_sel = &k_pair_distrib_noev; + #endif + GX=static_cast(ceil(static_cast(this->ans->inum())/BX)); - this->k_pair_distrib.set_size(GX,BX); - this->k_pair_distrib.run(&this->atom->x, &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, - &hneight, &m, &TypeO, &TypeH, &alpha,&this->atom->q, &this->ansO); + k_pair_dt_sel->set_size(GX,BX); + k_pair_dt_sel->run(&this->atom->x, &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &hneight, &m, &TypeO, &TypeH, + &alpha,&this->atom->q, &this->ansO); this->time_pair.stop(); + return GX; } @@ -269,22 +275,26 @@ void LJTIP4PLongT::copy_relations_data(int n, tagint *tag, int *map_array, } } - - - // --------------------------------------------------------------------------- // Copy nbor list from host if necessary and then calculate forces, virials,.. 
// --------------------------------------------------------------------------- template void LJTIP4PLongT::compute(const int f_ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, const double cpu_time, - bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, const double cpu_time, + bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -315,7 +325,7 @@ void LJTIP4PLongT::compute(const int f_ago, const int inum_full, t_ago = ago; loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,inum); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -325,16 +335,23 @@ void LJTIP4PLongT::compute(const int f_ago, const int inum_full, // --------------------------------------------------------------------------- template int** LJTIP4PLongT::compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int *map_array, int map_size, int *sametag, int max_same, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, - const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double *boxlo, double *prd) { + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int *map_array, int map_size, int *sametag, + int max_same, int **nspecial, tagint **special, + const bool eflag_in, const bool vflag_in, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd) { this->acc_timers(); + int eflag, vflag; + if (eflag_in) eflag=2; + else eflag=0; + if (vflag_in) vflag=2; + else vflag=0; + + this->set_kernel(eflag,vflag); if (inum_full==0) { host_start=0; // Make sure textures are correct if realloc by a different hybrid style @@ -373,7 +390,7 @@ int** LJTIP4PLongT::compute(const int ago, const int inum_full, t_ago = ago; loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,inum); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu index 782ae43662..bd900d9244 100644 --- a/lib/gpu/lal_lj_tip4p_long.cu +++ b/lib/gpu/lal_lj_tip4p_long.cu @@ -129,7 +129,7 @@ __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, f.x += fM.x * (acctyp)0.5 * alpha; f.y += fM.y * (acctyp)0.5 * alpha; f.z += fM.z * (acctyp)0.5 * alpha; - if (vflag > 0) { + if (EVFLAG && vflag) { vM = ansO[inum +iO]; engv[inum*engv_iter + i] += vM.x * (acctyp)0.5 * alpha; engv_iter++; engv[inum*engv_iter + i] += vM.y * (acctyp)0.5 * alpha; engv_iter++; @@ -147,13 +147,13 @@ 
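Both tip4p compute() overloads now widen the incoming bools to ints before kernel selection and keep the original bools for copy_answers(). Why the widened value is 2 rather than 1 is not visible in this hunk, so the sketch below simply mirrors the patch:

    static void widen_ev_flags_sketch(const bool eflag_in,
                                      const bool vflag_in,
                                      int &eflag, int &vflag) {
      eflag = eflag_in ? 2 : 0;   // values forwarded to set_kernel()/loop()
      vflag = vflag_in ? 2 : 0;
    }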
__kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_, f.x += fM.x * (acctyp)(1 - alpha); f.y += fM.y * (acctyp)(1 - alpha); f.z += fM.z * (acctyp)(1 - alpha); - if (eflag > 0) { + if (EVFLAG && eflag) { eM = engv[i+inum]; engv[inum+i] = eM*(acctyp)(1 - alpha); if (iH1 < inum) engv[inum+iH1] += eM * (acctyp)0.5 * alpha; if (iH2 < inum) engv[inum+iH2] += eM * (acctyp)0.5 * alpha; } - if (vflag > 0) { + if (EVFLAG && vflag) { vM = ansO[inum + i]; engv[inum*engv_iter + i] += vM.x * (acctyp)(1 - alpha); engv_iter++; engv[inum*engv_iter + i] += vM.y * (acctyp)(1 - alpha); engv_iter++; @@ -276,22 +276,27 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy = (acctyp)0; - acctyp e_coul = (acctyp)0; + int n_stride; + local_allocate_store_charge(); + acctyp4 f, fO; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0; - acctyp virial[6],vO[6]; - for (int i=0; i<6; i++) { - virial[i]=(acctyp)0; - vO[i]=(acctyp)0; + acctyp energy, e_coul, virial[6], vO[6]; + if (EVFLAG) { + energy = (acctyp)0; + e_coul = (acctyp)0; + for (int i=0; i<6; i++) { + virial[i]=(acctyp)0; + vO[i]=(acctyp)0; + } } + int i; if (ii0) { + if (EVFLAG && eflag) { numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y); energy += factor_lj * (e - lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*forcelj; virial[1] += dely*dely*forcelj; virial[2] += delz*delz*forcelj; @@ -396,10 +401,10 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, fO.z += delz * force_coul; fO.w += 0; } - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul); } - if (vflag>0) { + if (EVFLAG && vflag) { acctyp4 fd; fd.x = delx*force_coul; fd.y = dely*force_coul; @@ -489,10 +494,10 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, f.y += fd.y; f.z += fd.z; - if (eflag>0) { + if (EVFLAG && eflag) { e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp4 xH1; fetch4(xH1,iH1,pos_tex); numtyp4 xH2; fetch4(xH2,iH2,pos_tex); numtyp4 xO; fetch4(xO,iO,pos_tex); @@ -508,62 +513,64 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_, } } // if cut_coulsqplus } // for nbor - if (t_per_atom>1) { -#if (ARCH < 300) - __local acctyp red_acc[6][BLOCK_PAIR]; - red_acc[0][tid]=fO.x; - red_acc[1][tid]=fO.y; - red_acc[2][tid]=fO.z; - red_acc[3][tid]=fO.w; + } // if ii + if (t_per_atom>1) { +#if (SHUFFLE_AVAIL == 0) + red_acc[0][tid]=fO.x; + red_acc[1][tid]=fO.y; + red_acc[2][tid]=fO.z; + red_acc[3][tid]=fO.w; + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); + if (offset < s) { + for (int r=0; r<4; r++) + red_acc[r][tid] += red_acc[r][tid+s]; + } + } + fO.x=red_acc[0][tid]; + fO.y=red_acc[1][tid]; + fO.z=red_acc[2][tid]; + fO.w=red_acc[3][tid]; + if (EVFLAG && vflag) { + simdsync(); + for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; for (unsigned int s=t_per_atom/2; s>0; s>>=1) { + simdsync(); if (offset < s) { - for (int r=0; r<4; r++) + for (int r=0; r<6; r++) red_acc[r][tid] += red_acc[r][tid+s]; } } - fO.x=red_acc[0][tid]; - fO.y=red_acc[1][tid]; - fO.z=red_acc[2][tid]; - fO.w=red_acc[3][tid]; - if (vflag>0) { - for (int r=0; r<6; r++) red_acc[r][tid]=vO[r]; - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { - if (offset < s) { - for (int r=0; r<6; r++) - red_acc[r][tid] += red_acc[r][tid+s]; - } - } - for (int r=0; r<6; r++) vO[r]=red_acc[r][tid]; - } 
+      for (int r=0; r<6; r++) vO[r]=red_acc[r][tid];
+    }
 #else
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      fO.x += shfl_down(fO.x, s, t_per_atom);
+      fO.y += shfl_down(fO.y, s, t_per_atom);
+      fO.z += shfl_down(fO.z, s, t_per_atom);
+      fO.w += shfl_down(fO.w, s, t_per_atom);
+    }
+    if (EVFLAG && vflag) {
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-      fO.x += shfl_xor(fO.x, s, t_per_atom);
-      fO.y += shfl_xor(fO.y, s, t_per_atom);
-      fO.z += shfl_xor(fO.z, s, t_per_atom);
-      fO.w += shfl_xor(fO.w, s, t_per_atom);
-    }
-    if (vflag>0) {
-      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-        for (int r=0; r<6; r++)
-          vO[r] += shfl_xor(vO[r], s, t_per_atom);
-      }
+        for (int r=0; r<6; r++)
+          vO[r] += shfl_down(vO[r], s, t_per_atom);
       }
+    }
 #endif
+  }
+  if (offset == 0 && ii<inum) {
+    ansO[i] = fO;
+    if (EVFLAG && vflag) {
+      ansO[inum + i].x = vO[0];
+      ansO[inum + i].y = vO[1];
+      ansO[inum + i].z = vO[2];
+      ansO[inum*2 + i].x = vO[3];
+      ansO[inum*2 + i].y = vO[4];
+      ansO[inum*2 + i].z = vO[5];
+    }
+  }
-    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
-                    vflag,ans,engv);
-  } // if ii
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
 }

 __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
@@ -592,28 +599,32 @@
   __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[8];
+  int n_stride;
+  local_allocate_store_charge();
+
   if (tid<8)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     lj1[tid]=lj1_in[tid];
-    if (eflag>0)
+    if (EVFLAG && eflag)
       lj3[tid]=lj3_in[tid];
   }

-  acctyp energy = (acctyp)0;
-  acctyp e_coul = (acctyp)0;
   acctyp4 f, fO;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
   fO.x=(acctyp)0; fO.y=(acctyp)0; fO.z=(acctyp)0;
-  acctyp virial[6],vO[6];
-  for (int i=0; i<6; i++) {
-    virial[i]=(acctyp)0;
-    vO[i]=(acctyp)0;
+  acctyp energy, e_coul, virial[6], vO[6];
+  if (EVFLAG) {
+    energy = (acctyp)0;
+    e_coul = (acctyp)0;
+    for (int i=0; i<6; i++) {
+      virial[i]=(acctyp)0;
+      vO[i]=(acctyp)0;
+    }
   }

   __syncthreads();
   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e = r6inv * (lj3[mtype].x*r6inv-lj3[mtype].y);
         energy += factor_lj * (e - lj3[mtype].z);
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*forcelj;
         virial[1] += dely*dely*forcelj;
         virial[2] += delz*delz*forcelj;
@@ -720,10 +731,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
         fO.z += delz * force_coul;
         fO.w += 0;
       }
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         e_coul += prefactor*(_erfc-factor_coul);
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         acctyp4 fd;
         fd.x = delx*force_coul;
         fd.y = dely*force_coul;
@@ -813,10 +824,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
       f.y += fd.y;
       f.z += fd.z;

-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         e_coul += prefactor*(_erfc-factor_coul) * (acctyp)0.5 * alpha;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
         numtyp4 xH2; fetch4(xH2,iH2,pos_tex);
         numtyp4 xO; fetch4(xO,iO,pos_tex);
@@ -833,13 +844,13 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
       }
     } // if cut_coulsqplus
   } // for nbor
   if (t_per_atom>1) {
-#if (ARCH < 300)
-    __local acctyp red_acc[6][BLOCK_PAIR];
+#if (SHUFFLE_AVAIL == 0)
     red_acc[0][tid]=fO.x;
     red_acc[1][tid]=fO.y;
     red_acc[2][tid]=fO.z;
     red_acc[3][tid]=fO.w;
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+      simdsync();
       if (offset < s) {
         for (int r=0; r<4; r++)
           red_acc[r][tid] += red_acc[r][tid+s];
@@ -849,9 +860,10 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     fO.y=red_acc[1][tid];
     fO.z=red_acc[2][tid];
     fO.w=red_acc[3][tid];
-    if (vflag>0) {
+    if (EVFLAG && vflag) {
       for (int r=0; r<6; r++) red_acc[r][tid]=vO[r];
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
+        simdsync();
         if (offset < s) {
           for (int r=0; r<6; r++)
             red_acc[r][tid] += red_acc[r][tid+s];
@@ -861,22 +873,22 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     }
 #else
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
-      fO.x += shfl_xor(fO.x, s, t_per_atom);
-      fO.y += shfl_xor(fO.y, s, t_per_atom);
-      fO.z += shfl_xor(fO.z, s, t_per_atom);
-      fO.w += shfl_xor(fO.w, s, t_per_atom);
+      fO.x += shfl_down(fO.x, s, t_per_atom);
+      fO.y += shfl_down(fO.y, s, t_per_atom);
+      fO.z += shfl_down(fO.z, s, t_per_atom);
+      fO.w += shfl_down(fO.w, s, t_per_atom);
     }
-    if (vflag>0) {
+    if (EVFLAG && vflag) {
       for (unsigned int s=t_per_atom/2; s>0; s>>=1) {
         for (int r=0; r<6; r++)
-          vO[r] += shfl_xor(vO[r], s, t_per_atom);
+          vO[r] += shfl_down(vO[r], s, t_per_atom);
       }
     }
 #endif
   }
   if(offset == 0) {
     ansO[i] = fO;
-    if (vflag>0) {
+    if (EVFLAG && vflag) {
       ansO[inum + i].x = vO[0];
       ansO[inum + i].y = vO[1];
       ansO[inum + i].z = vO[2];
@@ -885,7 +897,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
       ansO[inum*2 + i].z = vO[5];
     }
   }
-  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
-                  vflag,ans,engv);
-  } // if ii
+  store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                  vflag,ans,engv);
 }
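The EVFLAG guards added throughout these kernels follow one pattern: EVFLAG is a preprocessor constant (0 or 1) fixed when each kernel variant is JIT-compiled, while eflag and vflag stay run-time arguments. In the EVFLAG=0 variant every energy/virial branch and its accumulators become dead code the compiler removes. A minimal self-contained sketch of the idea in plain C++ (the macro value and loop body are illustrative assumptions, not the kernel code itself):

    #include <cstdio>

    #define EVFLAG 0   // a second compile of the same source uses 1

    // Toy pair loop: with EVFLAG == 0 the accumulation below folds to
    // 'if (0)' and is removed at compile time, so the no-EV variant
    // carries no energy bookkeeping at all.
    double pair_loop(int eflag) {
      double energy = 0.0;
      for (int j = 1; j <= 8; ++j) {
        double e_pair = 1.0 / j;      // stand-in for a pair energy term
        if (EVFLAG && eflag)
          energy += e_pair;
      }
      return energy;
    }

    int main() { std::printf("%g\n", pair_loop(1)); return 0; }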
diff --git a/lib/gpu/lal_lj_tip4p_long.h b/lib/gpu/lal_lj_tip4p_long.h
index 90c342e246..b163a62309 100644
--- a/lib/gpu/lal_lj_tip4p_long.h
+++ b/lib/gpu/lal_lj_tip4p_long.h
@@ -74,13 +74,13 @@ public:
   /// Reimplement BaseCharge pair loop with device neighboring
   int** compute(const int ago, const int inum_full, const int nall,
-                double **host_x, int *host_type, double *sublo,
-                double *subhi, tagint *tag, int *map_array, int map_size,
-                int *sametag, int max_same, int **nspecial,
-                tagint **special, const bool eflag, const bool vflag,
-                const bool eatom, const bool vatom, int &host_start,
-                int **ilist, int **numj, const double cpu_time, bool &success,
-                double *charge, double *boxlo, double *prd);
+                double **host_x, int *host_type, double *sublo, double *subhi,
+                tagint *tag, int *map_array, int map_size, int *sametag,
+                int max_same, int **nspecial, tagint **special,
+                const bool eflag, const bool vflag, const bool eatom,
+                const bool vatom, int &host_start, int **ilist, int **numj,
+                const double cpu_time, bool &success, double *charge,
+                double *boxlo, double *prd);

   // --------------------------- TYPE DATA --------------------------
@@ -115,11 +115,12 @@ public:
   UCL_D_Vec<int> atom_sametag;

   UCL_Kernel k_pair_distrib, k_pair_reneigh, k_pair_newsite;
+  UCL_Kernel k_pair_distrib_noev, *k_pair_dt_sel;

 private:
   bool _allocated;
   int t_ago;
-  void loop(const bool _eflag, const bool _vflag);
+  int loop(const int eflag, const int vflag);
 };

 }
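The header now declares a second distribution kernel (k_pair_distrib_noev) and a selection pointer (k_pair_dt_sel), and loop() returns an int instead of taking bools. A plausible reading, sketched with plain function pointers: the host picks the EV or no-EV variant once per call and launches through the pointer, with loop() reporting the grid size it used. All names and the grid-size value below are illustrative stand-ins, not the real launch code.

    #include <cstdio>

    // Stand-ins for the two JIT-compiled kernel variants.
    void k_distrib_ev()   { std::puts("distrib with energy/virial"); }
    void k_distrib_noev() { std::puts("distrib forces only"); }

    // Mirrors the k_pair_dt_sel idea: select once, launch via pointer.
    int loop(int eflag, int vflag) {
      void (*k_sel)() = (eflag || vflag) ? k_distrib_ev : k_distrib_noev;
      const int GX = 128;  // the real code derives this from inum
      k_sel();
      return GX;           // callers can reuse the launch size
    }

    int main() { return loop(0, 0) > 0 ? 0 : 1; }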
diff --git a/lib/gpu/lal_lj_tip4p_long_ext.cpp b/lib/gpu/lal_lj_tip4p_long_ext.cpp
index d0d6c7a3d2..7395506c2d 100644
--- a/lib/gpu/lal_lj_tip4p_long_ext.cpp
+++ b/lib/gpu/lal_lj_tip4p_long_ext.cpp
@@ -62,7 +62,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
   if (world_me==0)
     init_ok=LJTIP4PLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                             host_lj4, offset, special_lj, inum,
-                            tH, tO, alpha, qdist, nall, 300,
+                            tH, tO, alpha, qdist, nall, max_nbors,
                             maxspecial, cell_size, gpu_split, screen,
                             host_cut_ljsq, host_cut_coulsq,
                             host_cut_coulsqplus, host_special_coul, qqrd2e,
                             g_ewald, map_size, max_same);
@@ -83,7 +83,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     if (gpu_rank==i && world_me!=0)
       init_ok=LJTIP4PLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
                               host_lj4, offset, special_lj, inum,
-                              tH, tO, alpha, qdist, nall, 300, maxspecial,
+                              tH, tO, alpha, qdist, nall, max_nbors, maxspecial,
                               cell_size, gpu_split, screen, host_cut_ljsq,
                               host_cut_coulsq, host_cut_coulsqplus,
                               host_special_coul, qqrd2e,
@@ -97,7 +97,7 @@ int ljtip4p_long_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
     fprintf(screen,"\n");

   if (init_ok==0)
-    LJTIP4PLMF.estimate_gpu_overhead();
+    LJTIP4PLMF.estimate_gpu_overhead(2);

   return init_ok;
 }
diff --git a/lib/gpu/lal_mie.cpp b/lib/gpu/lal_mie.cpp
index 394d1f8a2f..e370b7bde5 100644
--- a/lib/gpu/lal_mie.cpp
+++ b/lib/gpu/lal_mie.cpp
@@ -113,20 +113,9 @@ double MieT::host_memory_usage() const {
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void MieT::loop(const bool _eflag, const bool _vflag) {
+int MieT::loop(const int eflag, const int vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));

@@ -134,8 +123,8 @@ void MieT::loop(const bool _eflag, const bool _vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &mie1, &mie3, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &mie1, &mie3, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag, &vflag,
                           &ainum, &nbor_pitch, &this->_threads_per_atom);
@@ -147,6 +136,7 @@
                      &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
+  return GX;
 }

 template class Mie<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_mie.cu b/lib/gpu/lal_mie.cu
index 36ec8a496b..fedfaf157a 100644
--- a/lib/gpu/lal_mie.cu
+++ b/lib/gpu/lal_mie.cu
@@ -39,22 +39,25 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
   atom_info(t_per_atom,ii,tid,offset);

   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];

-  acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp virial[6];
-  for (int i=0; i<6; i++)
-    virial[i]=(acctyp)0;
+  acctyp energy, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }

   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) -
           mie3[mtype].z;
         energy+=factor_lj*e;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -105,9 +108,9 @@ __kernel void k_mie(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }

 __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
@@ -126,6 +129,9 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
   __local numtyp4 mie1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp4 mie3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=(mie3[mtype].x*rgamR - mie3[mtype].y*rgamA) -
           mie3[mtype].z;
         energy+=factor_lj*e;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -196,8 +202,7 @@ __kernel void k_mie_fast(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }
-
diff --git a/lib/gpu/lal_mie.h b/lib/gpu/lal_mie.h
index dfc2ee6e53..9a41596ccb 100644
--- a/lib/gpu/lal_mie.h
+++ b/lib/gpu/lal_mie.h
@@ -72,7 +72,7 @@ class Mie : public BaseAtomic<numtyp, acctyp> {
 private:
   bool _allocated;
-  void loop(const bool _eflag, const bool _vflag);
+  int loop(const int eflag, const int vflag);
 };

 }
diff --git a/lib/gpu/lal_mie_ext.cpp b/lib/gpu/lal_mie_ext.cpp
index f612de4336..5cbb9c29d2 100644
--- a/lib/gpu/lal_mie_ext.cpp
+++ b/lib/gpu/lal_mie_ext.cpp
@@ -58,7 +58,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
   if (world_me==0)
     init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3,
                       host_mie4, host_gamA, host_gamR,
-                      offset, special_lj, inum, nall, 300,
+                      offset, special_lj, inum, nall, max_nbors,
                       maxspecial, cell_size, gpu_split, screen);

   MLMF.device->world_barrier();
@@ -77,7 +77,7 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1,
     if (gpu_rank==i && world_me!=0)
       init_ok=MLMF.init(ntypes, cutsq, host_mie1, host_mie2, host_mie3,
                         host_mie4, host_gamA, host_gamR,
-                        offset, special_lj, inum, nall, 300, maxspecial,
+                        offset, special_lj, inum, nall, max_nbors, maxspecial,
                         cell_size, gpu_split, screen);

     MLMF.device->gpu_barrier();
diff --git a/lib/gpu/lal_morse.cpp b/lib/gpu/lal_morse.cpp
index 09da65d252..4bedc67ed7 100644
--- a/lib/gpu/lal_morse.cpp
+++ b/lib/gpu/lal_morse.cpp
@@ -112,20 +112,9 @@ double MorseT::host_memory_usage() const {
 // Calculate energies, forces, and torques
 // ---------------------------------------------------------------------------
 template <class numtyp, class acctyp>
-void MorseT::loop(const bool _eflag, const bool _vflag) {
+int MorseT::loop(const int eflag, const int vflag) {
   // Compute the block size and grid size to keep all cores busy
   const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;
-
   int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
                                (BX/this->_threads_per_atom)));

@@ -133,8 +122,8 @@ void MorseT::loop(const bool _eflag, const bool _vflag) {
   int nbor_pitch=this->nbor->nbor_pitch();
   this->time_pair.start();
   if (shared_types) {
-    this->k_pair_fast.set_size(GX,BX);
-    this->k_pair_fast.run(&this->atom->x, &mor1, &mor2, &sp_lj,
+    this->k_pair_sel->set_size(GX,BX);
+    this->k_pair_sel->run(&this->atom->x, &mor1, &mor2, &sp_lj,
                           &this->nbor->dev_nbor, &this->_nbor_data->begin(),
                           &this->ans->force, &this->ans->engv, &eflag, &vflag,
                           &ainum, &nbor_pitch,
@@ -147,6 +136,7 @@
                      &ainum, &nbor_pitch, &this->_threads_per_atom);
   }
   this->time_pair.stop();
+  return GX;
 }

 template class Morse<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_morse.cu b/lib/gpu/lal_morse.cu
index d6bab1e131..b1c8f2673b 100644
--- a/lib/gpu/lal_morse.cu
+++ b/lib/gpu/lal_morse.cu
@@ -41,22 +41,25 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
   atom_info(t_per_atom,ii,tid,offset);

   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   sp_lj[0]=sp_lj_in[0];
   sp_lj[1]=sp_lj_in[1];
   sp_lj[2]=sp_lj_in[2];
   sp_lj[3]=sp_lj_in[3];

-  acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp virial[6];
-  for (int i=0; i<6; i++)
-    virial[i]=(acctyp)0;
+  acctyp energy, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }

   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=mor2[mtype].x*(dexp*dexp - 2.0*dexp) - mor2[mtype].y;
         energy+=e*factor_lj;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -106,9 +109,9 @@ __kernel void k_morse(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }

 __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
@@ -127,27 +130,30 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
   __local numtyp4 mor1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp2 mor2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
   __local numtyp sp_lj[4];
+  int n_stride;
+  local_allocate_store_pair();
+
   if (tid<4)
     sp_lj[tid]=sp_lj_in[tid];
   if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
     mor1[tid]=mor1_in[tid];
-    if (eflag>0)
+    if (EVFLAG && eflag)
       mor2[tid]=mor2_in[tid];
   }

-  acctyp energy=(acctyp)0;
   acctyp4 f;
   f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
-  acctyp virial[6];
-  for (int i=0; i<6; i++)
-    virial[i]=(acctyp)0;
+  acctyp energy, virial[6];
+  if (EVFLAG) {
+    energy=(acctyp)0;
+    for (int i=0; i<6; i++) virial[i]=(acctyp)0;
+  }

   __syncthreads();
   if (ii<inum) {
-      if (eflag>0) {
+      if (EVFLAG && eflag) {
         numtyp e=mor2[mtype].x*(dm-dexp)-mor2[mtype].y;
         energy+=e*factor_lj;
       }
-      if (vflag>0) {
+      if (EVFLAG && vflag) {
         virial[0] += delx*delx*force;
         virial[1] += dely*dely*force;
         virial[2] += delz*delz*force;
@@ -197,8 +203,7 @@ __kernel void k_morse_fast(const __global numtyp4 *restrict x_,
       }
     } // for nbor
-    store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
-                  ans,engv);
   } // if ii
+  store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
+                ans,engv);
 }
-
diff --git a/lib/gpu/lal_morse.h b/lib/gpu/lal_morse.h
index bf5f1c0f8f..c5948d8be8 100644
--- a/lib/gpu/lal_morse.h
+++ b/lib/gpu/lal_morse.h
@@ -71,7 +71,7 @@ class Morse : public BaseAtomic<numtyp, acctyp> {
 private:
   bool _allocated;
-  void loop(const bool _eflag, const bool _vflag);
+  int loop(const int eflag, const int vflag);
 };

 }
diff --git a/lib/gpu/lal_morse_ext.cpp b/lib/gpu/lal_morse_ext.cpp
index 3b62d10305..f43676a1b5 100644
--- a/lib/gpu/lal_morse_ext.cpp
+++ b/lib/gpu/lal_morse_ext.cpp
@@ -56,7 +56,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   int init_ok=0;
   if (world_me==0)
     init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
-                       host_lj4, offset, special_lj, inum, nall, 300,
+                       host_lj4, offset, special_lj, inum, nall, max_nbors,
                        maxspecial, cell_size, gpu_split, screen);

   MORMF.device->world_barrier();
@@ -74,7 +74,7 @@ int mor_gpu_init(const int ntypes, double **cutsq,
   }
   if (gpu_rank==i && world_me!=0)
     init_ok=MORMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
-                       offset, special_lj, inum, nall, 300, maxspecial,
+                       offset, special_lj, inum, nall, max_nbors, maxspecial,
                        cell_size, gpu_split, screen);

     MORMF.device->gpu_barrier();
diff --git a/lib/gpu/lal_neighbor.cpp b/lib/gpu/lal_neighbor.cpp
index 6c4890ef47..aabba49575 100644
--- a/lib/gpu/lal_neighbor.cpp
+++ b/lib/gpu/lal_neighbor.cpp
@@ -1,6 +1,7 @@
 /***************************************************************************
                                neighbor.cpp
                              -------------------
+                            Nitin Dhamankar (Intel)
                             W. Michael Brown (ORNL)
                                 Peng Wang (Nvidia)

@@ -32,22 +33,25 @@ int Neighbor::bytes_per_atom(const int max_nbors) const {
 }

 bool Neighbor::init(NeighborShared *shared, const int inum,
-                    const int host_inum, const int max_nbors,
-                    const int maxspecial, UCL_Device &devi,
-                    const int gpu_nbor, const int gpu_host,
-                    const bool pre_cut, const int block_cell_2d,
-                    const int block_cell_id, const int block_nbor_build,
-                    const int threads_per_atom, const int warp_size,
-                    const bool time_device,
-                    const std::string compile_flags) {
+                    const int host_inum, const int max_nbors,
+                    const int maxspecial, UCL_Device &devi, const int gpu_nbor,
+                    const int gpu_host, const bool pre_cut,
+                    const int block_cell_2d, const int block_cell_id,
+                    const int block_nbor_build, const int threads_per_atom,
+                    const int simd_size, const bool time_device,
+                    const std::string compile_flags, const bool ilist_map) {
   clear();

+  _ilist_map = ilist_map;
   _threads_per_atom=threads_per_atom;
   _block_cell_2d=block_cell_2d;
   _block_cell_id=block_cell_id;
-  _max_block_nbor_build=block_nbor_build;
   _block_nbor_build=block_nbor_build;
-  _warp_size=warp_size;
+  _simd_size=simd_size;
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  if (_block_nbor_build < _simd_size)
+    _block_nbor_build = _simd_size;
+  #endif
   _shared=shared;
   dev=&devi;
   _gpu_nbor=gpu_nbor;
@@ -90,7 +94,13 @@ bool Neighbor::init(NeighborShared *shared, const int inum,

   _max_atoms=1000;
   _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
-  _max_nbors=(max_nbors/threads_per_atom+1)*threads_per_atom;
+
+  _max_neighbor_factor=1.0e-2*max_nbors*1.1;
+  if (_gpu_nbor != 1)
+    _max_nbors=0;
+  else
+    _max_nbors=300;
+  if (_old_max_nbors) _max_nbors=_old_max_nbors;

   _maxspecial=maxspecial;
   if (gpu_nbor==0)
@@ -103,8 +113,36 @@ bool Neighbor::init(NeighborShared *shared, const int inum,
   if (!success)
     return false;

-  if (_use_packing==false)
-    _shared->compile_kernels(devi,gpu_nbor,compile_flags);
+  if (_use_packing==false) {
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _shared->compile_kernels(devi, gpu_nbor, compile_flags+
+      " -DMAX_SUBGROUPS_PER_BLOCK="+toa(_block_nbor_build/_simd_size));
+    #else
+    _shared->compile_kernels(devi,gpu_nbor,compile_flags);
+    #endif
+
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    if (_gpu_nbor) {
+      #if defined(USE_OPENCL) && (defined(CL_VERSION_2_1) || \
+                                  defined(CL_VERSION_3_0))
+      if (dev->has_subgroup_support()) {
+        int simd_size_kernel=
+          _shared->k_build_nbor.max_subgroup_size(_block_nbor_build);
+        if (_simd_size != simd_size_kernel) {
+          _simd_size = simd_size_kernel;
+          if (_block_nbor_build < _simd_size)
+            _block_nbor_build = _simd_size;
+          _shared->clear();
+          _shared->compile_kernels(devi, gpu_nbor, compile_flags+
+            " -DMAX_SUBGROUPS_PER_BLOCK="+toa(_block_nbor_build/_simd_size));
+        }
+      }
+      #endif
+      _bin_stencil.get_global(*(_shared->build_program),"bin_stencil");
+    }
+    #endif
+  }
+  _max_block_nbor_build=_block_nbor_build;

   return success;
 }
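With this init() change, max_nbors no longer sets the initial neighbor-row count directly: _max_neighbor_factor = 1.0e-2*max_nbors*1.1 treats the argument as a percentage (with 10% padding) that later scales a density-derived estimate in build_nbor_list(). A rough stand-alone illustration of that reading; 4.18879 is 4*pi/3 for a cutoff sphere, and all the inputs here are invented for the example:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // Per-atom neighbor estimate from uniform density, scaled by the
    // percentage-style max_nbors factor described above.
    int estimate_rows(double atoms_per_vol, double cutoff, int max_nbors) {
      const double factor = 1.0e-2 * max_nbors * 1.1;
      const double in_sphere = 4.18879 * cutoff * cutoff * cutoff *
                               atoms_per_vol;   // 4/3*pi*r^3 * density
      return std::max(1, (int)std::ceil(factor * in_sphere));
    }

    int main() { std::printf("%d\n", estimate_rows(0.05, 10.0, 300)); }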
@@ -113,24 +151,44 @@ void Neighbor::alloc(bool &success) {
   dev_nbor.clear();
   host_acc.clear();
   int nt=_max_atoms+_max_host;
-  if (_use_packing==false || _gpu_nbor>0)
-    success=success &&
-      (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
-  else
+  if (_max_nbors)
+    _max_nbors = ((_max_nbors-1)/_threads_per_atom+1)*_threads_per_atom;
+  if (_use_packing==false || _gpu_nbor>0) {
+    if (_max_nbors)
+      success=success &&
+        (dev_nbor.alloc((_max_nbors+2)*_max_atoms,*dev)==UCL_SUCCESS);
+  } else
     success=success && (dev_nbor.alloc(3*_max_atoms,*dev,
                                        UCL_READ_ONLY)==UCL_SUCCESS);
-  success=success && (host_acc.alloc(nt*2,*dev,
-                                     UCL_READ_WRITE)==UCL_SUCCESS);
+  if (_gpu_nbor != 2 || _max_host>0)
+    success=success && (host_acc.alloc(nt*2,*dev,
+                                       UCL_READ_WRITE)==UCL_SUCCESS);

   _c_bytes=dev_nbor.row_bytes();
   if (_alloc_packed) {
+    if (_use_packing==false) {
+      dev_packed_begin.clear();
+      success=success && (dev_packed_begin.alloc(_max_atoms,*dev,
+                          _packed_permissions)==UCL_SUCCESS);
+    }
+
     dev_packed.clear();
-    success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
-                                         _packed_permissions)==UCL_SUCCESS);
-    dev_ilist.clear();
-    success=success && (dev_ilist.alloc(_max_atoms,*dev,
-                                        UCL_READ_WRITE)==UCL_SUCCESS);
-    _c_bytes+=dev_packed.row_bytes()+dev_ilist.row_bytes();
+    if (_max_nbors)
+      success=success && (dev_packed.alloc((_max_nbors+2)*_max_atoms,*dev,
+                                           _packed_permissions)==UCL_SUCCESS);
+    if (_ilist_map) {
+      if (_gpu_nbor) {
+        if (three_ilist.numel()==0)
+          success=success && (three_ilist.alloc(16,*dev,UCL_READ_WRITE,
+                                                UCL_READ_ONLY)==UCL_SUCCESS);
+      } else {
+        three_ilist.clear();
+        success=success && (three_ilist.alloc(_max_atoms,*dev,UCL_READ_WRITE,
+                                              UCL_READ_ONLY)==UCL_SUCCESS);
+      }
+      _c_bytes+=three_ilist.row_bytes();
+    }
+    _c_bytes+=dev_packed.row_bytes()+dev_packed_begin.row_bytes();
   }
   if (_max_host>0) {
     nbor_host.clear();
@@ -138,8 +196,9 @@ void Neighbor::alloc(bool &success) {
     host_ilist.clear();
     host_jlist.clear();

-    success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
-                             UCL_READ_WRITE)==UCL_SUCCESS) && success;
+    if (_max_nbors)
+      success=(nbor_host.alloc(_max_nbors*_max_host,*dev,UCL_READ_WRITE,
+                               UCL_READ_WRITE)==UCL_SUCCESS) && success;
     success=success && (dev_numj_host.alloc(_max_host,*dev,
                                             UCL_READ_WRITE)==UCL_SUCCESS);
     success=success && (host_ilist.alloc(nt,*dev,UCL_NOT_PINNED)==UCL_SUCCESS);
@@ -157,7 +216,8 @@ void Neighbor::alloc(bool &success) {
       ptr+=_max_nbors;
     }
     _c_bytes+=nbor_host.device.row_bytes()+dev_numj_host.row_bytes();
-  } else {
+  } else if (dev_nbor.numel()) {
+    if (!success) return;
     // Some OpenCL implementations return errors for nullptr pointers as args
     nbor_host.device.view(dev_nbor);
     dev_numj_host.view(dev_nbor);
@@ -188,6 +248,12 @@ void Neighbor::clear() {
   if (_ncells>0) {
     _ncells=0;
     cell_counts.clear();
+#ifndef LAL_USE_OLD_NEIGHBOR
+    cell_subgroup_counts.clear();
+    subgroup2cell.clear();
+    _host_bin_stencil.clear();
+    _bin_stencil.clear();
+#endif
     if (_gpu_nbor==2)
       delete [] cell_iter;
   }
@@ -195,12 +261,15 @@ void Neighbor::clear() {

   _allocated=false;
   _nbor_time_avail=false;
+  _old_max_nbors=_max_nbors;
+  _max_nbors=0;

   host_packed.clear();
   host_acc.clear();
-  dev_ilist.clear();
+  three_ilist.clear();
   dev_nbor.clear();
   nbor_host.clear();
   dev_packed.clear();
+  dev_packed_begin.clear();
   dev_numj_host.clear();
   host_ilist.clear();
   host_jlist.clear();
@@ -236,9 +305,9 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   UCL_H_Vec<int> ilist_view;
   ilist_view.view(ilist,inum,*dev);
   ucl_copy(dev_nbor,ilist_view,false);
-
-  UCL_D_Vec<int> nbor_offset;
-  UCL_H_Vec<int> host_offset;
+  #ifndef GERYON_OCL_FLUSH
+  dev_nbor.flush();
+  #endif

   int copy_count=0;
   int ij_count=0;
@@ -263,9 +332,12 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,

     if (ij_count==IJ_SIZE) {
       dev_nbor.sync();
-      host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
-      nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
-      ucl_copy(nbor_offset,host_offset,true);
+      _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
+      _nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
+      ucl_copy(_nbor_offset,_host_offset,true);
+      #ifndef GERYON_OCL_FLUSH
+      _nbor_offset.flush();
+      #endif
       copy_count++;
       ij_count=0;
       dev_count+=IJ_SIZE;
@@ -275,21 +347,29 @@ void Neighbor::get_host(const int inum, int *ilist, int *numj,
   }
   if (ij_count!=0) {
     dev_nbor.sync();
-    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
-    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
-    ucl_copy(nbor_offset,host_offset,true);
+    _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
+    _nbor_offset.view_offset(dev_count,dev_packed,ij_count);
+    ucl_copy(_nbor_offset,_host_offset,true);
+  }
+  _acc_view.view_offset(inum,dev_nbor,inum*2);
+  if (_use_packing)
+    ucl_copy(_acc_view,host_acc,inum*2,true);
+  else {
+    ucl_copy(_acc_view,host_acc,inum,true);
+    _host_offset.view_offset(inum,host_acc,inum);
+    ucl_copy(dev_packed_begin,_host_offset,inum,true);
   }
-  UCL_D_Vec<int> acc_view;
-  acc_view.view_offset(inum,dev_nbor,inum*2);
-  ucl_copy(acc_view,host_acc,inum*2,true);
-  UCL_H_Vec<int> host_view;
-  host_view.alloc(_max_atoms,*dev,UCL_READ_WRITE);
-  for (int ii=0; ii<inum; ii++) {
   time_nbor.stop();

   if (_use_packing==false) {
     time_kernel.start();
     int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
                                  block_size));
     _shared->k_nbor.set_size(GX,block_size);
-    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &dev_packed_begin, &inum,
+                        &_threads_per_atom);
     time_kernel.stop();
   }
 }
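get_host() stages the packed neighbor list to the device in IJ_SIZE chunks that alternate between the two halves of the pinned host_packed buffer (the IJ_SIZE*(copy_count%2) offset), so the asynchronous copy of one chunk can overlap packing of the next; dev_packed_begin now records separately where each atom's entries start. A toy version of the ping-pong staging, with a tiny chunk size for illustration:

    #include <cstdio>
    #include <vector>

    int main() {
      const int IJ_SIZE = 4;                // 131072 in lal_neighbor.h
      std::vector<int> host_packed(2 * IJ_SIZE);
      std::vector<int> device;              // stand-in for dev_packed
      int copy_count = 0, ij_count = 0;
      for (int ij = 0; ij < 10; ++ij) {     // 10 entries to stage
        host_packed[IJ_SIZE * (copy_count % 2) + ij_count++] = ij;
        if (ij_count == IJ_SIZE) {          // chunk full: "async" copy
          const int off = IJ_SIZE * (copy_count % 2);
          device.insert(device.end(), host_packed.begin() + off,
                        host_packed.begin() + off + IJ_SIZE);
          ++copy_count;                     // switch to the other half
          ij_count = 0;
        }
      }
      std::printf("%zu copied, %d pending\n", device.size(), ij_count);
    }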
@@ -315,9 +396,6 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
   ilist_view.view(ilist,inum,*dev);
   ucl_copy(dev_nbor,ilist_view,false);

-  UCL_D_Vec<int> nbor_offset;
-  UCL_H_Vec<int> host_offset;
-
   int copy_count=0;
   int ij_count=0;
   int acc_count=0;
@@ -346,9 +424,9 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,

     if (ij_count==IJ_SIZE) {
       dev_nbor.sync();
-      host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
-      nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
-      ucl_copy(nbor_offset,host_offset,true);
+      _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,IJ_SIZE);
+      _nbor_offset.view_offset(dev_count,dev_packed,IJ_SIZE);
+      ucl_copy(_nbor_offset,_host_offset,true);
       copy_count++;
       ij_count=0;
       dev_count+=IJ_SIZE;
@@ -358,13 +436,18 @@ void Neighbor::get_host3(const int inum, const int nlist, int *ilist, int *numj,
   }
   if (ij_count!=0) {
     dev_nbor.sync();
-    host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
-    nbor_offset.view_offset(dev_count,dev_packed,ij_count);
-    ucl_copy(nbor_offset,host_offset,true);
+    _host_offset.view_offset(IJ_SIZE*(copy_count%2),host_packed,ij_count);
+    _nbor_offset.view_offset(dev_count,dev_packed,ij_count);
+    ucl_copy(_nbor_offset,_host_offset,true);
+  }
+  _acc_view.view_offset(inum,dev_nbor,inum*2);
+  if (_use_packing)
+    ucl_copy(_acc_view,host_acc,inum*2,true);
+  else {
+    ucl_copy(_acc_view,host_acc,inum,true);
+    _host_offset.view_offset(inum,host_acc,inum);
+    ucl_copy(dev_packed_begin,_host_offset,inum,true);
   }
-  UCL_D_Vec<int> acc_view;
-  acc_view.view_offset(inum,dev_nbor,inum*2);
-  ucl_copy(acc_view,host_acc,inum*2,true);
   time_nbor.stop();

   if (_use_packing==false) {
     time_kernel.start();
     int GX=static_cast<int>(ceil(static_cast<double>(inum)*_threads_per_atom/
                                  block_size));
     _shared->k_nbor.set_size(GX,block_size);
-    _shared->k_nbor.run(&dev_nbor, &dev_packed, &inum, &_threads_per_atom);
+    _shared->k_nbor.run(&dev_nbor, &dev_packed, &dev_packed_begin, &inum,
+                        &_threads_per_atom);
     time_kernel.stop();
   }
 }

 template <class numtyp>
-void Neighbor::resize_max_neighbors(const int maxn, bool &success) {
+void Neighbor::resize_max_neighbors(int maxn, bool &success) {
+  if (maxn == 0) maxn = 1;
   if (maxn>_max_nbors) {
     int mn=static_cast<int>(static_cast<double>(maxn)*1.10);
-    mn=(mn/_threads_per_atom+1)*_threads_per_atom;
-    success=success && (dev_nbor.resize((mn+1)*_max_atoms)==UCL_SUCCESS);
+    mn = ((mn-1)/_threads_per_atom+1)*_threads_per_atom;
+    dev_nbor.clear();
+    success=success &&
+      (dev_nbor.alloc((mn+2)*_max_atoms,*dev)==UCL_SUCCESS);
+    if (!success) return;
     _gpu_bytes=dev_nbor.row_bytes();
     if (_max_host>0) {
-      success=success && (nbor_host.resize(mn*_max_host)==UCL_SUCCESS);
+      nbor_host.clear();
+      success=(nbor_host.alloc(mn*_max_host,*dev,UCL_READ_WRITE,
+                               UCL_READ_WRITE)==UCL_SUCCESS) && success;
+      if (!success) return;
       int *ptr=nbor_host.host.begin();
       for (int i=0; i<_max_host; i++) {
         host_jlist[i]=ptr;
@@ -397,7 +488,9 @@ void Neighbor::resize_max_neighbors(int maxn, bool &success) {
       dev_numj_host.view(dev_nbor);
     }
     if (_alloc_packed) {
-      success=success && (dev_packed.resize((mn+2)*_max_atoms)==UCL_SUCCESS);
+      dev_packed.clear();
+      success=success && (dev_packed.alloc((mn+2)*_max_atoms,*dev,
+                                           _packed_permissions)==UCL_SUCCESS);
      _gpu_bytes+=dev_packed.row_bytes();
     }
     _max_nbors=mn;
@@ -409,32 +502,66 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
                                const int nall, Atom<numtyp,acctyp> &atom,
                                double *sublo, double *subhi, tagint *tag,
                                int **nspecial, tagint **special, bool &success,
-                               int &mn) {
+                               int &mn, UCL_Vector<int,int> &error_flag) {
   _nbor_time_avail=true;
   const int nt=inum+host_inum;

+  const double subx = subhi[0]-sublo[0];
+  const double suby = subhi[1]-sublo[1];
+  const double subz = subhi[2]-sublo[2];
+
   // Calculate number of cells and allocate storage for binning as necessary
-  int ncellx, ncelly, ncellz, ncell_3d;
-  int ghost_cells=2*_cells_in_cutoff;
-  ncellx = static_cast<int>(ceil((subhi[0]-sublo[0])/_cell_size))+ghost_cells;
-  ncelly = static_cast<int>(ceil((subhi[1]-sublo[1])/_cell_size))+ghost_cells;
-  ncellz = static_cast<int>(ceil((subhi[2]-sublo[2])/_cell_size))+ghost_cells;
-  ncell_3d = ncellx * ncelly * ncellz;
+  int ncellx, ncelly, ncellz;
+  int cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
+  int ghost_cells=2*cells_in_cutoff;
+  ncellx = static_cast<int>(ceil(subx/_cell_size))+ghost_cells;
+  ncelly = static_cast<int>(ceil(suby/_cell_size))+ghost_cells;
+  ncellz = static_cast<int>(ceil(subz/_cell_size))+ghost_cells;
+
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  if (_auto_cell_size && subz>0.0) {
+    if (_old_ncellx!=ncellx || _old_ncelly!=ncelly || _old_ncellz!=ncellz) {
+      _cell_size = _shared->best_cell_size(subx, suby, subz, nt, _cutoff);
+      cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
+      ghost_cells=2*cells_in_cutoff;
+      ncellx = static_cast<int>(ceil(subx/_cell_size))+ghost_cells;
+      ncelly = static_cast<int>(ceil(suby/_cell_size))+ghost_cells;
+      ncellz = static_cast<int>(ceil(subz/_cell_size))+ghost_cells;
+    }
+  }
+  #endif
+
+  int ncell_3d = ncellx * ncelly * ncellz;
   if (ncell_3d+1>_ncells) {
     cell_counts.clear();
+#ifndef LAL_USE_OLD_NEIGHBOR
+    cell_subgroup_counts.clear();
+#endif
     if (_gpu_nbor==2) {
       if (_ncells>0)
         delete [] cell_iter;
       cell_iter = new int[ncell_3d+1];
-      cell_counts.alloc(ncell_3d+1,dev_nbor,UCL_READ_WRITE,UCL_READ_ONLY);
+      success = success && (cell_counts.alloc(ncell_3d+1,*dev,
+                            UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS);
+#ifndef LAL_USE_OLD_NEIGHBOR
+      success = success && (cell_subgroup_counts.alloc(ncell_3d+1,*dev,
+                            UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS);
+      if (!success) return;
+      cell_subgroup_counts.host[0]=0;
+#endif
     } else {
       cell_counts.device.clear();
-      cell_counts.device.alloc(ncell_3d+1,dev_nbor);
+      success = success && (cell_counts.device.alloc(ncell_3d+1,
+                            *dev) == UCL_SUCCESS);
     }
+    if (!success) return;
     _ncells=ncell_3d+1;
     _cell_bytes=cell_counts.device.row_bytes();
+#ifndef LAL_USE_OLD_NEIGHBOR
+    _cell_bytes+=cell_subgroup_counts.row_bytes()+subgroup2cell.row_bytes();
+#endif
   }

   const numtyp cutoff_cast=static_cast<numtyp>(_cutoff);
@@ -463,7 +590,13 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
   }

   // If binning on CPU, do this now
+#ifndef LAL_USE_OLD_NEIGHBOR
+  int subgroup_count = 0;
+#endif
   if (_gpu_nbor==2) {
+    #ifndef GERYON_OCL_FLUSH
+    dev_nbor.flush();
+    #endif
     double stime = MPI_Wtime();
     int *cell_id=atom.host_cell_id.begin();
     int *particle_id=atom.host_particle_id.begin();
@@ -472,21 +605,21 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     cell_counts.host.zero();
     double i_cell_size=1.0/_cell_size;

-    int offset_hi=_cells_in_cutoff+1;
+    int offset_hi=cells_in_cutoff+1;
     for (int i=0; i<nt; i++) {
-      int ix = static_cast<int>(px*i_cell_size+1);
-      ix = std::max(ix,_cells_in_cutoff);
+      int ix = static_cast<int>(px*i_cell_size+cells_in_cutoff);
+      ix = std::max(ix,cells_in_cutoff);
       ix = std::min(ix,ncellx-offset_hi);
-      int iy = static_cast<int>(py*i_cell_size+1);
-      iy = std::max(iy,_cells_in_cutoff);
+      int iy = static_cast<int>(py*i_cell_size+cells_in_cutoff);
+      iy = std::max(iy,cells_in_cutoff);
       iy = std::min(iy,ncelly-offset_hi);
-      int iz = static_cast<int>(pz*i_cell_size+1);
-      iz = std::max(iz,_cells_in_cutoff);
+      int iz = static_cast<int>(pz*i_cell_size+cells_in_cutoff);
+      iz = std::max(iz,cells_in_cutoff);
       iz = std::min(iz,ncellz-offset_hi);

       int id = ix+iy*ncellx+iz*ncellx*ncelly;
@@ -494,19 +627,40 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       cell_counts[id+1]++;
     }

+#ifndef LAL_USE_OLD_NEIGHBOR
+    // populate subgroup counts only for the local atoms
+    for (int i=1; i<_ncells; i++) {
+      cell_subgroup_counts[i] = ceil(static_cast<double>(cell_counts[i]) /
+                                     _simd_size);
+      subgroup_count += cell_subgroup_counts[i];
+      cell_subgroup_counts[i] += cell_subgroup_counts[i-1];
+    }
+    if (subgroup_count > subgroup2cell.numel()) {
+      subgroup2cell.clear();
+      success = success && (subgroup2cell.alloc(1.1*subgroup_count,*dev,
+                            UCL_READ_WRITE,UCL_READ_ONLY) == UCL_SUCCESS);
+      if (!success) return;
+      _cell_bytes=cell_counts.device.row_bytes() +
+        cell_subgroup_counts.row_bytes()+subgroup2cell.row_bytes();
+    }
+    for (int i=1; i<_ncells; i++)
+      for (int j=cell_subgroup_counts[i-1]; j<cell_subgroup_counts[i]; j++)
+        subgroup2cell[j]=i-1;
+#endif
-      int ix = static_cast<int>(px*i_cell_size+1);
+      int ix = static_cast<int>(px*i_cell_size);
       ix = std::max(ix,0);
       ix = std::min(ix,ncellx-1);
-      int iy = static_cast<int>(py*i_cell_size+1);
+      int iy = static_cast<int>(py*i_cell_size);
       iy = std::max(iy,0);
       iy = std::min(iy,ncelly-1);
-      int iz = static_cast<int>(pz*i_cell_size+1);
+      int iz = static_cast<int>(pz*i_cell_size);
       iz = std::max(iz,0);
       iz = std::min(iz,ncellz-1);

@@ -518,21 +672,54 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
     mn=0;
     for (int i=0; i<_ncells; i++)
       mn=std::max(mn,cell_counts[i]);
-    mn*=8;
-    set_nbor_block_size(mn/2);
-
+    double mind=std::min(subx,suby);
+    mind=std::min(mind,subz) + _cutoff;
+    double ics;
+    if (mind >= _cell_size) ics = i_cell_size;
+    else ics = 1.0 / mind;
+    double vadjust=_cutoff*ics;
+    vadjust*=vadjust*vadjust*4.1888;
+    if (_cutoff < _cell_size) vadjust*=1.46;
+    mn=std::max(mn,static_cast<int>(ceil(_max_neighbor_factor*vadjust*mn)));
+    if (mn<33) mn+=3;
     resize_max_neighbors(mn,success);
+    set_nbor_block_size(mn/2);
     if (!success)
       return;
     _total_atoms=nt;

+    // For neighbor builds for host atoms, _max_nbors is used for neighbor
+    // allocation offsets.
+    if (_max_host > 0) mn=_max_nbors;
+
     cell_iter[0]=0;
     for (int i=1; i<_ncells; i++) {
       cell_counts[i]+=cell_counts[i-1];
       cell_iter[i]=cell_counts[i];
     }
     time_hybrid1.start();
-    cell_counts.update_device(true);
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    if (_old_ncellx!=ncellx || _old_ncelly!=ncelly || _old_ncellz!=ncellz) {
+      _old_ncellx = ncellx;
+      _old_ncelly = ncelly;
+      _old_ncellz = ncellz;
+      const int bin_stencil_stride = cells_in_cutoff * 2 + 1;
+      const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride;
+      if (bin_stencil_size > _host_bin_stencil.numel())
+        _host_bin_stencil.alloc(bin_stencil_size,*dev);
+      for (int s = 0; s<bin_stencil_size; s++)
     _shared->k_cell_id.run(&atom.x, &atom.dev_cell_id,
                            &atom.dev_particle_id, &sublo0, &sublo1,
                            &sublo2, &i_cell_size, &ncellx, &ncelly, &ncellz,
-                           &nt, &nall, &_cells_in_cutoff);
+                           &nt, &nall, &cells_in_cutoff);

     atom.sort_neighbor(nall);
@@ -575,22 +762,37 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,

   /* build the neighbor list */
   const int cell_block=_block_nbor_build;
+#ifndef LAL_USE_OLD_NEIGHBOR
+  int nblocks = (subgroup_count-1)/(cell_block/_simd_size)+1;
+  _shared->k_build_nbor.set_size(nblocks, cell_block);
+  _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
+                            &cell_counts, &dev_nbor, &nbor_host,
+                            &dev_numj_host, &mn, &cutoff_cast, &ncellx,
+                            &ncelly, &ncellz, &inum, &nt, &nall,
+                            &_threads_per_atom, &cells_in_cutoff,
+                            &cell_subgroup_counts, &subgroup2cell,
+                            &subgroup_count, _bin_stencil.begin(),
+                            &error_flag);
+  error_flag.update_host();
+#else
   _shared->k_build_nbor.set_size(ncellx-ghost_cells,(ncelly-ghost_cells)*
                                  (ncellz-ghost_cells),cell_block,1);
   _shared->k_build_nbor.run(&atom.x, &atom.dev_particle_id,
                             &cell_counts, &dev_nbor, &nbor_host,
-                            &dev_numj_host, &_max_nbors, &cutoff_cast, &ncellx,
+                            &dev_numj_host, &mn, &cutoff_cast, &ncellx,
                             &ncelly, &ncellz, &inum, &nt, &nall,
-                            &_threads_per_atom, &_cells_in_cutoff);
+                            &_threads_per_atom, &cells_in_cutoff);
+#endif

   /* Get the maximum number of nbors and realloc if necessary */
-  UCL_D_Vec<int> numj;
-  numj.view_offset(inum,dev_nbor,inum);
-  ucl_copy(host_acc,numj,inum,true);
-  if (nt>inum) {
-    UCL_H_Vec<int> host_offset;
-    host_offset.view_offset(inum,host_acc,nt-inum);
-    ucl_copy(host_offset,dev_numj_host,nt-inum,true);
+  UCL_D_Vec<int> _numj_view;
+  if (_gpu_nbor!=2 || inum<nt) {
+    if (nt>inum) {
+      _host_offset.view_offset(inum,host_acc,nt-inum);
+      ucl_copy(_host_offset,dev_numj_host,nt-inum,true);
+    }
   }

   if (_gpu_nbor!=2) {
@@ -608,7 +810,7 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
       if (_time_device)
         time_kernel.add_to_total();
       build_nbor_list(x, inum, host_inum, nall, atom, sublo, subhi, tag,
-                      nspecial, special, success, mn);
+                      nspecial, special, success, mn, error_flag);
       return;
     }
   }
@@ -634,5 +836,5 @@ void Neighbor::build_nbor_list(double **x, const int inum, const int host_inum,
 template void Neighbor::build_nbor_list<PRECISION>
     (double **x, const int inum, const int host_inum, const int nall,
      Atom<PRECISION,ACC_PRECISION> &atom, double *sublo, double *subhi,
-     tagint *, int **, tagint **, bool &success, int &mn);
-
+     tagint *, int **, tagint **, bool &success, int &mn,
+     UCL_Vector<int,int> &error_flag);
diff --git a/lib/gpu/lal_neighbor.h b/lib/gpu/lal_neighbor.h
index 996deaff6d..5939567a41 100644
--- a/lib/gpu/lal_neighbor.h
+++ b/lib/gpu/lal_neighbor.h
@@ -1,6 +1,7 @@
 /***************************************************************************
                                 neighbor.h
                              -------------------
+                            Nitin Dhamankar (Intel)
                             W. Michael Brown (ORNL)
                                 Peng Wang (Nvidia)

@@ -19,14 +20,25 @@

 #include "lal_atom.h"
 #include "lal_neighbor_shared.h"
+#include <sstream>

 #define IJ_SIZE 131072

+#if !defined(USE_OPENCL) && !defined(USE_HIP)
+#ifndef LAL_USE_OLD_NEIGHBOR
+// Issue with incorrect results with CUDA 11.2
+#if (CUDA_VERSION > 11019) && (CUDA_VERSION < 11030)
+#define LAL_USE_OLD_NEIGHBOR
+#endif
+#endif
+#endif
+
 namespace LAMMPS_AL {

 class Neighbor {
  public:
-  Neighbor() : _allocated(false), _use_packing(false), _ncells(0) {}
+  Neighbor() : _allocated(false), _use_packing(false), _ncells(0),
+               _old_max_nbors(0) {}
   ~Neighbor() { clear(); }

   /// Determine whether neighbor unpacking should be used
@@ -37,7 +49,7 @@ class Neighbor {
   /// Clear any old data and setup for new LAMMPS run
   /** \param inum Initial number of particles whose neighbors stored on device
    * \param host_inum Initial number of particles whose nbors copied to host
-   * \param max_nbors Initial number of rows in the neighbor matrix
+   * \param max_nbors Factor (in percentage) applied to density calculated max
    * \param gpu_nbor 0 if neighboring will be performed on host
    *        gpu_nbor 1 if neighboring will be performed on device
    *        gpu_nbor 2 if binning on host and neighboring on device
@@ -48,33 +60,41 @@ class Neighbor {
    *        than the force kernel
    * \param threads_per_atom Number of threads used per atom for force
    *        calculation
-   * \param compile_flags Flags for JIT compiling **/
+   * \param compile_flags Flags for JIT compiling
+   * \param ilist_map true if ilist mapping data structures used (3-body) **/
   bool init(NeighborShared *shared, const int inum, const int host_inum,
             const int max_nbors, const int maxspecial, UCL_Device &dev,
             const int gpu_nbor, const int gpu_host, const bool pre_cut,
             const int block_cell_2d, const int block_cell_id,
             const int block_nbor_build, const int threads_per_atom,
-            const int warp_size, const bool time_device,
-            const std::string compile_flags);
+            const int simd_size, const bool time_device,
+            const std::string compile_flags, const bool ilist_map);

-  /// Set the size of the cutoff+skin
-  inline void cell_size(const double size, const double cutoff) {
-    _cell_size=size;
+  /// Set the cutoff+skin
+  inline void set_cutoff(const double cutoff) {
     _cutoff=cutoff;
-    if (cutoff>size)
-      _cells_in_cutoff=static_cast<int>(ceil(cutoff/size));
-    else
-      _cells_in_cutoff=1;
+
+    #ifndef LAL_USE_OLD_NEIGHBOR
+    _cell_size=_shared->cell_size();
+    _auto_cell_size=_shared->auto_cell_size();
+    const int cells_in_cutoff=static_cast<int>(ceil(_cutoff/_cell_size));
+    if (cells_in_cutoff > 2) _cell_size=_cutoff*0.5;
+    _old_ncellx = _old_ncelly = _old_ncellz = -1;
+    #else
+    _cell_size=cutoff;
+    _auto_cell_size=false;
+    #endif
   }

-  /// Get the size of the cutoff+skin
-  inline double cell_size() const { return _cell_size; }
+  /// Get the cutoff+skin
+  inline double cutoff() { return _cutoff; }

   /// Check if there is enough memory for neighbor data and realloc if not
   /** \param inum Number of particles whose nbors will be stored on device
    * \param max_nbor Current max number of neighbors for a particle
    * \param success False if insufficient memory **/
-  inline void resize(const int inum, const int max_nbor, bool &success) {
+  inline void resize(const int inum, int max_nbor, bool &success) {
+    if (max_nbor == 0) max_nbor = 1;
     if (inum>_max_atoms || max_nbor>_max_nbors) {
       _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
       if (max_nbor>_max_nbors)
@@ -88,8 +108,9 @@ class Neighbor {
    * \param host_inum Number of particles whose nbors will be copied to host
    * \param max_nbor Current max number of neighbors for a particle
    * \param success False if insufficient memory **/
-  inline void resize(const int inum, const int host_inum, const int max_nbor,
+  inline void resize(const int inum, const int host_inum, int max_nbor,
                      bool &success) {
+    if (max_nbor == 0) max_nbor = 1;
     if (inum>_max_atoms || max_nbor>_max_nbors || host_inum>_max_host) {
       _max_atoms=static_cast<int>(static_cast<double>(inum)*1.10);
       _max_host=static_cast<int>(static_cast<double>(host_inum)*1.10);
@@ -99,15 +120,8 @@ class Neighbor {
     }
   }

-  inline void acc_timers() {
+  inline void acc_timers(FILE *screen) {
     if (_nbor_time_avail) {
-      if (_gpu_nbor==2) {
-        int mn=0;
-        for (int i=0; i<_total_atoms; i++)
-          mn=std::max(mn,host_acc[i]);
-        if (mn>_max_nbors)
-          assert(0==1);
-      }
       if (_time_device) {
         time_nbor.add_to_total();
         if (_use_packing==false)
           time_kernel.add_to_total();
@@ -172,9 +186,10 @@ class Neighbor {
   /// Build nbor list on the device
   template <class numtyp>
   void build_nbor_list(double **x, const int inum, const int host_inum,
-                       const int nall, Atom<numtyp,acctyp> &atom, double *sublo,
-                       double *subhi, tagint *tag, int **nspecial,
-                       tagint **special, bool &success, int &max_nbors);
+                       const int nall, Atom<numtyp,acctyp> &atom,
+                       double *sublo, double *subhi, tagint *tag,
+                       int **nspecial, tagint **special, bool &success,
+                       int &max_nbors, UCL_Vector<int,int> &error_flag);

   /// Return the number of bytes used on device
   inline double gpu_bytes() {
@@ -193,14 +208,16 @@ class Neighbor {
    * - 3rd row is starting location in packed nbors
    * - Remaining rows are the neighbors arranged for coalesced access **/
   UCL_D_Vec<int> dev_nbor;
+  /// Starting location in packed neighbors used only by unpack kernel
+  UCL_D_Vec<int> dev_packed_begin;
   /// Packed storage for neighbor lists copied from host
   UCL_D_Vec<int> dev_packed;
   /// Host buffer for copying neighbor lists
   UCL_H_Vec<int> host_packed;
   /// Host storage for nbor counts (row 1) & accumulated neighbor counts (row2)
   UCL_H_Vec<int> host_acc;
-  /// Device storage for accessing atom indices from the neighbor list (3-body)
-  UCL_D_Vec<int> dev_ilist;
+  /// Storage for accessing atom indices from the neighbor list (3-body)
+  UCL_Vector<int,int> three_ilist;

   // ----------------- Data for GPU Neighbor Calculation ---------------

@@ -217,18 +234,36 @@ class Neighbor {
   UCL_D_Vec<tagint> dev_special, dev_special_t;
   /// Host/Device storage for number of particles per cell
   UCL_Vector<int,int> cell_counts;
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  /// Host/Device storage for number of subgroups per cell
+  UCL_Vector<int,int> cell_subgroup_counts;
+  /// Host/Device storage for subgroup to cell mapping
+  UCL_Vector<int,int> subgroup2cell;
+  #endif
   int *cell_iter;

   /// Device timers
   UCL_Timer time_nbor, time_kernel, time_hybrid1, time_hybrid2, time_transpose;

+  /// Effective SIMD width of neighbor build kernel
+  inline int simd_size() { return _simd_size; }
+
+  template <class t>
+  inline std::string toa(const t& in) {
+    std::ostringstream o;
+    o.precision(2);
+    o << in;
+    return o.str();
+  }
+
  private:
   NeighborShared *_shared;
   UCL_Device *dev;
   bool _allocated, _use_packing, _nbor_time_avail, _time_device;
   int _gpu_nbor, _max_atoms, _max_nbors, _max_host, _nbor_pitch, _maxspecial;
-  bool _gpu_host, _alloc_packed;
-  double _cutoff, _cell_size, _bin_time;
+  int _old_max_nbors;
+  bool _gpu_host, _alloc_packed, _ilist_map, _auto_cell_size;
+  double _cutoff, _bin_time, _max_neighbor_factor, _cell_size;
   enum UCL_MEMOPT _packed_permissions;

   double _gpu_bytes, _c_bytes, _cell_bytes;
@@ -236,18 +271,29 @@ class Neighbor {
   int _block_cell_2d, _block_cell_id, _max_block_nbor_build, _block_nbor_build;
   int _ncells, _threads_per_atom, _total_atoms;
-  int _cells_in_cutoff;

   template <class numtyp>
-  inline void resize_max_neighbors(const int maxn, bool &success);
+  inline void resize_max_neighbors(int maxn, bool &success);

-  int _warp_size;
+  // For viewing host arrays for data copy operations
+  UCL_H_Vec<int> _host_offset;
+  UCL_D_Vec<int> _nbor_offset, _acc_view, _numj_view;
+
+  #ifndef LAL_USE_OLD_NEIGHBOR
+  UCL_H_Vec<int> _host_bin_stencil;
+  UCL_Const _bin_stencil;
+  int _old_ncellx, _old_ncelly, _old_ncellz;
+  #endif
+
+  int _simd_size;

   inline void set_nbor_block_size(const int mn) {
+    #ifdef LAL_USE_OLD_NEIGHBOR
-    int desired=mn/(2*_warp_size);
-    desired*=_warp_size;
-    if (desired<_warp_size) desired=_warp_size;
+    int desired=mn/(2*_simd_size);
+    desired*=_simd_size;
+    if (desired<_simd_size) desired=_simd_size;
     else if (desired>_max_block_nbor_build)
       desired=_max_block_nbor_build;
     _block_nbor_build=desired;
+    #endif
   }
 };

 }
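The LAL_USE_OLD_NEIGHBOR fallback above keys off the toolkit version because CUDA_VERSION encodes major*1000 + minor*10, so the open interval (11019, 11030) matches exactly the CUDA 11.2 releases noted as producing incorrect results with the new neighbor build. A quick self-contained check of that arithmetic:

    #include <cstdio>

    int main() {
      // CUDA_VERSION = major*1000 + minor*10, e.g. 11.2 -> 11020
      for (int v : {11010, 11020, 11030}) {
        const bool old_path = (v > 11019) && (v < 11030);
        std::printf("CUDA_VERSION %d -> old neighbor path: %s\n",
                    v, old_path ? "yes" : "no");
      }
    }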
diff --git a/lib/gpu/lal_neighbor_cpu.cu b/lib/gpu/lal_neighbor_cpu.cu
index f8b32e1746..3dfe23bdc2 100644
--- a/lib/gpu/lal_neighbor_cpu.cu
+++ b/lib/gpu/lal_neighbor_cpu.cu
@@ -19,6 +19,7 @@

 __kernel void kernel_unpack(__global int *dev_nbor,
                             const __global int *dev_ij,
+                            const __global int *dev_ij_begin,
                             const int inum, const int t_per_atom) {
   int tid=THREAD_ID_X;
   int offset=tid & (t_per_atom-1);
@@ -28,7 +29,7 @@ __kernel void kernel_unpack(__global int *dev_nbor,
     int nbor=ii+inum;
     int numj=dev_nbor[nbor];
     nbor+=inum;
-    int list=dev_nbor[nbor];
+    int list=dev_ij_begin[ii];
     int list_end=list+numj;
     list+=offset;
     nbor+=fast_mul(ii,t_per_atom-1)+offset;
@@ -40,4 +41,3 @@ __kernel void kernel_unpack(__global int *dev_nbor,
     }
   } // if ii
 }
-
diff --git a/lib/gpu/lal_neighbor_gpu.cu b/lib/gpu/lal_neighbor_gpu.cu
index f1da437c86..2aca505396 100644
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@@ -1,6 +1,7 @@
 // **************************************************************************
 //                               neighbor_gpu.cu
 //                             -------------------
+//                            Nitin Dhamankar (Intel)
 //                               Peng Wang (Nvidia)
 //                            W. Michael Brown (ORNL)
 //
@@ -32,7 +33,14 @@
 _texture( pos_tex,float4);
 _texture_2d( pos_tex,int4);
 #endif

-__kernel void calc_cell_id(const numtyp4 *restrict pos,
+#ifdef NV_KERNEL
+#if (__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ == 2)
+// Issue with incorrect results in CUDA 11.2
+#define LAL_USE_OLD_NEIGHBOR
+#endif
+#endif
+
+__kernel void calc_cell_id(const numtyp4 *restrict x_,
                            unsigned *restrict cell_id,
                            int *restrict particle_id,
                            numtyp boxlo0, numtyp boxlo1, numtyp boxlo2,
@@ -43,7 +51,7 @@ __kernel void calc_cell_id(const numtyp4 *restrict x_,
   if (i < nall) {
     numtyp4 p;
-    fetch4(p,i,pos_tex); //pos[i];
+    fetch4(p,i,pos_tex); //x_[i];

     p.x -= boxlo0;
     p.y -= boxlo1;
@@ -138,16 +146,219 @@ __kernel void transpose(__global tagint *restrict out,
     out[j*rows_in+i] = block[ti][tj];
 }

+#ifndef LAL_USE_OLD_NEIGHBOR
+
+#define MAX_STENCIL_SIZE 25
+#if !defined(MAX_SUBGROUPS_PER_BLOCK)
+#define MAX_SUBGROUPS_PER_BLOCK 8
+#endif
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+__device__ __constant__ int bin_stencil[MAX_STENCIL_SIZE];
+#endif
+
 __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
-                                   const __global int *restrict cell_particle_id,
-                                   const __global int *restrict cell_counts,
-                                   __global int *nbor_list,
-                                   __global int *host_nbor_list,
-                                   __global int *host_numj,
-                                   int neigh_bin_size, numtyp cell_size,
-                                   int ncellx, int ncelly, int ncellz,
-                                   int inum, int nt, int nall, int t_per_atom,
-                                   int cells_in_cutoff)
+                                   const __global int *restrict cell_particle_id,
+                                   const __global int *restrict cell_counts,
+                                   __global int *nbor_list,
+                                   __global int *host_nbor_list,
+                                   __global int *host_numj,
+                                   int neigh_bin_size, numtyp cutoff_neigh,
+                                   int ncellx, int ncelly, int ncellz,
+                                   int inum, int nt, int nall, int t_per_atom,
+                                   int cells_in_cutoff,
+                                   const __global int *restrict cell_subgroup_counts,
+                                   const __global int *restrict subgroup2cell,
+                                   int subgroup_count,
+#if defined(NV_KERNEL) || defined(USE_HIP)
+                                   int *not_used, __global int *error_flag)
+#else
+                                   __constant int *bin_stencil,
+                                   __global int *error_flag)
+#endif
+{
+  int tid = THREAD_ID_X;
+  int bsx = BLOCK_SIZE_X;
+  int simd_size = simd_size();
+  int subgroup_id_local = tid / simd_size;
+  int subgroup_id_global = BLOCK_ID_X * bsx / simd_size + subgroup_id_local;
+  int lane_id = tid % simd_size;
+
+#if (SHUFFLE_AVAIL == 0)
+  __local int cell_list_sh[BLOCK_NBOR_BUILD];
+  __local numtyp4 pos_sh[BLOCK_NBOR_BUILD];
+  __local int local_cell_counts[BLOCK_NBOR_BUILD];
+#endif
+  __local int local_begin[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK];
+  __local int local_counts[(MAX_STENCIL_SIZE+1)*MAX_SUBGROUPS_PER_BLOCK];
+
+  if (subgroup_id_global < subgroup_count) {
+    // identify own cell for subgroup (icell) and local atom (i) for the lane
+    int icell = subgroup2cell[subgroup_id_global];
+    int icell_end = cell_counts[icell+1];
+    int i = cell_counts[icell] + (subgroup_id_global -
+                                  cell_subgroup_counts[icell]) *
+            simd_size + lane_id;
+
+    // Get count of the number of iterations to finish all cells
+    const int bin_stencil_stride = cells_in_cutoff * 2 + 1;
+    const int bin_stencil_size = bin_stencil_stride * bin_stencil_stride;
+    int offset = 0;
+    int cell_count = 0, jcellyz, jcell_begin;
+    const int offset2 = subgroup_id_local * (MAX_STENCIL_SIZE+1);
+    const int niter = (bin_stencil_size - 1)/simd_size + 1;
+    int end_idx = simd_size;
+    for (int ni = 0; ni < niter; ni++) {
+      if (ni == niter - 1)
+        end_idx = bin_stencil_size - offset;
+      if (lane_id < end_idx) {
+        jcellyz = icell + bin_stencil[lane_id + offset];
+        jcell_begin = cell_counts[jcellyz - cells_in_cutoff];
+        local_begin[lane_id + offset2 + offset] = jcell_begin;
+        const int local_count = cell_counts[jcellyz + cells_in_cutoff + 1] -
+          jcell_begin;
+        cell_count += local_count;
+        local_counts[lane_id + offset2 + offset] = local_count;
+      }
+      offset += simd_size;
+    }
+
+#if (SHUFFLE_AVAIL == 0)
+    local_cell_counts[tid] = cell_count;
+    offset = subgroup_id_local * simd_size;
+    for (unsigned int mask=simd_size/2; mask>0; mask>>=1) {
+      simdsync();
+      local_cell_counts[tid] += local_cell_counts[ offset + lane_id^mask ];
+    }
+    simdsync();
+    cell_count = local_cell_counts[tid];
+#else
+    #pragma unroll
+    for (unsigned int s=simd_size/2; s>0; s>>=1)
+      cell_count += shfl_xor(cell_count, s, simd_size);
+#endif
+
+    int num_iter = cell_count;
+    int remainder = num_iter % simd_size;
+    if (remainder == 0) remainder = simd_size;
+    if (num_iter) num_iter = (num_iter - 1) / simd_size + 1;
+
+    numtyp4 diff;
+    numtyp r2;
+
+    int pid_i = nall, lpid_j, stride;
+    numtyp4 atom_i, atom_j;
+    int cnt = 0;
+    __global int *neigh_counts, *neigh_list;
+
+    if (i < icell_end)
+      pid_i = cell_particle_id[i];
+
+    if (pid_i < nt) {
+      fetch4(atom_i,pid_i,pos_tex); //pos[i];
+    }
+
+    if (pid_i < inum) {
+      stride=inum;
+      neigh_counts=nbor_list+stride+pid_i;
+      neigh_list=neigh_counts+stride+pid_i*(t_per_atom-1);
+      stride=stride*t_per_atom-t_per_atom;
+      nbor_list[pid_i]=pid_i;
+    } else {
+      stride=0;
+      neigh_counts=host_numj+pid_i-inum;
+      neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
+    }
+
+    // loop through neighbors
+    int bin_shift = 0;
+    int zy = -1;
+    int num_atom_cell = 0;
+    int cell_pos = lane_id;
+    end_idx = simd_size;
+    for (int ci = 0; ci < num_iter; ci++) {
+      cell_pos += simd_size;
+      while (cell_pos >= num_atom_cell && zy < bin_stencil_size) {
+        // Shift lane index into atom bins based on remainder from last bin
+        bin_shift += num_atom_cell % simd_size;
+        if (bin_shift >= simd_size) bin_shift -= simd_size;
+        cell_pos = lane_id - bin_shift;
+        if (cell_pos < 0) cell_pos += simd_size;
+        // Move to next bin
+        zy++;
+        jcell_begin = local_begin[offset2 + zy];
+        num_atom_cell = local_counts[offset2 + zy];
+      }
+
+      if (zy < bin_stencil_size) {
+        lpid_j = cell_particle_id[jcell_begin + cell_pos];
+        fetch4(atom_j,lpid_j,pos_tex);
+#if (SHUFFLE_AVAIL == 0)
+        cell_list_sh[tid] = lpid_j;
+        pos_sh[tid].x = atom_j.x;
+        pos_sh[tid].y = atom_j.y;
+        pos_sh[tid].z = atom_j.z;
+      }
+      simdsync();
+#else
+      }
+#endif
+
+      if (ci == num_iter-1) end_idx = remainder;
+
+      for (int j = 0; j < end_idx; j++) {
+#if (SHUFFLE_AVAIL == 0)
+        int pid_j = cell_list_sh[offset+j]; // gather from shared memory
+        diff.x = atom_i.x - pos_sh[offset+j].x;
+        diff.y = atom_i.y - pos_sh[offset+j].y;
+        diff.z = atom_i.z - pos_sh[offset+j].z;
+#else
+        int pid_j = simd_broadcast_i(lpid_j, j, simd_size);
+#ifdef _DOUBLE_DOUBLE
+        diff.x = atom_i.x - simd_broadcast_d(atom_j.x, j, simd_size);
+        diff.y = atom_i.y - simd_broadcast_d(atom_j.y, j, simd_size);
+        diff.z = atom_i.z - simd_broadcast_d(atom_j.z, j, simd_size);
+#else
+        diff.x = atom_i.x - simd_broadcast_f(atom_j.x, j, simd_size);
+        diff.y = atom_i.y - simd_broadcast_f(atom_j.y, j, simd_size);
+        diff.z = atom_i.z - simd_broadcast_f(atom_j.z, j, simd_size);
+#endif
+#endif
+
+        r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
+//USE CUTOFFSQ?
+        if (r2 < cutoff_neigh*cutoff_neigh && pid_j != pid_i && pid_i < nt) {
+          if (cnt < neigh_bin_size) {
+            cnt++;
+            *neigh_list = pid_j;
+            neigh_list++;
+            if ((cnt & (t_per_atom-1))==0)
+              neigh_list=neigh_list+stride;
+          } else
+            *error_flag=1;
+        }
+      } // for j
+#if (SHUFFLE_AVAIL == 0)
+      simdsync();
+#endif
+    } // for (ci)
+    if (pid_i < nt)
+      *neigh_counts = cnt;
+  } // if (subgroup_id_global < subgroup_count)
+}
+
+#else
+
+__kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
+                                   const __global int *restrict cell_particle_id,
+                                   const __global int *restrict cell_counts,
+                                   __global int *nbor_list,
+                                   __global int *host_nbor_list,
+                                   __global int *host_numj,
+                                   int neigh_bin_size, numtyp cell_size,
+                                   int ncellx, int ncelly, int ncellz,
+                                   int inum, int nt, int nall, int t_per_atom,
+                                   int cells_in_cutoff)
 {
   int tid = THREAD_ID_X;
   int ix = BLOCK_ID_X + cells_in_cutoff;
@@ -232,7 +443,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
         diff.z = atom_i.z - pos_sh[j].z;

         r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-        if (r2 < cell_size*cell_size && pid_j != pid_i) { // && r2 > 1e-5
+        if (r2 < cell_size*cell_size && pid_j != pid_i) {
           cnt++;
           if (cnt <= neigh_bin_size) {
             *neigh_list = pid_j;
@@ -253,6 +464,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
   } // for (i)
 }

+#endif
+
 __kernel void kernel_special(__global int *dev_nbor,
                              __global int *host_nbor_list,
                              const __global int *host_numj,
@@ -310,4 +523,3 @@ __kernel void kernel_special(__global int *dev_nbor,
     }
   } // if ii
 }
-
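In the subgroup-based build kernel above, each SIMD subgroup owns one simd_size-wide slice of a cell: cell_subgroup_counts holds a prefix sum of ceil(atoms_in_cell/simd_size), and subgroup2cell inverts it so a flat subgroup id recovers its cell; a lane then indexes its atom as cell_start + (subgroup_id - prefix[cell])*simd_size + lane_id. The host-side construction of that mapping, mirrored in miniature with invented counts:

    #include <cstdio>
    #include <vector>

    int main() {
      const int simd = 8;
      const std::vector<int> cell_counts = {5, 17, 9};  // atoms per cell
      std::vector<int> prefix(cell_counts.size() + 1, 0);
      std::vector<int> subgroup2cell;
      for (std::size_t c = 0; c < cell_counts.size(); ++c) {
        const int nsub = (cell_counts[c] + simd - 1) / simd;  // ceil
        prefix[c + 1] = prefix[c] + nsub;
        for (int s = 0; s < nsub; ++s) subgroup2cell.push_back((int)c);
      }
      for (std::size_t g = 0; g < subgroup2cell.size(); ++g)
        std::printf("subgroup %zu -> cell %d\n", g, subgroup2cell[g]);
    }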
diff --git a/lib/gpu/lal_neighbor_shared.cpp b/lib/gpu/lal_neighbor_shared.cpp
index f1458b35be..e1c3f5ca68 100644
--- a/lib/gpu/lal_neighbor_shared.cpp
+++ b/lib/gpu/lal_neighbor_shared.cpp
@@ -13,6 +13,7 @@
     email                : brownw@ornl.gov
 ***************************************************************************/

+#include <cmath>
 #include "lal_precision.h"
 #include "lal_neighbor_shared.h"

@@ -48,6 +49,45 @@ void NeighborShared::clear() {
   }
 }

+double NeighborShared::best_cell_size(const double subx, const double suby,
+                                      const double subz, const int nlocal,
+                                      const double cut) {
+  if (_cached_cell_size && _cut_sort==cut) {
+    _cached_cell_size=false;
+    return _cell_size;
+  }
+
+  const double box_density = static_cast<double>(nlocal) / (subx*suby*subz);
+  const double density=box_density*cut*cut*cut;
+  if (density >= 4.0 * _simd_size) return cut*0.5;
+  else if (density >= 0.5 * _simd_size) return cut;
+
+  const double iters = 60;
+  const double inc = cut/(iters-1);
+  const double iss = 1.0 / _simd_size;
+  double test_size = cut;
+  double best_iters = 1e200;
+  double best_size;
+  for (int i = 0; i < iters; i++) {
+    const double i_test_size = 1.0/test_size;
+    const int ncellx = static_cast<int>(ceil(subx*i_test_size));
+    const int ncelly = static_cast<int>(ceil(suby*i_test_size));
+    const int ncellz = static_cast<int>(ceil(subz*i_test_size));
+    const double density = box_density*test_size*test_size*test_size;
+    const double iters_per_cell = ceil(iss*density);
+    const double iters = ncellx*ncelly*ncellz*iters_per_cell*
+      ceil(density*27.0*iss);
+    if (iters < best_iters) {
+      best_iters = iters;
+      best_size = test_size;
+    }
+    test_size += inc;
+  }
+  const int cells_in_cutoff=static_cast<int>(ceil(cut/best_size));
+  if (cells_in_cutoff > 2) best_size=cut*0.5;
+  return best_size;
+}
+
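best_cell_size() prices each of 60 candidate bin sizes as (number of cells) x (SIMD passes over a cell's atoms) x (passes over the 27-cell stencil volume) and keeps the cheapest: bigger bins mean fewer cells but more atoms per SIMD pass. A condensed restatement of the cost function it minimizes, as an assumed reading of the loop above with the same names:

    #include <cmath>

    // Estimated build cost for one candidate bin size.
    double bin_cost(double box_density, double subx, double suby,
                    double subz, double test_size, int simd_size) {
      const double iss = 1.0 / simd_size;
      const double ncells = std::ceil(subx / test_size) *
                            std::ceil(suby / test_size) *
                            std::ceil(subz / test_size);
      const double density = box_density * test_size * test_size * test_size;
      return ncells * std::ceil(iss * density) *
             std::ceil(density * 27.0 * iss);
    }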
  _gpu_nbor=gpu_nbor;
  if (_gpu_nbor==0) {
    nbor_program=new UCL_Program(dev);
-    nbor_program->load_string(neighbor_cpu,flags.c_str());
+    nbor_program->load_string(neighbor_cpu,flags.c_str(),nullptr,stderr);
    k_nbor.set_function(*nbor_program,"kernel_unpack");
  } else {
    build_program=new UCL_Program(dev);
-    build_program->load_string(neighbor_gpu,flags.c_str());
+    build_program->load_string(neighbor_gpu,flags.c_str(),nullptr,stderr);
    if (_gpu_nbor==1) {
      k_cell_id.set_function(*build_program,"calc_cell_id");
diff --git a/lib/gpu/lal_neighbor_shared.h b/lib/gpu/lal_neighbor_shared.h
index 5cfc4e4767..e574aaeaeb 100644
--- a/lib/gpu/lal_neighbor_shared.h
+++ b/lib/gpu/lal_neighbor_shared.h
@@ -47,6 +47,44 @@ class NeighborShared {
  /// Texture for cached position/type access with CUDA
  UCL_Texture neigh_tex;

+  /// Use a heuristic to approximate best bin size assuming uniform density
+  /** Only called from core LAMMPS to size bins for atom sorting **/
+  inline double update_cell_size(const double subx, const double suby,
+                                 const double subz, const int nlocal,
+                                 const double cut) {
+    if (_auto_cell_size==false || subz==0.0) return cut;
+    else {
+      _cell_size=best_cell_size(subx, suby, subz, nlocal, cut);
+      _cached_cell_size=true;
+      _cut_sort=cut;
+      return _cell_size;
+    }
+  }
+
+  /// Use a heuristic to approximate best bin size assuming uniform density
+  double best_cell_size(const double subx, const double suby,
+                        const double subz, const int nlocal,
+                        const double cut);
+
+  /// Current cutoff used for cell size determination
+  inline double neighbor_cutoff() { return _neighbor_cutoff; }
+
+  /// Current neighbor cell size
+  inline double cell_size() { return _cell_size; }
+
+  /// Return setting for auto cell size
+  inline bool auto_cell_size() { return _auto_cell_size; }
+
+  inline void setup_auto_cell_size(const bool autosize, const double cut,
+                                   const int simd_size) {
+    _auto_cell_size = autosize;
+    _cached_cell_size = false;
+    _neighbor_cutoff = cut;
+    _cell_size = cut;
+    _simd_size = simd_size;
+    if (_simd_size < 2) _auto_cell_size = false;
+  }
+
  /// Compile kernels for neighbor lists
  void compile_kernels(UCL_Device &dev, const int gpu_nbor,
                       const std::string flags);
@@ -59,6 +97,8 @@ class NeighborShared {
 private:
  bool _compiled;
  int _gpu_nbor;
+  bool _auto_cell_size, _cached_cell_size;
+  double _neighbor_cutoff, _cell_size, _simd_size, _cut_sort;
};

}
diff --git a/lib/gpu/lal_pppm.cpp b/lib/gpu/lal_pppm.cpp
index 6b5bf88ea5..6e8fe237a6 100644
--- a/lib/gpu/lal_pppm.cpp
+++ b/lib/gpu/lal_pppm.cpp
@@ -71,7 +71,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
  if (flag!=0)
    return 0;
  if (sizeof(grdtyp)==sizeof(double) && device->double_precision()==false) {
-    flag=-5;
+    flag=-15;
    return 0;
  }
  if (device->ptx_arch()>0.0 && device->ptx_arch()<1.1) {
@@ -133,7 +133,7 @@ grdtyp * PPPMT::init(const int nlocal, const int nall, FILE *_screen,
                 UCL_SUCCESS);
  UCL_H_Vec<grdtyp> view;
  view.view(rho_coeff[0]+n2lo,numel,*ucl_device);
-  ucl_copy(d_rho_coeff,view,true);
+  ucl_copy(d_rho_coeff,view,false);
  _max_bytes+=d_rho_coeff.row_bytes();

  // Allocate storage for grid
@@ -191,6 +191,7 @@ void PPPMT::clear(const double cpu_time) {
  d_brick_counts.clear();
  error_flag.clear();
  d_brick_atoms.clear();
+  d_rho_coeff.clear();

  acc_timers();
  device->output_kspace_times(time_in,time_out,time_map,time_rho,time_interp,
@@ -261,7 +262,7 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
  double delvolinv = delxinv*delyinv*delzinv;
  grdtyp f_delvolinv = delvolinv;
-  device->zero(d_brick_counts,d_brick_counts.numel());
+  d_brick_counts.zero();
  k_particle_map.set_size(GX,BX);
  k_particle_map.run(&atom->x, &atom->q, &f_delvolinv, &ainum,
                     &d_brick_counts, &d_brick_atoms, &_brick_x, &_brick_y,
@@ -286,6 +287,10 @@ void PPPMT::_precompute(const int ago, const int nlocal, const int nall,
  error_flag.update_host(true);
  time_out.stop();

+  #ifndef GERYON_OCL_FLUSH
+  error_flag.flush();
+  #endif
+
  _precompute_done=true;
}
@@ -351,7 +356,7 @@ void PPPMT::interp(const grdtyp qqrd2e_scale) {
                   &ans->force);
  time_interp.stop();

-  ans->copy_answers(false,false,false,false);
+  ans->copy_answers(false,false,false,false,0);
  if (_kspace_split==false)
    device->add_ans_object(ans);
}
@@ -374,18 +379,19 @@ void PPPMT::compile_kernels(UCL_Device &dev) {
#ifdef USE_OPENCL
  flags+=std::string(" -Dgrdtyp=")+ucl_template_name<grdtyp>()+" -Dgrdtyp4="+
    ucl_template_name<grdtyp>()+"4";
+  if (sizeof(grdtyp)==sizeof(double)) flags+=std::string(" -DGRD_DBL");
#endif

  if (pppm_program) delete pppm_program;
  pppm_program=new UCL_Program(dev);

#ifdef USE_OPENCL
-  pppm_program->load_string(pppm,flags.c_str());
+  pppm_program->load_string(pppm,flags.c_str(),nullptr,screen);
#else
  if (sizeof(grdtyp)==sizeof(float))
-    pppm_program->load_string(pppm_f,flags.c_str());
+    pppm_program->load_string(pppm_f,flags.c_str(),nullptr,screen);
  else
-    pppm_program->load_string(pppm_d,flags.c_str());
+    pppm_program->load_string(pppm_d,flags.c_str(),nullptr,screen);
#endif

  k_particle_map.set_function(*pppm_program,"particle_map");
diff --git a/lib/gpu/lal_pppm.cu b/lib/gpu/lal_pppm.cu
index ee9f1b61d6..e17df5b88c 100644
--- a/lib/gpu/lal_pppm.cu
+++ b/lib/gpu/lal_pppm.cu
@@ -35,11 +35,14 @@ _texture( q_tex,int2);
#define pos_tex x_
#define q_tex q_

#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics: enable
+
+#ifdef GRD_DBL
#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
+#endif

#endif
diff --git a/lib/gpu/lal_pppm_ext.cpp b/lib/gpu/lal_pppm_ext.cpp
index b826881392..d548b94be1 100644
--- a/lib/gpu/lal_pppm_ext.cpp
+++ b/lib/gpu/lal_pppm_ext.cpp
@@ -129,7 +129,8 @@ double pppm_gpu_bytes_f() {
void pppm_gpu_forces_f(double **f) {
  double etmp;
  PPPMF.atom->data_unavail();
-  PPPMF.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp);
+  int error_flag;
+  PPPMF.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp,error_flag);
}

double * pppm_gpu_init_d(const int nlocal, const int nall, FILE *screen,
@@ -173,6 +174,7 @@ double pppm_gpu_bytes_d() {
void pppm_gpu_forces_d(double **f) {
  double etmp;
  PPPMD.atom->data_unavail();
-  PPPMD.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp);
+  int error_flag;
+  PPPMD.ans->get_answers(f,nullptr,nullptr,nullptr,nullptr,etmp,error_flag);
}
diff --git a/lib/gpu/lal_pre_cuda_hip.h b/lib/gpu/lal_pre_cuda_hip.h
new file mode 100644
index 0000000000..d37b4a94c2
--- /dev/null
+++ b/lib/gpu/lal_pre_cuda_hip.h
@@ -0,0 +1,355 @@
+// **************************************************************************
+//                              pre_cuda_hip.h
+//                             -------------------
+//                           W. Michael Brown (ORNL)
+//                           Nitin Dhamankar (Intel)
+//
+//  Device-side preprocessor definitions for CUDA and HIP builds
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : brownw@ornl.gov
+// ***************************************************************************/
+
+//*************************************************************************
+//                  Device Configuration Definitions
+//                 See lal_preprocessor.h for definitions
+//*************************************************************************/
+
+// -------------------------------------------------------------------------
+//                       CUDA and HIP DEFINITIONS
+// -------------------------------------------------------------------------
+
+#if defined(NV_KERNEL) || defined(USE_HIP)
+
+// -------------------------------------------------------------------------
+//                         DEVICE CONFIGURATION
+// -------------------------------------------------------------------------
+
+
+#ifdef __HIP_PLATFORM_HCC__
+#define CONFIG_ID 303
+#define SIMD_SIZE 64
+#else
+#define CONFIG_ID 103
+#define SIMD_SIZE 32
+#endif
+
+#define MEM_THREADS SIMD_SIZE
+#define SHUFFLE_AVAIL 1
+#define FAST_MATH 1
+
+#define THREADS_PER_ATOM 4
+#define THREADS_PER_CHARGE 8
+#define THREADS_PER_THREE 2
+
+#define BLOCK_PAIR 256
+#define BLOCK_BIO_PAIR 256
+#define BLOCK_ELLIPSE 128
+#define PPPM_BLOCK_1D 64
+#define BLOCK_NBOR_BUILD 128
+#define BLOCK_CELL_2D 8
+#define BLOCK_CELL_ID 128
+
+#define MAX_SHARED_TYPES 11
+#define MAX_BIO_SHARED_TYPES 128
+#define PPPM_MAX_SPLINE 8
+
+// -------------------------------------------------------------------------
+//                     LEGACY DEVICE CONFIGURATION
+// -------------------------------------------------------------------------
+
+#ifdef __CUDA_ARCH__
+
+#if (__CUDA_ARCH__ < 200)
+
+#undef CONFIG_ID
+#define CONFIG_ID 101
+#undef MEM_THREADS
+#define MEM_THREADS 16
+#undef THREADS_PER_ATOM
+#define THREADS_PER_ATOM 1
+#undef THREADS_PER_CHARGE
+#define THREADS_PER_CHARGE 16
+#undef BLOCK_PAIR
+#define BLOCK_PAIR 64
+#undef BLOCK_BIO_PAIR
+#define BLOCK_BIO_PAIR 64
+#undef BLOCK_NBOR_BUILD
+#define BLOCK_NBOR_BUILD 64
+#undef MAX_SHARED_TYPES
+#define MAX_SHARED_TYPES 8
+#undef SHUFFLE_AVAIL
+#define SHUFFLE_AVAIL 0
+
+#elif (__CUDA_ARCH__ < 300)
+
+#undef CONFIG_ID
+#define CONFIG_ID 102
+#undef BLOCK_PAIR
+#define BLOCK_PAIR 128
+#undef BLOCK_BIO_PAIR
+#define BLOCK_BIO_PAIR 128
+#undef MAX_SHARED_TYPES
+#define MAX_SHARED_TYPES 8
+#undef SHUFFLE_AVAIL
+#define SHUFFLE_AVAIL 0
+
+#endif
+
+#endif
+
+// -------------------------------------------------------------------------
+//                            KERNEL MACROS
+// -------------------------------------------------------------------------
+
+#ifdef USE_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+#define fast_mul(X,Y) (X)*(Y)
+
+#ifdef __CUDA_ARCH__
+#if (__CUDA_ARCH__ < 200)
+#undef fast_mul
+#define fast_mul __mul24
+#endif
+#endif
+
+#define EVFLAG 1
+#define NOUNROLL
+#define GLOBAL_ID_X threadIdx.x+fast_mul(blockIdx.x,blockDim.x)
+#define GLOBAL_ID_Y threadIdx.y+fast_mul(blockIdx.y,blockDim.y)
+#define GLOBAL_SIZE_X fast_mul(gridDim.x,blockDim.x);
+#define GLOBAL_SIZE_Y fast_mul(gridDim.y,blockDim.y);
+#define THREAD_ID_X threadIdx.x
+#define THREAD_ID_Y threadIdx.y
+#define BLOCK_ID_X blockIdx.x
+#define BLOCK_ID_Y blockIdx.y
+#define BLOCK_SIZE_X blockDim.x
+#define BLOCK_SIZE_Y blockDim.y
+#define NUM_BLOCKS_X gridDim.x
+
+#define __kernel extern "C" __global__
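+// The keyword mappings below let the OpenCL-flavored kernel sources build
+// as CUDA/HIP device code: __kernel becomes an extern "C" __global__
+// function, __local maps to __shared__, and atom_add to atomicAdd.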
+#ifdef __local
+#undef __local
+#endif
+#define __local __shared__
+#define __global
+#define restrict __restrict__
+#define atom_add atomicAdd
+#define ucl_inline static __inline__ __device__
+
+#define simd_size() SIMD_SIZE
+
+#define simdsync()
+
+#ifdef NV_KERNEL
+#if (__CUDACC_VER_MAJOR__ >= 9)
+#undef simdsync
+#define simdsync() __syncwarp(0xffffffff)
+#endif
+#endif
+
+#ifdef __HIP_PLATFORM_NVCC__
+#undef simdsync
+#define simdsync() __syncwarp(0xffffffff)
+#endif
+
+// -------------------------------------------------------------------------
+//                     KERNEL MACROS - TEXTURES
+// -------------------------------------------------------------------------
+
+#ifdef __HIP_PLATFORM_HCC__
+#define _texture(name, type) __device__ type* name
+#define _texture_2d(name, type) __device__ type* name
+#else
+#define _texture(name, type) texture<type> name
+#define _texture_2d(name, type) texture<type> name
+#endif
+
+#if (__CUDACC_VER_MAJOR__ < 11)
+  #ifdef _DOUBLE_DOUBLE
+  #define fetch4(ans,i,pos_tex) {                   \
+    int4 xy = tex1Dfetch(pos_tex,i*2);              \
+    int4 zt = tex1Dfetch(pos_tex,i*2+1);            \
+    ans.x=__hiloint2double(xy.y, xy.x);             \
+    ans.y=__hiloint2double(xy.w, xy.z);             \
+    ans.z=__hiloint2double(zt.y, zt.x);             \
+    ans.w=__hiloint2double(zt.w, zt.z);             \
+  }
+  #define fetch(ans,i,q_tex) {                      \
+    int2 qt = tex1Dfetch(q_tex,i);                  \
+    ans=__hiloint2double(qt.y, qt.x);               \
+  }
+  #else
+  #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i);
+  #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i);
+  #endif
+#else
+  #define fetch4(ans,i,x) ans=x[i]
+  #define fetch(ans,i,q) ans=q[i]
+  #undef _texture
+  #undef _texture_2d
+  #define _texture(name, type)
+  #define _texture_2d(name, type)
+  #define pos_tex x_
+  #define quat_tex qif
+  #define q_tex q_
+  #define vel_tex v_
+  #define mu_tex mu_
+#endif
+
+#ifdef __HIP_PLATFORM_HCC__
+
+#undef fetch4
+#undef fetch
+
+#ifdef _DOUBLE_DOUBLE
+#define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i))
+#define fetch(ans,i,q_tex)    (ans=*(((double *) q_tex) + i))
+#else
+#define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i))
+#define fetch(ans,i,q_tex)    (ans=*(((float *) q_tex) + i))
+#endif
+
+#endif
+
+// -------------------------------------------------------------------------
+//                       KERNEL MACROS - MATH
+// -------------------------------------------------------------------------
+
+#ifdef CUDA_PRE_THREE
+struct __builtin_align__(16) _double4
+{
+  double x, y, z, w;
+};
+typedef struct _double4 double4;
+#endif
+
+#ifdef _DOUBLE_DOUBLE
+
+#define ucl_exp exp
+#define ucl_powr pow
+#define ucl_atan atan
+#define ucl_cbrt cbrt
+#define ucl_ceil ceil
+#define ucl_abs fabs
+#define ucl_rsqrt rsqrt
+#define ucl_sqrt sqrt
+#define ucl_recip(x) ((numtyp)1.0/(x))
+
+#else
+
+#define ucl_atan atanf
+#define ucl_cbrt cbrtf
+#define ucl_ceil ceilf
+#define ucl_abs fabsf
+#define ucl_recip(x) ((numtyp)1.0/(x))
+#define ucl_rsqrt rsqrtf
+#define ucl_sqrt sqrtf
+#define ucl_exp expf
+#define ucl_powr powf
+
+#endif
+
+// -------------------------------------------------------------------------
+//                      KERNEL MACROS - SHUFFLE
+// -------------------------------------------------------------------------
+
+#if SHUFFLE_AVAIL == 1
+
+#ifndef USE_HIP
+#if (__CUDACC_VER_MAJOR__ < 9)
+#define CUDA_PRE_NINE
+#endif
+#endif
+
+#if defined(CUDA_PRE_NINE) || defined(__HIP_PLATFORM_HCC__)
+
+  #ifdef _SINGLE_SINGLE
+  #define shfl_down __shfl_down
+  #define shfl_xor __shfl_xor
+  #else
+  ucl_inline double shfl_down(double var, unsigned int delta, int width) {
+    int2 tmp;
+    tmp.x = __double2hiint(var);
+    tmp.y =
__double2loint(var); + tmp.x = __shfl_down(tmp.x,delta,width); + tmp.y = __shfl_down(tmp.y,delta,width); + return __hiloint2double(tmp.x,tmp.y); + } + ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor(tmp.x,lanemask,width); + tmp.y = __shfl_xor(tmp.y,lanemask,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + #define simd_broadcast_i __shfl + #define simd_broadcast_f __shfl + #ifdef _DOUBLE_DOUBLE + ucl_inline double simd_broadcast_d(double var, unsigned int src, + int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl(tmp.x,src,width); + tmp.y = __shfl(tmp.y,src,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + +#else + + #ifdef _SINGLE_SINGLE + ucl_inline float shfl_down(float var, unsigned int delta, int width) { + return __shfl_down_sync(0xffffffff, var, delta, width); + } + ucl_inline float shfl_xor(float var, unsigned int lanemask, int width) { + return __shfl_xor_sync(0xffffffff, var, lanemask, width); + } + #else + ucl_inline double shfl_down(double var, unsigned int delta, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_down_sync(0xffffffff,tmp.x,delta,width); + tmp.y = __shfl_down_sync(0xffffffff,tmp.y,delta,width); + return __hiloint2double(tmp.x,tmp.y); + } + ucl_inline double shfl_xor(double var, unsigned int lanemask, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,lanemask,width); + tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,lanemask,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif + #define simd_broadcast_i(var, src, width) \ + __shfl_sync(0xffffffff, var, src, width) + #define simd_broadcast_f(var, src, width) \ + __shfl_sync(0xffffffff, var, src, width) + #ifdef _DOUBLE_DOUBLE + ucl_inline double simd_broadcast_d(double var, unsigned int src, int width) { + int2 tmp; + tmp.x = __double2hiint(var); + tmp.y = __double2loint(var); + tmp.x = __shfl_sync(0xffffffff,tmp.x,src,width); + tmp.y = __shfl_sync(0xffffffff,tmp.y,src,width); + return __hiloint2double(tmp.x,tmp.y); + } + #endif +#endif + +#endif + +// ------------------------------------------------------------------------- +// END CUDA / HIP DEFINITIONS +// ------------------------------------------------------------------------- + +#endif diff --git a/lib/gpu/lal_pre_ocl_config.h b/lib/gpu/lal_pre_ocl_config.h new file mode 100644 index 0000000000..15c503c942 --- /dev/null +++ b/lib/gpu/lal_pre_ocl_config.h @@ -0,0 +1,53 @@ +// ************************************************************************** +// pre_ocl_config.h +// ------------------- +// W. 
Michael Brown (ORNL) +// Nitin Dhamankar (Intel) +// +// Device-side preprocessor definitions +// +// __________________________________________________________________________ +// This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) +// __________________________________________________________________________ +// +// begin : +// email : brownw@ornl.gov +// ***************************************************************************/ + +//************************************************************************* +// Device Configuration Definitions +// See lal_preprocessor.h for definitions +// Configuration order: +// +// {CONFIG_NAME, CONFIG_ID, SIMD_SIZE, MEM_THREADS, SHUFFLE_AVAIL, FAST_MATH, +// THREADS_PER_ATOM, THREADS_PER_CHARGE, THREADS_PER_THREE, BLOCK_PAIR, +// BLOCK_BIO_PAIR, BLOCK_ELLIPSE, PPPM_BLOCK_1D, BLOCK_NBOR_BUILD, +// BLOCK_CELL_2D, BLOCK_CELL_ID, MAX_SHARED_TYPES, MAX_BIO_SHARED_TYPES, +// PPPM_MAX_SPLINE} +// +//*************************************************************************/ + +const int nconfigs=6; +const char * ocl_config_names[] = + { + "generic", + "nvidiagpu", + "amdgpu", + "intelgpu", + "applegpu", + "intelcpu" + }; +const char * ocl_config_strings[] = + { + "GENERIC,1,1,16,0,1,1,1,1,64,64,64,64,64,8,128,8,128,8", + "NVIDIA_GPU,203,32,32,1,1,4,8,2,256,256,128,64,128,8,128,11,128,8", + "AMD_GPU,403,64,64,0,1,4,8,2,256,256,128,64,128,8,128,11,128,8", +#ifdef _SINGLE_SINGLE + "INTEL_GPU,500,8,16,1,1,4,8,1,64,64,64,64,64,8,128,8,128,8", + "APPLE_GPU,600,16,16,0,1,4,8,1,64,64,64,64,64,8,128,8,128,8", +#else + "INTEL_GPU,500,8,16,1,1,2,8,1,64,64,64,64,64,8,128,8,128,8", + "APPLE_GPU,600,16,16,0,1,2,8,1,64,64,64,64,64,8,128,8,128,8", +#endif + "INTEL_CPU,1500,8,8,1,1,1,1,1,64,64,64,64,64,8,64,8,128,8" + }; diff --git a/lib/gpu/lal_precision.h b/lib/gpu/lal_precision.h index 7f82ba18aa..bb2423198f 100644 --- a/lib/gpu/lal_precision.h +++ b/lib/gpu/lal_precision.h @@ -20,6 +20,29 @@ #include #endif +// ---------------------- OPENMP PREPROCESSOR STUFF ------------------ +#if defined(_OPENMP) + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 1 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #if (_OPENMP >= 201307) + #define LAL_USE_OMP_SIMD 1 + #else + #define LAL_USE_OMP_SIMD 0 + #endif + #endif +#else + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 0 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #define LAL_USE_OMP_SIMD 0 + #endif +#endif + struct _lgpu_int2 { int x; int y; }; @@ -75,6 +98,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION double #define numtyp2 _lgpu_float2 #define numtyp4 _lgpu_float4 +#define acctyp2 _lgpu_double2 #define acctyp4 _lgpu_double4 #endif @@ -84,6 +108,7 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION double #define numtyp2 _lgpu_double2 #define numtyp4 _lgpu_double4 +#define acctyp2 _lgpu_double2 #define acctyp4 _lgpu_double4 #endif @@ -93,44 +118,16 @@ inline std::ostream & operator<<(std::ostream &out, const _lgpu_double4 &v) { #define ACC_PRECISION float #define numtyp2 _lgpu_float2 #define numtyp4 _lgpu_float4 +#define acctyp2 _lgpu_float2 #define acctyp4 _lgpu_float4 #endif enum{SPHERE_SPHERE,SPHERE_ELLIPSE,ELLIPSE_SPHERE,ELLIPSE_ELLIPSE}; -// OCL_DEFAULT_VENDOR: preprocessor define for hardware -// specific sizes of OpenCL kernel related constants - -#ifdef FERMI_OCL -#define OCL_DEFAULT_VENDOR "fermi" -#endif - -#ifdef KEPLER_OCL -#define OCL_DEFAULT_VENDOR "kepler" -#endif - -#ifdef CYPRESS_OCL 
-#define OCL_DEFAULT_VENDOR "cypress" -#endif - -#ifdef GENERIC_OCL -#define OCL_DEFAULT_VENDOR "generic" -#endif - -#ifdef INTEL_OCL -#define OCL_DEFAULT_VENDOR "intel" -#endif - -#ifdef PHI_OCL -#define OCL_DEFAULT_VENDOR "phi" -#endif - -#ifndef OCL_DEFAULT_VENDOR -#define OCL_DEFAULT_VENDOR "none" -#endif - -// default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h -#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) +// default to 32-bit smallint and other ints, 64-bit bigint: +// same as defined in src/lmptype.h +#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ + !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif diff --git a/lib/gpu/lal_preprocessor.h b/lib/gpu/lal_preprocessor.h index 7c94438272..12cf6345c2 100644 --- a/lib/gpu/lal_preprocessor.h +++ b/lib/gpu/lal_preprocessor.h @@ -1,9 +1,10 @@ // ************************************************************************** -// preprocessor.cu +// preprocessor.h // ------------------- // W. Michael Brown (ORNL) +// Nitin Dhamankar (Intel) // -// Device code for CUDA-specific preprocessor definitions +// Device-side preprocessor definitions // // __________________________________________________________________________ // This file is part of the LAMMPS Accelerator Library (LAMMPS_AL) @@ -14,566 +15,136 @@ // ***************************************************************************/ //************************************************************************* -// Preprocessor Definitions +// Device Configuration Definitions // -// Note: It is assumed that constants with the same names are defined with -// the same values in all files. +// For OpenCL, the configuration is a string (optionally controlled at +// runtime) where tokens specify the values below in order) // -// ARCH -// Definition: Architecture number for accelerator +// CONFIG_ID: +// Definition: Unique ID for a configuration +// 100-199 for NVIDIA GPUs with CUDA / HIP +// 200-299 for NVIDIA GPUs with OpenCL +// 300-399 for AMD GPUs with HIP +// 400-499 for AMD GPUs with OpenCL +// 500-599 for Intel GPUs with OpenCL +// SIMD_SIZE: +// Definition: For CUDA this is the warp size. +// For AMD this is the wavefront size. +// For OpenCL < 2.1 this is the number of workitems +// guarenteed to have the same instruction pointer +// For OpenCL >= 2.1 this is the smallest expected subgroup +// size. Actually subgroup sizes are determined per kernel. // MEM_THREADS -// Definition: Number of threads with sequential ids accessing memory -// simultaneously on multiprocessor -// WARP_SIZE: -// Definition: Number of threads guaranteed to be on the same instruction +// Definition: Number of elements in main memory transaction. Used in +// PPPM. If unknown, set to SIMD_SIZE. +// SHUFFLE_AVAIL +// Definition: Controls the use of instructions for horizontal vector +// operations. 0 disables and will increase shared memory +// usage. 1 enables for CUDA, HIP, and OpenCL >= 2.1 on +// NVIDIA and Intel devices. +// FAST_MATH +// Definition: 0: do not use -cl-fast-relaxed-math optimization flag or +// native transcendentals for OpenCL (fused multiply-add +// still enabled). For CUDA and HIP, this is controlled by +// the Makefile at compile time. 
1: enable fast math opts +// // THREADS_PER_ATOM -// Definition: Default number of threads assigned per atom for pair styles -// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE +// Definition: Default number of work items or CUDA threads assigned per +// per atom for pair styles +// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE // THREADS_PER_CHARGE -// Definition: Default number of threads assigned per atom for pair styles -// with charge -// Restructions: Must be power of 2; THREADS_PER_ATOM<=WARP_SIZE -// PPPM_MAX_SPLINE -// Definition: Maximum order for splines in PPPM -// PPPM_BLOCK_1D -// Definition: Thread block size for PPPM kernels -// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE -// PPPM_BLOCK_1D%32==0 +// Definition: Default number of work items or CUDA threads assigned per +// per atom for pair styles using charge +// Restrictions: Must be power of 2; THREADS_PER_ATOM<=SIMD_SIZE +// THREADS_PER_THREE +// Definition: Default number of work items or CUDA threads assigned per +// per atom for 3-body styles +// Restrictions: Must be power of 2; THREADS_PER_ATOM^2<=SIMD_SIZE +// // BLOCK_PAIR -// Definition: Default thread block size for pair styles -// Restrictions: +// Definition: Default block size for pair styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_BIO_PAIR +// Definition: Default block size for CHARMM styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_ELLIPSE +// Definition: Default block size for ellipsoidal models and some 3-body +// styles +// Restrictions: Must be integer multiple of SIMD_SIZE +// PPPM_BLOCK_1D +// Definition: Default block size for PPPM kernels +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_NBOR_BUILD +// Definition: Default block size for neighbor list builds +// Restrictions: Must be integer multiple of SIMD_SIZE +// BLOCK_CELL_2D +// Definition: Default block size in each dimension for matrix transpose +// BLOCK_CELL_ID +// Definition: Unused in current implementation; Maintained for legacy +// purposes and specialized builds +// // MAX_SHARED_TYPES 8 // Definition: Max # of atom type params can be stored in shared memory // Restrictions: MAX_SHARED_TYPES*MAX_SHARED_TYPES<=BLOCK_PAIR -// BLOCK_CELL_2D -// Definition: Default block size in each dimension for cell list builds -// and matrix transpose -// BLOCK_CELL_ID -// Definition: Default block size for binning atoms in cell list builds -// BLOCK_NBOR_BUILD -// Definition: Default block size for neighbor list builds -// BLOCK_BIO_PAIR -// Definition: Default thread block size for "bio" pair styles // MAX_BIO_SHARED_TYPES // Definition: Max # of atom type params can be stored in shared memory -// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 +// Restrictions: MAX_BIO_SHARED_TYPES<=BLOCK_BIO_PAIR*2 +// PPPM_MAX_SPLINE +// Definition: Maximum order for splines in PPPM +// Restrictions: PPPM_BLOCK_1D>=PPPM_MAX_SPLINE*PPPM_MAX_SPLINE // //*************************************************************************/ -#define _texture(name, type) texture name -#define _texture_2d(name, type) texture name - // ------------------------------------------------------------------------- -// HIP DEFINITIONS +// CUDA and HIP DEFINITIONS // ------------------------------------------------------------------------- -#ifdef USE_HIP - #include - #ifdef __HIP_PLATFORM_HCC__ - #define mul24(x, y) __mul24(x, y) - #undef _texture - #undef _texture_2d - #define _texture(name, type) __device__ type* name - 
#define _texture_2d(name, type) __device__ type* name - #endif - #define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) - #define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) - #define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); - #define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); - #define THREAD_ID_X threadIdx.x - #define THREAD_ID_Y threadIdx.y - #define BLOCK_ID_X blockIdx.x - #define BLOCK_ID_Y blockIdx.y - #define BLOCK_SIZE_X blockDim.x - #define BLOCK_SIZE_Y blockDim.y - #define __kernel extern "C" __global__ - #ifdef __local - #undef __local - #endif - #define __local __shared__ - #define __global - #define restrict __restrict__ - #define atom_add atomicAdd - #define ucl_inline static __inline__ __device__ - - #define THREADS_PER_ATOM 4 - #define THREADS_PER_CHARGE 8 - #define BLOCK_NBOR_BUILD 128 - #define BLOCK_PAIR 256 - #define BLOCK_BIO_PAIR 256 - #define BLOCK_ELLIPSE 128 - #define MAX_SHARED_TYPES 11 - - #ifdef _SINGLE_SINGLE - ucl_inline double shfl_xor(double var, int laneMask, int width) { - #ifdef __HIP_PLATFORM_HCC__ - return __shfl_xor(var, laneMask, width); - #else - return __shfl_xor_sync(0xffffffff, var, laneMask, width); - #endif - } - #else - ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - #ifdef __HIP_PLATFORM_HCC__ - tmp.x = __shfl_xor(tmp.x,laneMask,width); - tmp.y = __shfl_xor(tmp.y,laneMask,width); - #else - tmp.x = __shfl_xor_sync(0xffffffff, tmp.x,laneMask,width); - tmp.y = __shfl_xor_sync(0xffffffff, tmp.y,laneMask,width); - #endif - return __hiloint2double(tmp.x,tmp.y); - } - #endif - - #ifdef __HIP_PLATFORM_HCC__ - #define ARCH 600 - #define WARP_SIZE 64 - #endif - - #ifdef __HIP_PLATFORM_NVCC__ - #define ARCH __CUDA_ARCH__ - #define WARP_SIZE 32 - #endif - - #define fast_mul(X,Y) (X)*(Y) - - #define MEM_THREADS WARP_SIZE - #define PPPM_BLOCK_1D 64 - #define BLOCK_CELL_2D 8 - #define BLOCK_CELL_ID 128 - #define MAX_BIO_SHARED_TYPES 128 - - #ifdef __HIP_PLATFORM_NVCC__ - #ifdef _DOUBLE_DOUBLE - #define fetch4(ans,i,pos_tex) { \ - int4 xy = tex1Dfetch(pos_tex,i*2); \ - int4 zt = tex1Dfetch(pos_tex,i*2+1); \ - ans.x=__hiloint2double(xy.y, xy.x); \ - ans.y=__hiloint2double(xy.w, xy.z); \ - ans.z=__hiloint2double(zt.y, zt.x); \ - ans.w=__hiloint2double(zt.w, zt.z); \ - } - #define fetch(ans,i,q_tex) { \ - int2 qt = tex1Dfetch(q_tex,i); \ - ans=__hiloint2double(qt.y, qt.x); \ - } - #else - #define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); - #define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); - #endif - #else - #ifdef _DOUBLE_DOUBLE - #define fetch4(ans,i,pos_tex) (ans=*(((double4*)pos_tex) + i)) - #define fetch(ans,i,q_tex) (ans=*(((double *) q_tex) + i)) - #else - #define fetch4(ans,i,pos_tex) (ans=*(((float4*)pos_tex) + i)) - #define fetch(ans,i,q_tex) (ans=*(((float *) q_tex) + i)) - #endif - #endif - - #ifdef _DOUBLE_DOUBLE - #define ucl_exp exp - #define ucl_powr pow - #define ucl_atan atan - #define ucl_cbrt cbrt - #define ucl_ceil ceil - #define ucl_abs fabs - #define ucl_rsqrt rsqrt - #define ucl_sqrt sqrt - #define ucl_recip(x) ((numtyp)1.0/(x)) - - #else - #define ucl_atan atanf - #define ucl_cbrt cbrtf - #define ucl_ceil ceilf - #define ucl_abs fabsf - #define ucl_recip(x) ((numtyp)1.0/(x)) - #define ucl_rsqrt rsqrtf - #define ucl_sqrt sqrtf - - #ifdef NO_HARDWARE_TRANSCENDENTALS - #define ucl_exp expf - #define ucl_powr powf - #else - #define ucl_exp __expf - #define ucl_powr __powf - #endif - #endif -#endif - -// 
------------------------------------------------------------------------- -// CUDA DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef NV_KERNEL - -#define GLOBAL_ID_X threadIdx.x+mul24(blockIdx.x,blockDim.x) -#define GLOBAL_ID_Y threadIdx.y+mul24(blockIdx.y,blockDim.y) -#define GLOBAL_SIZE_X mul24(gridDim.x,blockDim.x); -#define GLOBAL_SIZE_Y mul24(gridDim.y,blockDim.y); -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_SIZE_X blockDim.x -#define BLOCK_SIZE_Y blockDim.y -#define __kernel extern "C" __global__ -#define __local __shared__ -#define __global -#define restrict __restrict__ -#define atom_add atomicAdd -#define ucl_inline static __inline__ __device__ - -#ifdef __CUDA_ARCH__ -#define ARCH __CUDA_ARCH__ -#else -#define ARCH 100 -#endif - -#if (ARCH < 200) - -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 16 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_PAIR 64 -#define BLOCK_BIO_PAIR 64 -#define MAX_SHARED_TYPES 8 - -#else - -#if (ARCH < 300) - -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_PAIR 128 -#define BLOCK_BIO_PAIR 128 -#define MAX_SHARED_TYPES 8 - -#else - -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_NBOR_BUILD 128 -#define BLOCK_PAIR 256 -#define BLOCK_BIO_PAIR 256 -#define BLOCK_ELLIPSE 128 -#define MAX_SHARED_TYPES 11 - -#if (__CUDACC_VER_MAJOR__ < 9) - -#ifdef _SINGLE_SINGLE -#define shfl_xor __shfl_xor -#else -ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - tmp.x = __shfl_xor(tmp.x,laneMask,width); - tmp.y = __shfl_xor(tmp.y,laneMask,width); - return __hiloint2double(tmp.x,tmp.y); -} -#endif - -#else - -#ifdef _SINGLE_SINGLE -ucl_inline double shfl_xor(double var, int laneMask, int width) { - return __shfl_xor_sync(0xffffffff, var, laneMask, width); -} -#else -ucl_inline double shfl_xor(double var, int laneMask, int width) { - int2 tmp; - tmp.x = __double2hiint(var); - tmp.y = __double2loint(var); - tmp.x = __shfl_xor_sync(0xffffffff,tmp.x,laneMask,width); - tmp.y = __shfl_xor_sync(0xffffffff,tmp.y,laneMask,width); - return __hiloint2double(tmp.x,tmp.y); -} -#endif - -#endif - -#endif - -#endif - -#define WARP_SIZE 32 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#ifdef _DOUBLE_DOUBLE -#define fetch4(ans,i,pos_tex) { \ - int4 xy = tex1Dfetch(pos_tex,i*2); \ - int4 zt = tex1Dfetch(pos_tex,i*2+1); \ - ans.x=__hiloint2double(xy.y, xy.x); \ - ans.y=__hiloint2double(xy.w, xy.z); \ - ans.z=__hiloint2double(zt.y, zt.x); \ - ans.w=__hiloint2double(zt.w, zt.z); \ -} -#define fetch(ans,i,q_tex) { \ - int2 qt = tex1Dfetch(q_tex,i); \ - ans=__hiloint2double(qt.y, qt.x); \ -} -#else -#define fetch4(ans,i,pos_tex) ans=tex1Dfetch(pos_tex, i); -#define fetch(ans,i,q_tex) ans=tex1Dfetch(q_tex,i); -#endif - -#if (__CUDA_ARCH__ < 200) -#define fast_mul __mul24 -#define MEM_THREADS 16 -#else -#define fast_mul(X,Y) (X)*(Y) -#define MEM_THREADS 32 -#endif - -#ifdef CUDA_PRE_THREE -struct __builtin_align__(16) _double4 -{ - double x, y, z, w; -}; -typedef struct _double4 double4; -#endif - -#ifdef _DOUBLE_DOUBLE - -#define ucl_exp exp -#define ucl_powr pow -#define ucl_atan atan -#define ucl_cbrt cbrt -#define ucl_ceil ceil -#define ucl_abs fabs -#define ucl_rsqrt rsqrt -#define ucl_sqrt sqrt -#define 
ucl_recip(x) ((numtyp)1.0/(x))
-
-#else
-
-#define ucl_atan atanf
-#define ucl_cbrt cbrtf
-#define ucl_ceil ceilf
-#define ucl_abs fabsf
-#define ucl_recip(x) ((numtyp)1.0/(x))
-#define ucl_rsqrt rsqrtf
-#define ucl_sqrt sqrtf
-
-#ifdef NO_HARDWARE_TRANSCENDENTALS
-
-#define ucl_exp expf
-#define ucl_powr powf
-
-#else
-
-#define ucl_exp __expf
-#define ucl_powr __powf
-
-#endif
-
-#endif
-
+#if defined(NV_KERNEL) || defined(USE_HIP)
+#include "lal_pre_cuda_hip.h"
+#endif

// -------------------------------------------------------------------------
-//                  NVIDIA GENERIC OPENCL DEFINITIONS
+//                    OPENCL DEVICE CONFIGURATIONS
// -------------------------------------------------------------------------

-#ifdef NV_GENERIC_OCL
+// See lal_pre_ocl_config.h for OpenCL device configurations
+
+#if !defined(NV_KERNEL) && !defined(USE_HIP)

#define USE_OPENCL
-#define fast_mul mul24
-#define MEM_THREADS 16
-#define THREADS_PER_ATOM 1
-#define THREADS_PER_CHARGE 1
-#define BLOCK_PAIR 64
-#define MAX_SHARED_TYPES 8
-#define BLOCK_NBOR_BUILD 64
-#define BLOCK_BIO_PAIR 64
-
-#define WARP_SIZE 32
-#define PPPM_BLOCK_1D 64
-#define BLOCK_CELL_2D 8
-#define BLOCK_CELL_ID 128
-#define MAX_BIO_SHARED_TYPES 128
-
-#endif

// -------------------------------------------------------------------------
-//                  NVIDIA FERMI OPENCL DEFINITIONS
+//                        OPENCL KERNEL MACROS
// -------------------------------------------------------------------------

-#ifdef FERMI_OCL
-
-#define USE_OPENCL
-#define MEM_THREADS 32
-#define THREADS_PER_ATOM 4
-#define THREADS_PER_CHARGE 8
-#define BLOCK_PAIR 128
-#define MAX_SHARED_TYPES 11
-#define BLOCK_NBOR_BUILD 128
-#define BLOCK_BIO_PAIR 128
-
-#define WARP_SIZE 32
-#define PPPM_BLOCK_1D 64
-#define BLOCK_CELL_2D 8
-#define BLOCK_CELL_ID 128
-#define MAX_BIO_SHARED_TYPES 128
-
-#endif
-
-// -------------------------------------------------------------------------
-//                  NVIDIA KEPLER OPENCL DEFINITIONS
-// -------------------------------------------------------------------------
-
-#ifdef KEPLER_OCL
-
-#define USE_OPENCL
-#define MEM_THREADS 32
-#define THREADS_PER_ATOM 4
-#define THREADS_PER_CHARGE 8
-#define BLOCK_PAIR 256
-#define MAX_SHARED_TYPES 11
-#define BLOCK_NBOR_BUILD 128
-#define BLOCK_BIO_PAIR 256
-#define BLOCK_ELLIPSE 128
-
-#define WARP_SIZE 32
-#define PPPM_BLOCK_1D 64
-#define BLOCK_CELL_2D 8
-#define BLOCK_CELL_ID 128
-#define MAX_BIO_SHARED_TYPES 128
-
-#ifndef NO_OCL_PTX
-#define ARCH 300
-#ifdef _SINGLE_SINGLE
-inline float shfl_xor(float var, int laneMask, int width) {
-  float ret;
-  int c;
-  c = ((WARP_SIZE-width) << 8) | 0x1f;
-  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=f"(ret) : "f"(var), "r"(laneMask), "r"(c));
-  return ret;
-}
+#if (__OPENCL_VERSION__ > 199)
+#define NOUNROLL __attribute__((opencl_unroll_hint(1)))
#else
-#pragma OPENCL EXTENSION cl_khr_fp64 : enable
-inline double shfl_xor(double var, int laneMask, int width) {
-  int c = ((WARP_SIZE-width) << 8) | 0x1f;
-  int x,y,x2,y2;
-  double ans;
-  asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
-  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(x2) : "r"(x), "r"(laneMask), "r"(c));
-  asm volatile ("shfl.bfly.b32 %0, %1, %2, %3;" : "=r"(y2) : "r"(y), "r"(laneMask), "r"(c));
-  asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
-  return ans;
-}
-#endif
+#define NOUNROLL
#endif

-#endif
+#define GLOBAL_ID_X get_global_id(0)
+#define THREAD_ID_X get_local_id(0)
+#define BLOCK_ID_X get_group_id(0)
+#define BLOCK_SIZE_X get_local_size(0)
+#define GLOBAL_SIZE_X
get_global_size(0) +#define THREAD_ID_Y get_local_id(1) +#define BLOCK_ID_Y get_group_id(1) +#define NUM_BLOCKS_X get_num_groups(0) +#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) +#define ucl_inline inline // ------------------------------------------------------------------------- -// AMD CYPRESS OPENCL DEFINITIONS +// OPENCL KERNEL MACROS - TEXTURES // ------------------------------------------------------------------------- -#ifdef CYPRESS_OCL - -#define USE_OPENCL -#define MEM_THREADS 32 -#define THREADS_PER_ATOM 4 -#define THREADS_PER_CHARGE 8 -#define BLOCK_PAIR 128 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 64 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif +#define fetch4(ans,i,x) ans=x[i] +#define fetch(ans,i,q) ans=q[i] // ------------------------------------------------------------------------- -// INTEL CPU OPENCL DEFINITIONS +// OPENCL KERNEL MACROS - MATH // ------------------------------------------------------------------------- -#ifdef INTEL_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 1 -#define MAX_SHARED_TYPES 0 -#define BLOCK_NBOR_BUILD 4 -#define BLOCK_BIO_PAIR 2 -#define BLOCK_ELLIPSE 2 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 32 -#define BLOCK_CELL_2D 1 -#define BLOCK_CELL_ID 2 -#define MAX_BIO_SHARED_TYPES 0 - -#endif - -// ------------------------------------------------------------------------- -// INTEL PHI OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef PHI_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 16 -#define MAX_SHARED_TYPES 0 -#define BLOCK_NBOR_BUILD 16 -#define BLOCK_BIO_PAIR 16 -#define BLOCK_ELLIPSE 16 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 32 -#define BLOCK_CELL_2D 4 -#define BLOCK_CELL_ID 16 -#define MAX_BIO_SHARED_TYPES 0 - -#endif - -// ------------------------------------------------------------------------- -// GENERIC OPENCL DEFINITIONS -// ------------------------------------------------------------------------- - -#ifdef GENERIC_OCL - -#define USE_OPENCL -#define MEM_THREADS 16 -#define THREADS_PER_ATOM 1 -#define THREADS_PER_CHARGE 1 -#define BLOCK_PAIR 64 -#define MAX_SHARED_TYPES 8 -#define BLOCK_NBOR_BUILD 64 -#define BLOCK_BIO_PAIR 64 - -#define WARP_SIZE 1 -#define PPPM_BLOCK_1D 64 -#define BLOCK_CELL_2D 8 -#define BLOCK_CELL_ID 128 -#define MAX_BIO_SHARED_TYPES 128 - -#endif - -// ------------------------------------------------------------------------- -// OPENCL Stuff for All Hardware -// ------------------------------------------------------------------------- -#ifdef USE_OPENCL - #ifndef _SINGLE_SINGLE #ifndef cl_khr_fp64 @@ -589,48 +160,14 @@ inline double shfl_xor(double var, int laneMask, int width) { #endif -#ifndef fast_mul #define fast_mul(X,Y) (X)*(Y) -#endif - -#ifndef ARCH -#define ARCH 0 -#endif - -#ifndef DRIVER -#define DRIVER 0 -#endif - -#define GLOBAL_ID_X get_global_id(0) -#define THREAD_ID_X get_local_id(0) -#define BLOCK_ID_X get_group_id(0) -#define BLOCK_SIZE_X get_local_size(0) -#define GLOBAL_SIZE_X get_global_size(0) -#define THREAD_ID_Y get_local_id(1) -#define BLOCK_ID_Y get_group_id(1) -#define __syncthreads() barrier(CLK_LOCAL_MEM_FENCE) -#define ucl_inline inline -#define fetch4(ans,i,x) ans=x[i] -#define fetch(ans,i,q) ans=q[i] 
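+// When FAST_MATH is enabled (and the build is not double precision), the
+// ucl_* wrappers below resolve to OpenCL's native_* builtins, e.g.
+// ucl_exp(x) -> native_exp(x); otherwise the exact library calls are used.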
#define ucl_atan atan
#define ucl_cbrt cbrt
#define ucl_ceil ceil
#define ucl_abs fabs

-#ifdef _DOUBLE_DOUBLE
-#define NO_HARDWARE_TRANSCENDENTALS
-#endif
-
-#ifdef NO_HARDWARE_TRANSCENDENTALS
-
-#define ucl_exp exp
-#define ucl_powr powr
-#define ucl_rsqrt rsqrt
-#define ucl_sqrt sqrt
-#define ucl_recip(x) ((numtyp)1.0/(x))
-
-#else
+#if defined(FAST_MATH) && !defined(_DOUBLE_DOUBLE)

#define ucl_exp native_exp
@@ -638,23 +175,128 @@ inline double shfl_xor(double var, int laneMask, int width) {
#define ucl_sqrt native_sqrt
#define ucl_recip native_recip

+#else
+
+#define ucl_exp exp
+#define ucl_powr powr
+#define ucl_rsqrt rsqrt
+#define ucl_sqrt sqrt
+#define ucl_recip(x) ((numtyp)1.0/(x))
+
#endif

+// -------------------------------------------------------------------------
+// OPENCL KERNEL MACROS - SHUFFLE
+// -------------------------------------------------------------------------
+
+#if (SHUFFLE_AVAIL == 1)
+  #ifdef cl_intel_subgroups
+    #pragma OPENCL EXTENSION cl_intel_subgroups : enable
+    #define shfl_down(var, delta, width)                                   \
+      intel_sub_group_shuffle_down(var, var, delta)
+    #define shfl_xor(var, lanemask, width)                                 \
+      intel_sub_group_shuffle_xor(var, lanemask)
+    #define simd_broadcast_i(var, src, width) sub_group_broadcast(var, src)
+    #define simd_broadcast_f(var, src, width) sub_group_broadcast(var, src)
+    #define simd_broadcast_d(var, src, width) sub_group_broadcast(var, src)
+  #else
+    #ifdef _SINGLE_SINGLE
+    inline float shfl_down(float var, unsigned int delta, int width) {
+      float ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(delta), "r"(c), "r"(0xffffffff));
+      return ret;
+    }
+    inline float shfl_xor(float var, unsigned int lanemask, int width) {
+      float ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(lanemask), "r"(c), "r"(0xffffffff));
+      return ret;
+    }
+    #else
+    inline double shfl_down(double var, unsigned int delta, int width) {
+      int c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      int x,y,x2,y2;
+      double ans;
+      asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
+      asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(delta), "r"(c), "r"(0xffffffff));
+      asm volatile ("shfl.sync.down.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(delta), "r"(c), "r"(0xffffffff));
+      asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
+      return ans;
+    }
+    inline double shfl_xor(double var, unsigned int lanemask, int width) {
+      int c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      int x,y,x2,y2;
+      double ans;
+      asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var));
+      asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(lanemask), "r"(c), "r"(0xffffffff));
+      asm volatile ("shfl.sync.bfly.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(lanemask), "r"(c), "r"(0xffffffff));
+      asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2));
+      return ans;
+    }
+    #endif
+    inline int simd_broadcast_i(int var, unsigned int src, int width) {
+      int ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(ret) : "r"(var), "r"(src), "r"(c), "r"(0xffffffff));
+      return ret;
+    }
+    inline float simd_broadcast_f(float var, unsigned int src, int width) {
+      float ret;
+      int c;
+      c = ((SIMD_SIZE-width) << 8) | 0x1f;
+      asm volatile ("shfl.sync.idx.b32 %0,
%1, %2, %3, %4;" : "=f"(ret) : "f"(var), "r"(src), "r"(c), "r"(0xffffffff)); + return ret; + } + #ifdef _DOUBLE_DOUBLE + inline double simd_broadcast_d(double var, unsigned int src, int width) { + int c = ((SIMD_SIZE-width) << 8) | 0x1f; + int x,y,x2,y2; + double ans; + asm volatile ("mov.b64 {%0, %1}, %2;" : "=r"(y), "=r"(x) : "d"(var)); + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(x2) : "r"(x), "r"(src), "r"(c), "r"(0xffffffff)); + asm volatile ("shfl.sync.idx.b32 %0, %1, %2, %3, %4;" : "=r"(y2) : "r"(y), "r"(src), "r"(c), "r"(0xffffffff)); + asm volatile ("mov.b64 %0, {%1, %2};" : "=d"(ans) : "r"(y2), "r"(x2)); + return ans; + } + #endif + #endif +#endif + +// ------------------------------------------------------------------------- +// OPENCL KERNEL MACROS - SUBGROUPS +// ------------------------------------------------------------------------- + +#ifdef USE_OPENCL_SUBGROUPS + #ifndef cl_intel_subgroups + #pragma OPENCL EXTENSION cl_khr_subgroups : enable + #endif + #define simdsync() sub_group_barrier(CLK_LOCAL_MEM_FENCE) + #define simd_size() get_max_sub_group_size() +#else + #define simdsync() + #define simd_size() SIMD_SIZE +#endif + +// ------------------------------------------------------------------------- +// END OPENCL DEFINITIONS +// ------------------------------------------------------------------------- + #endif // ------------------------------------------------------------------------- // ARCHITECTURE INDEPENDENT DEFINITIONS // ------------------------------------------------------------------------- -#ifndef PPPM_MAX_SPLINE -#define PPPM_MAX_SPLINE 8 -#endif - #ifdef _DOUBLE_DOUBLE #define numtyp double #define numtyp2 double2 #define numtyp4 double4 #define acctyp double +#define acctyp2 double2 #define acctyp4 double4 #endif @@ -663,6 +305,7 @@ inline double shfl_xor(double var, int laneMask, int width) { #define numtyp2 float2 #define numtyp4 float4 #define acctyp double +#define acctyp2 double2 #define acctyp4 double4 #endif @@ -671,6 +314,7 @@ inline double shfl_xor(double var, int laneMask, int width) { #define numtyp2 float2 #define numtyp4 float4 #define acctyp float +#define acctyp2 float2 #define acctyp4 float4 #endif @@ -686,11 +330,9 @@ inline double shfl_xor(double var, int laneMask, int width) { #define NEIGHMASK 0x3FFFFFFF ucl_inline int sbmask(int j) { return j >> SBBITS & 3; }; -#ifndef BLOCK_ELLIPSE -#define BLOCK_ELLIPSE BLOCK_PAIR -#endif - -// default to 32-bit smallint and other ints, 64-bit bigint: same as defined in src/lmptype.h -#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && !defined(LAMMPS_SMALLBIG) +// default to 32-bit smallint and other ints, 64-bit bigint: +// same as defined in src/lmptype.h +#if !defined(LAMMPS_SMALLSMALL) && !defined(LAMMPS_BIGBIG) && \ + !defined(LAMMPS_SMALLBIG) #define LAMMPS_SMALLBIG #endif diff --git a/lib/gpu/lal_re_squared.cpp b/lib/gpu/lal_re_squared.cpp index 81dc3b13a4..aabfb9d39f 100644 --- a/lib/gpu/lal_re_squared.cpp +++ b/lib/gpu/lal_re_squared.cpp @@ -116,7 +116,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, host_write[i*4+2]=host_shape[i][2]; } UCL_H_Vec view4; - view4.view((numtyp4*)host_write.begin(),shape.numel(),*(this->ucl_device)); + view4.view(host_write,shape.numel()); ucl_copy(shape,view4,false); well.alloc(ntypes,*(this->ucl_device),UCL_READ_ONLY); @@ -125,7 +125,7 @@ int RESquaredT::init(const int ntypes, double **host_shape, double **host_well, host_write[i*4+1]=host_well[i][1]; host_write[i*4+2]=host_well[i][2]; } - 
view4.view((numtyp4*)host_write.begin(),well.numel(),*(this->ucl_device));
+  view4.view(host_write,well.numel());
  ucl_copy(well,view4,false);

  _allocated=true;
@@ -172,18 +172,8 @@ double RESquaredT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
-void RESquaredT::loop(const bool _eflag, const bool _vflag) {
+int RESquaredT::loop(const int eflag, const int vflag) {
  const int BX=this->block_size();
-  int eflag, vflag;
-  if (_eflag)
-    eflag=1;
-  else
-    eflag=0;
-
-  if (_vflag)
-    vflag=1;
-  else
-    vflag=0;

  int GX=0, NGX;
  int stride=this->nbor->nbor_pitch();
@@ -201,8 +191,8 @@ void RESquaredT::loop(const bool _eflag, const bool _vflag) {
    this->time_nbor1.stop();

    this->time_ellipsoid.start();
-    this->k_ellipsoid.set_size(GX,BX);
-    this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
+    this->k_elps_sel->set_size(GX,BX);
+    this->k_elps_sel->run(&this->atom->x, &this->atom->quat,
                          &this->shape, &this->well, &this->special_lj,
                          &this->sigma_epsilon, &this->_lj_types,
                          &this->nbor->dev_nbor, &stride,
@@ -218,8 +208,8 @@
    this->time_nbor2.stop();

    this->time_ellipsoid2.start();
-    this->k_ellipsoid_sphere.set_size(GX,BX);
-    this->k_ellipsoid_sphere.run(&this->atom->x, &this->atom->quat,
+    this->k_elps_sphere_sel->set_size(GX,BX);
+    this->k_elps_sphere_sel->run(&this->atom->x, &this->atom->quat,
                                 &this->shape, &this->well, &this->special_lj,
                                 &this->sigma_epsilon, &this->_lj_types,
                                 &this->nbor->dev_nbor, &stride,
@@ -233,7 +223,7 @@
      this->time_nbor3.zero();
      this->time_ellipsoid3.zero();
      this->time_lj.zero();
-      return;
+      return ainum;
    }

    // ------------ SPHERE_ELLIPSE ---------------
@@ -249,8 +239,8 @@
    this->time_nbor3.stop();

    this->time_ellipsoid3.start();
-    this->k_sphere_ellipsoid.set_size(GX,BX);
-    this->k_sphere_ellipsoid.run(&this->atom->x, &this->atom->quat,
+    this->k_sphere_elps_sel->set_size(GX,BX);
+    this->k_sphere_elps_sel->run(&this->atom->x, &this->atom->quat,
                                 &this->shape, &this->well, &this->special_lj,
                                 &this->sigma_epsilon, &this->_lj_types,
                                 &this->nbor->dev_nbor, &stride,
@@ -277,8 +267,8 @@
    this->time_lj.start();
    if (this->_last_ellipse<this->ans->inum()) {
      if (this->_shared_types) {
-        this->k_lj_fast.set_size(GX,BX);
-        this->k_lj_fast.run(&this->atom->x, &this->lj1, &this->lj3,
+        this->k_lj_sel->set_size(GX,BX);
+        this->k_lj_sel->run(&this->atom->x, &this->lj1, &this->lj3,
                            &this->special_lj, &stride,
                            &this->nbor->dev_packed, &this->ans->force,
                            &this->ans->engv, &this->dev_error,
@@ -303,8 +293,8 @@
                          ELLIPSE_ELLIPSE,_shared_types,_lj_types);
    this->time_nbor1.stop();
    this->time_ellipsoid.start();
-    this->k_ellipsoid.set_size(GX,BX);
-    this->k_ellipsoid.run(&this->atom->x, &this->atom->quat,
+    this->k_elps_sel->set_size(GX,BX);
+    this->k_elps_sel->run(&this->atom->x, &this->atom->quat,
                          &this->shape, &this->well, &this->special_lj,
                          &this->sigma_epsilon, &this->_lj_types,
                          &this->nbor->dev_nbor, &stride, &this->ans->force,
@@ -312,6 +302,7 @@
                          &eflag, &vflag, &ainum, &this->_threads_per_atom);
    this->time_ellipsoid.stop();
  }
+  return ainum;
}

template class RESquared<PRECISION,ACC_PRECISION>;
diff --git a/lib/gpu/lal_re_squared.cu
b/lib/gpu/lal_re_squared.cu index 8852a46913..c69a338749 100644 --- a/lib/gpu/lal_re_squared.cu +++ b/lib/gpu/lal_re_squared.cu @@ -51,33 +51,30 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -452,8 +449,7 @@ __kernel void k_resquared(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_t(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, + vflag,ans,engv,inum); } - diff --git a/lib/gpu/lal_re_squared.h b/lib/gpu/lal_re_squared.h index 9e4f4af67a..1b0a837764 100644 --- a/lib/gpu/lal_re_squared.h +++ b/lib/gpu/lal_re_squared.h @@ -82,7 +82,7 @@ class RESquared : public BaseEllipsoid { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_re_squared_lj.cu b/lib/gpu/lal_re_squared_lj.cu index 112a4db8d9..ca1b08facd 100644 --- a/lib/gpu/lal_re_squared_lj.cu +++ b/lib/gpu/lal_re_squared_lj.cu @@ -17,12 +17,18 @@ #include "lal_ellipsoid_extra.h" #endif -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) +#define local_allocate_store_ellipse_lj local_allocate_store_ellipse +#else +#define local_allocate_store_ellipse_lj() \ + __local acctyp red_acc[7][BLOCK_ELLIPSE / SIMD_SIZE]; +#endif + +#if (SHUFFLE_AVAIL == 0) #define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ if (t_per_atom>1) { \ - __local acctyp red_acc[7][BLOCK_PAIR]; \ red_acc[0][tid]=f.x; \ red_acc[1][tid]=f.y; \ red_acc[2][tid]=f.z; \ @@ -30,6 +36,7 @@ red_acc[4][tid]=tor.y; \ red_acc[5][tid]=tor.z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ for (int r=0; r<6; r++) \ red_acc[r][tid] += red_acc[r][tid+s]; \ @@ -41,28 +48,39 @@ tor.x=red_acc[3][tid]; \ tor.y=red_acc[4][tid]; \ tor.z=red_acc[5][tid]; \ - if (eflag>0 || vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - red_acc[6][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<7; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + if (EVFLAG && (eflag || vflag)) { \ + if (vflag) { \ + simdsync(); \ + for (int r=0; r<6; r++) \ + red_acc[r][tid]=virial[r]; \ + for 
(unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) { \ + for (int r=0; r<6; r++) \ + red_acc[r][tid] += red_acc[r][tid+s]; \ + } \ + } \ + for (int r=0; r<6; r++) \ + virial[r]=red_acc[r][tid]; \ + } \ + if (eflag) { \ + simdsync(); \ + red_acc[0][tid]=energy; \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ + if (offset < s) red_acc[0][tid] += red_acc[0][tid+s]; \ } \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - energy=red_acc[6][tid]; \ + energy=red_acc[0][tid]; \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -82,32 +100,32 @@ #else -#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ - t_per_atom, offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - tor.x += shfl_xor(tor.x, s, t_per_atom); \ - tor.y += shfl_xor(tor.y, s, t_per_atom); \ - tor.z += shfl_xor(tor.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ +#define store_answers_rt(f, tor, energy, virial, ii, astride, tid, \ + t_per_atom, offset, eflag, vflag, ans, engv, inum) \ + if (t_per_atom>1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + f.x += shfl_down(f.x, s, t_per_atom); \ + f.y += shfl_down(f.y, s, t_per_atom); \ + f.z += shfl_down(f.z, s, t_per_atom); \ + tor.x += shfl_down(tor.x, s, t_per_atom); \ + tor.y += shfl_down(tor.y, s, t_per_atom); \ + tor.z += shfl_down(tor.z, s, t_per_atom); \ + energy += shfl_down(energy, s, t_per_atom); \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + virial[r] += shfl_down(virial[r], s, t_per_atom); \ } \ } \ } \ - if (offset==0) { \ + if (offset==0 && ii0) { \ + if (EVFLAG && eflag) { \ *ap1+=energy*(acctyp)0.5; \ ap1+=astride; \ } \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ for (int i=0; i<6; i++) { \ *ap1+=virial[i]*(acctyp)0.5; \ ap1+=astride; \ @@ -147,35 +165,34 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); - solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); - solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); + const numtyp solv_f_a = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + const numtyp solv_f_r = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp energy=(acctyp)0; - acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp4 tor; - tor.x=(acctyp)0; - tor.y=(acctyp)0; - tor.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp4 f, tor; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + tor.x=(acctyp)0; tor.y=(acctyp)0; tor.z=(acctyp)0; + acctyp energy, virial[6]; + if 
(EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -378,9 +395,9 @@ __kernel void k_resquared_ellipsoid_sphere(const __global numtyp4 *restrict x_, } } // for nbor - store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset,eflag, - vflag,ans,engv); } // if ii + store_answers_rt(f,tor,energy,virial,ii,astride,tid,t_per_atom,offset, + eflag,vflag,ans,engv,inum); } __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, @@ -403,31 +420,33 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse_lj(); + sp_lj[0]=splj[0]; sp_lj[1]=splj[1]; sp_lj[2]=splj[2]; sp_lj[3]=splj[3]; - __local numtyp b_alpha, cr60, solv_f_a, solv_f_r; - b_alpha=(numtyp)45.0/(numtyp)56.0; - cr60=ucl_cbrt((numtyp)60.0); - solv_f_a = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); - solv_f_r = (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); + const numtyp b_alpha=(numtyp)45.0/(numtyp)56.0; + const numtyp cr60=ucl_cbrt((numtyp)60.0); + const numtyp solv_f_a = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*-(numtyp)36.0); + const numtyp solv_f_r = + (numtyp)3.0/((numtyp)16.0*ucl_atan((numtyp)1.0)*(numtyp)2025.0); - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) + if (EVFLAG && vflag) virial[0]+=-r[0]*force; } else if (i==1) { f.y+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[1]+=-r[1]*force; virial[3]+=-r[0]*force; } } else { f.z+=force; - if (vflag>0) { + if (EVFLAG && vflag) { virial[2]+=-r[2]*force; virial[4]+=-r[0]*force; virial[5]+=-r[1]*force; @@ -579,9 +598,9 @@ __kernel void k_resquared_sphere_ellipsoid(const __global numtyp4 *restrict x_, } } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, @@ -601,26 +620,27 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, ii+=start; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_ellipse(); + sp_lj[0]=gum[0]; sp_lj[1]=gum[1]; sp_lj[2]=gum[2]; sp_lj[3]=gum[3]; - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[ii].x*r6inv-lj3[ii].y); energy+=factor_lj*(e-lj3[ii].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -666,9 +686,9 @@ __kernel void k_resquared_lj(const __global numtyp4 *restrict x_, } } } // for nbor - 
acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, @@ -690,31 +710,32 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, __local numtyp sp_lj[4]; __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_ellipse(); + if (tid<4) sp_lj[tid]=gum[tid]; if (tid0) + if (EVFLAG && eflag) lj3[tid]=lj3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; - f.x=(acctyp)0; - f.y=(acctyp)0; - f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y); energy+=factor_lj*(e-lj3[mtype].z); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -760,8 +781,7 @@ __kernel void k_resquared_lj_fast(const __global numtyp4 *restrict x_, } } // for nbor - acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + acc_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_soft.cpp b/lib/gpu/lal_soft.cpp index 8e944fa0a5..e77be5a011 100644 --- a/lib/gpu/lal_soft.cpp +++ b/lib/gpu/lal_soft.cpp @@ -121,20 +121,9 @@ double SoftT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void SoftT::loop(const bool _eflag, const bool _vflag) { +int SoftT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -142,8 +131,8 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom); @@ -155,6 +144,7 @@ void SoftT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Soft; diff --git a/lib/gpu/lal_soft.cu b/lib/gpu/lal_soft.cu index 5df34e7b1d..74ac0e0c97 100644 --- a/lib/gpu/lal_soft.cu +++ b/lib/gpu/lal_soft.cu @@ -40,22 +40,25 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if 
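/* Illustrative sketch (not part of the patch): the shape shared by every
   loop() rewrite in this change, as seen in lal_soft.cpp above. eflag and
   vflag now pass straight through as ints, and the computed grid size GX is
   returned to the caller. Class and member names are illustrative. */
template <class numtyp, class acctyp>
int Example<numtyp,acctyp>::loop(const int eflag, const int vflag) {
  const int BX = this->block_size();   // threads per block
  const int GX = static_cast<int>(ceil(static_cast<double>(this->ans->inum()) /
                                       (BX / this->_threads_per_atom)));
  this->time_pair.start();
  this->k_pair_sel->set_size(GX, BX);  // k_pair_sel picks the compiled variant
  // this->k_pair_sel->run(..., &eflag, &vflag, ...);
  this->time_pair.stop();
  return GX;                           // callers reuse the grid size
}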
(EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -106,9 +109,9 @@ __kernel void k_soft(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_soft_fast(const __global numtyp4 *restrict x_, @@ -125,25 +128,28 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x * ((numtyp)1.0+cos(arg)); energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -194,8 +200,8 @@ __kernel void k_soft_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_soft.h b/lib/gpu/lal_soft.h index b33314ee03..fd86f62927 100644 --- a/lib/gpu/lal_soft.h +++ b/lib/gpu/lal_soft.h @@ -73,7 +73,7 @@ class Soft : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_soft_ext.cpp b/lib/gpu/lal_soft_ext.cpp index 7c0cbe7973..a32a5e5a00 100644 --- a/lib/gpu/lal_soft_ext.cpp +++ b/lib/gpu/lal_soft_ext.cpp @@ -55,7 +55,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, int init_ok=0; if (world_me==0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, - special_lj, inum, nall, 300, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **host_prefactor, } if (gpu_rank==i && world_me!=0) init_ok=SLMF.init(ntypes, cutsq, host_prefactor, host_cut, - special_lj, inum, nall, 300, maxspecial, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); SLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_sw.cpp b/lib/gpu/lal_sw.cpp index 5c7bd45c76..eb42c710cc 100644 --- a/lib/gpu/lal_sw.cpp +++ b/lib/gpu/lal_sw.cpp @@ -43,114 +43,83 @@ int SWT::bytes_per_atom(const int max_nbors) const { } template -int SWT::init(const int ntypes, const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *_screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* epsilon, const double* sigma, - const double* lambda, const double* gamma, - const double* costheta, const double* biga, - const double* bigb, const double* powerp, - const double* powerq, const double* cut, const double* cutsq) -{ +int SWT::init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *_screen, double **ncutsq, + double **ncut, double **sigma, double **powerp, double **powerq, + double 
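/* Illustrative sketch, assuming EVFLAG is a 0/1 compile-time constant baked
   into each kernel build: when EVFLAG is 0 the guarded accumulators and all
   energy/virial stores become dead code the compiler drops, which is the
   point of the "noev" kernel variants elsewhere in this patch. */
__kernel void k_sketch(__global acctyp *engv, const int eflag, const int vflag) {
  acctyp energy, virial[6];
  if (EVFLAG) {                       // removed entirely when EVFLAG == 0
    energy = (acctyp)0;
    for (int i = 0; i < 6; i++) virial[i] = (acctyp)0;
  }
  /* ... force computation ... */
  if (EVFLAG && vflag) {              // runtime vflag tested only if compiled in
    /* virial[0] += delx*delx*force; ... */
  }
  if (EVFLAG && eflag) {
    /* energy += ...; */
  }
}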
**sigma_gamma, double **c1, double **c2, double **c3, + double **c4, double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param) { + _lj_types=ntypes; + + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,sw,"k_sw","k_sw_three_center", - "k_sw_three_end","k_sw_short_nbor"); + "k_sw_three_end","k_sw_short_nbor",onetype, + onetype3,spq); if (success!=0) return success; - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; + UCL_H_Vec host_write(ntypes*ntypes*ntypes*4,*(this->ucl_device), + UCL_WRITE_ONLY); + host_write.zero(); - _nparams = nparams; - _nelements = nelements; - - UCL_H_Vec dview(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - - for (int i=0; i 0.0 && ncutsq[i][j]>=ccutsq) + ncutsq[i][j]=ccutsq*0.98; } // pack coefficients into arrays - sw1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i(epsilon[i]); - dview[i].y=static_cast(sigma[i]); - dview[i].z=static_cast(lambda[i]); - dview[i].w=static_cast(gamma[i]); - } - - ucl_copy(sw1,dview,false); - sw1_tex.get_texture(*(this->pair_program),"sw1_tex"); - sw1_tex.bind_float(sw1,4); - - sw2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i(biga[i]); - dview[i].y=static_cast(bigb[i]); - dview[i].z=static_cast(powerp[i]); - dview[i].w=static_cast(powerq[i]); - } - - ucl_copy(sw2,dview,false); - sw2_tex.get_texture(*(this->pair_program),"sw2_tex"); - sw2_tex.bind_float(sw2,4); - - sw3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - - for (int i=0; i=sw_cut*sw_cut) - sw_cutsq=sw_cut*sw_cut-1e-4; - dview[i].x=static_cast(sw_cut); - dview[i].y=static_cast(sw_cutsq); - dview[i].z=static_cast(costheta[i]); - dview[i].w=(numtyp)0; - } - - ucl_copy(sw3,dview,false); - sw3_tex.get_texture(*(this->pair_program),"sw3_tex"); - sw3_tex.bind_float(sw3,4); - - UCL_H_Vec dview_elem2param(nelements*nelements*nelements, - *(this->ucl_device), UCL_WRITE_ONLY); - - elem2param.alloc(nelements*nelements*nelements,*(this->ucl_device), - UCL_READ_ONLY); - - for (int i = 0; i < nelements; i++) - for (int j = 0; j < nelements; j++) - for (int k = 0; k < nelements; k++) { - int idx = i*nelements*nelements+j*nelements+k; - dview_elem2param[idx] = host_elem2param[i][j][k]; - } - - ucl_copy(elem2param,dview_elem2param,false); - - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); - for (int i = 0; i < ntypes; i++) - dview_map[i] = host_map[i]; - - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); - ucl_copy(map,dview_map,false); + cutsq.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack1(ntypes,ntypes,cutsq,host_write,ncutsq); + sw_pre.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,ntypes,sw_pre,host_write,ncut,sigma, + powerp,powerq); + c_14.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack4(ntypes,ntypes,c_14,host_write,c1,c2,c3,c4); + c_56.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,ntypes,c_56,host_write,c5,c6); + 
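/* Rough reconstruction of the idea behind the onetype scan above (the exact
   loop body was lost in transmission, so treat this as an assumption): walk
   the type map once, count distinct mapped elements, and keep a single-type
   shortcut only if exactly one parameter row is ever used. That index can
   then be compiled in as ONETYPE/ONETYPE3 so the kernels hard-code its
   constants instead of indexing parameter tables. */
int onetype = -1, mtypes = 0;
for (int i = 1; i < ntypes; i++) {
  if (map[i] < 0) continue;          // type unused by this pair style
  if (onetype < 0) { onetype = i * ntypes + i; mtypes = 1; }
  else if (onetype != i * ntypes + i) mtypes++;
}
if (mtypes > 1) onetype = -1;        // more than one element: no specialization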
cut_sigma_gamma.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,ntypes,cut_sigma_gamma,host_write,ncut, + sigma_gamma); + sw_pre3.alloc(ntypes*ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + this->atom->type_pack2(ntypes,sw_pre3,host_write,lambda_epsilon,costheta); _allocated=true; - this->_max_bytes=sw1.row_bytes()+sw2.row_bytes()+sw3.row_bytes()+ - map.row_bytes()+elem2param.row_bytes(); + this->_max_bytes=cutsq.row_bytes()+sw_pre.row_bytes()+c_14.row_bytes()+ + c_56.row_bytes()+cut_sigma_gamma.row_bytes()+sw_pre3.row_bytes(); return 0; } @@ -160,11 +129,12 @@ void SWT::clear() { return; _allocated=false; - sw1.clear(); - sw2.clear(); - sw3.clear(); - map.clear(); - elem2param.clear(); + cutsq.clear(); + sw_pre.clear(); + c_14.clear(); + c_56.clear(); + cut_sigma_gamma.clear(); + sw_pre3.clear(); this->clear_atomic(); } @@ -179,58 +149,33 @@ double SWT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; +int SWT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &sw3, &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, &ainum, - &nbor_pitch, &this->_threads_per_atom); + this->k_short_nbor.run(&this->atom->x, &cutsq, &_lj_types, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - this->time_pair.start(); - - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); - BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom); + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &cut_sigma_gamma, &sw_pre3, + &_lj_types, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &evatom); Answer *end_ans; #ifdef THREE_CONCURRENT @@ -240,25 +185,32 
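/* A minimal model (assumed, based on the type_pack4/type_pack2 calls above)
   of how per-pair host tables end up as one packed vector per (i,j) type
   pair; the real work is done by Atom::type_pack4 and friends, so this is a
   schematic of the layout, not the library routine. */
for (int i = 0; i < ntypes; i++)
  for (int j = 0; j < ntypes; j++) {
    const int idx = i * ntypes + j;   // row-major (i,j) -> flat index
    host_write[idx].x = ncut[i][j];   // one vector fetch on the device
    host_write[idx].y = sigma[i][j];  // replaces four separate lookups
    host_write[idx].z = powerp[i][j];
    host_write[idx].w = powerq[i][j];
  }
/* a ucl_copy(...) then uploads host_write into the UCL_D_Vec buffer */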
@@ void SWT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); - + this->k_three_end_vatom.run(&this->atom->x, &cut_sigma_gamma, + &sw_pre3, &_lj_types, &this->nbor->dev_nbor, + &this->nbor->three_ilist, &end_ans->force, + &end_ans->engv, &eflag, &vflag, &ainum, + &nbor_pitch,&this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &sw1, &sw2, &sw3, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); - + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &cut_sigma_gamma, &sw_pre3, + &_lj_types, &this->nbor->dev_nbor, + &this->nbor->three_ilist, &end_ans->force, + &end_ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &sw_pre, &c_14, &c_56, + &_lj_types, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class SW; diff --git a/lib/gpu/lal_sw.cu b/lib/gpu/lal_sw.cu index 2b38bd02dc..621ba87208 100644 --- a/lib/gpu/lal_sw.cu +++ b/lib/gpu/lal_sw.cu @@ -39,88 +39,161 @@ _texture( sw3_tex,int4); //#define THREE_CONCURRENT -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) 
{ \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii0) - energy+=(pre_sw_c5*rp - pre_sw_c6*rq) * expsrainv; - - if (vflag>0) { - virial[0] += delx*delx*force; - virial[1] += dely*dely*force; - virial[2] += delz*delz*force; - virial[3] += delx*dely*force; - virial[4] += delx*delz*force; - virial[5] += dely*delz*force; - } + if (EVFLAG && vflag) { + virial[0] += delx*delx*force; + virial[1] += dely*dely*force; + virial[2] += delz*delz*force; + virial[3] += delx*dely*force; + virial[4] += delx*delz*force; + virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } #define threebody(delr1x,delr1y,delr1z,delr2x,delr2y,delr2z, eflag, energy) \ @@ -334,7 +389,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \ numtyp frad1 = facrad*gsrainvsq1; \ numtyp frad2 = facrad*gsrainvsq2; \ - numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \ + numtyp facang = (numtyp)2.0 * sw_lambda_epsilon_ijk * facexp*delcs; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -349,9 +404,9 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fky = delr2y*(frad2+csfac2)-delr1y*facang12; \ fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \ \ - if (eflag>0) \ + if (EVFLAG && eflag) \ energy+=facrad; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ virial[0] += delr1x*fjx + delr2x*fkx; \ virial[1] += delr1y*fjy + delr2y*fky; \ 
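/* Minimal CUDA sketch of the shuffle reduction used when SHUFFLE_AVAIL is
   set; shfl_down in the macros above wraps __shfl_down_sync. After
   log2(t_per_atom) halving steps the first lane of each t_per_atom-wide
   group holds the group sum, with no shared memory or block barrier. */
__device__ float group_sum(float v, const int t_per_atom) {
  for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1)
    v += __shfl_down_sync(0xffffffff, v, s, t_per_atom);
  return v;   // valid in lane 0 of the group
}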
virial[2] += delr1z*fjz + delr2z*fkz; \ @@ -384,7 +439,7 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, \ numtyp facrad = sw_lambda_epsilon_ijk * facexp*delcssq; \ numtyp frad1 = facrad*gsrainvsq1; \ - numtyp facang = sw_lambda_epsilon2_ijk * facexp*delcs; \ + numtyp facang = (numtyp)2.0 * sw_lambda_epsilon_ijk * facexp*delcs; \ numtyp facang12 = rinv12*facang; \ numtyp csfacang = cs*facang; \ numtyp csfac1 = rinvsq1*csfacang; \ @@ -394,67 +449,68 @@ __kernel void k_sw(const __global numtyp4 *restrict x_, fjz = delr1z*(frad1+csfac1)-delr2z*facang12; \ } +#ifdef ONETYPE +#define sw_cut_ij sw_cut +#define sw_cut_ik sw_cut +#define sw_sigma_gamma_ij sw_sigma_gamma +#define sw_sigma_gamma_ik sw_sigma_gamma +#endif + __kernel void k_sw_three_center(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict sw1, - const __global numtyp4 *restrict sw2, - const __global numtyp4 *restrict sw3, - const __global int *restrict map, - const __global int *restrict elem2param, - const int nelements, + const __global numtyp2 *restrict cut_sig_gamma, + const __global numtyp2 *restrict sw_pre3, + const int ntypes, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp sw_sigma_gamma_ij, sw_cut_ij, sw_sigma_gamma_ik, sw_cut_ik; - numtyp sw_costheta_ijk, sw_lambda_epsilon_ijk, sw_lambda_epsilon2_ijk; + int n_stride; + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); + local_allocate_store_three(); int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - acctyp energy=(acctyp)0; + #ifdef ONETYPE + const numtyp sw_cut=cut_sig_gamma[ONETYPE].x; + const numtyp sw_sigma_gamma=cut_sig_gamma[ONETYPE].y; + const numtyp sw_lambda_epsilon_ijk=sw_pre3[ONETYPE3].x; + const numtyp sw_costheta_ijk=sw_pre3[ONETYPE3].y; + #endif + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; - - __syncthreads(); + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii sw3_ijparam.y) continue; + int nbor_k; + nbor_k = nbor_j-offset_j+offset_k; + if (nbor_k<=nbor_j) nbor_k += n_stride; - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,k_end; - if (dev_packed==dev_nbor) { - nbor_k=nborj_start-offset_j+offset_k; - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } else { - nbor_k = nbor_j-offset_j+offset_k; - if (nbor_k<=nbor_j) nbor_k += n_stride; - k_end = nbor_end; - } - - for ( ; nbor_k sw3_ijparam.y) continue; - - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,numk; - if (dev_nbor==dev_packed) { - if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); - k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); - nbor_k+=offset_k; - } else { - nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch; - nbor_k=dev_nbor[nbor_k]; - k_end=nbor_k+numk; - nbor_k+=offset_k; 
- } - - // recalculate numk and k_end for the use of short neighbor list - if (dev_packed==dev_nbor) { - numk = dev_short_nbor[nbor_k]; - nbor_k += n_stride; - k_end = nbor_k+fast_mul(numk,n_stride); - } + int nbor_k; + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; + const int numk=dev_nbor[nbor_k]; + nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); + k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk&(t_per_atom-1)); + nbor_k+=offset_k; for ( ; nbor_k sw3_ijparam.y) continue; - - numtyp4 sw1_ijparam; fetch4(sw1_ijparam,ijparam,sw1_tex); - sw_sigma_gamma_ij=sw1_ijparam.y*sw1_ijparam.w; //sw_sigma*sw_gamma; - sw_cut_ij=sw3_ijparam.x; - - int nbor_k,numk; - if (dev_nbor==dev_packed) { - if (gpu_nbor) nbor_k=j+nbor_pitch; - else nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); - k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk & (t_per_atom-1)); - nbor_k+=offset_k; - } else { - nbor_k=dev_ilist[j]+nbor_pitch; - numk=dev_nbor[nbor_k]; - nbor_k+=nbor_pitch; - nbor_k=dev_nbor[nbor_k]; - k_end=nbor_k+numk; - nbor_k+=offset_k; - } - - // recalculate numk and k_end for the use of short neighbor list - if (dev_packed==dev_nbor) { - numk = dev_short_nbor[nbor_k]; - nbor_k += n_stride; - k_end = nbor_k+fast_mul(numk,n_stride); - } + int nbor_k; + if (gpu_nbor) nbor_k=j+nbor_pitch; + else nbor_k=dev_ilist[j]+nbor_pitch; + const int numk=dev_nbor[nbor_k]; + nbor_k+=nbor_pitch+fast_mul(j,t_per_atom-1); + k_end=nbor_k+fast_mul(numk/t_per_atom,n_stride)+(numk&(t_per_atom-1)); + nbor_k+=offset_k; for ( ; nbor_k { * - -3 if there is an out of memory error * - -4 if the GPU library was not compiled for GPU * - -5 Double precision is not supported on card **/ - int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, - const double cell_size, const double gpu_split, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* epsilon, const double* sigma, - const double* lambda, const double* gamma, - const double* costheta, const double* biga, - const double* bigb, const double* powerp, - const double* powerq, const double* cut, const double* cutsq); + int init(const int ntypes, const int nlocal, const int nall, + const int max_nbors, const double cell_size, + const double gpu_split, FILE *screen, double **ncutsq, + double **ncut, double **sigma, double **powerp, double **powerq, + double **sigma_gamma, double **c1, double **c2, double **c3, + double **c4, double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param); /// Clear all host and device data /** \note This is called at the beginning of the init() routine **/ @@ -64,22 +63,21 @@ class SW : public BaseThree { /// Number of atom types int _lj_types; - /// sw1.x = epsilon, sw1.y = sigma, sw1.z = lambda, sw1.w = gamma - UCL_D_Vec sw1; - /// sw2.x = biga, sw2.y = bigb, sw2.z = powerp, sw2.w = powerq - UCL_D_Vec sw2; - /// sw3.x = cut, sw3.y = cutsq, sw3.z = costheta - UCL_D_Vec sw3; - - UCL_D_Vec elem2param; - UCL_D_Vec map; - int _nparams,_nelements; - - UCL_Texture sw1_tex, sw2_tex, sw3_tex; + UCL_D_Vec cutsq; + /// sw_pre.x = cut, sw_pre.y = sigma, sw_pre.z = powerp, sw_pre.w = powerq + UCL_D_Vec sw_pre; + /// c_14.x = c1, c_14.y = c2, c_14.z = c3, c_14.w = c4 + UCL_D_Vec c_14; + /// c_56.x = c5, c_56.y = c6 + UCL_D_Vec c_56; + /// cut_sigma_gamma.x = cut, cut_sigma_gamma.y = sigma_gamma + UCL_D_Vec cut_sigma_gamma; + /// sw_pre3.x = lambda_epsilon, 
sw_pre3.y = costheta + UCL_D_Vec sw_pre3; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; diff --git a/lib/gpu/lal_sw_ext.cpp b/lib/gpu/lal_sw_ext.cpp index 1935ed615b..5158f135a3 100644 --- a/lib/gpu/lal_sw_ext.cpp +++ b/lib/gpu/lal_sw_ext.cpp @@ -27,15 +27,13 @@ static SW SWMF; // --------------------------------------------------------------------------- // Allocate memory on host and device and copy constants to device // --------------------------------------------------------------------------- -int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* sw_epsilon, const double* sw_sigma, - const double* sw_lambda, const double* sw_gamma, - const double* sw_costheta, const double* sw_biga, - const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, - const double* sw_cutsq) { +int sw_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, int &gpu_mode, + FILE *screen, double **ncutsq, double **ncut, double **sigma, + double **powerp, double **powerq, double **sigma_gamma, + double **c1, double **c2, double **c3, double **c4, + double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param) { SWMF.clear(); gpu_mode=SWMF.device->gpu_mode(); double gpu_split=SWMF.device->particle_split(); @@ -62,10 +60,10 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ int init_ok=0; if (world_me==0) - init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, - host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, sw_cutsq); + init_ok=SWMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, + screen, ncutsq, ncut, sigma, powerp, powerq, + sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon, + costheta, map, e2param); SWMF.device->world_barrier(); if (message) @@ -81,11 +79,10 @@ int sw_gpu_init(const int ntypes, const int inum, const int nall, const int max_ fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=SWMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, - host_map, nelements, host_elem2param, nparams, - sw_epsilon, sw_sigma, sw_lambda, sw_gamma, sw_costheta, - sw_biga, sw_bigb, sw_powerp, sw_powerq, sw_cut, - sw_cutsq); + init_ok=SWMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, + screen, ncutsq, ncut, sigma, powerp, powerq, + sigma_gamma, c1, c2, c3, c4, c5, c6, lambda_epsilon, + costheta, map, e2param); SWMF.device->gpu_barrier(); if (message) @@ -127,5 +124,3 @@ void sw_gpu_compute(const int ago, const int nlocal, const int nall, double sw_gpu_bytes() { return SWMF.host_memory_usage(); } - - diff --git a/lib/gpu/lal_table.cpp b/lib/gpu/lal_table.cpp index d07b2716e4..0c336c6990 100644 --- a/lib/gpu/lal_table.cpp +++ b/lib/gpu/lal_table.cpp @@ -69,6 +69,20 @@ int TableT::init(const int ntypes, k_pair_spline_fast.set_function(*(this->pair_program),"k_table_spline_fast"); k_pair_bitmap.set_function(*(this->pair_program),"k_table_bitmap"); k_pair_bitmap_fast.set_function(*(this->pair_program),"k_table_bitmap_fast"); + + #if defined(LAL_OCL_EV_JIT) + 
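/* Example (assumed usage, mirroring the init_ok flow above) of acting on
   the documented init() return codes from the host side; the reporting
   calls are hypothetical stand-ins for however the caller handles errors. */
int init_ok = SWMF.init(/* ... */);
if (init_ok == -3)
  fprintf(screen, "GPU init failed: out of memory\n");
else if (init_ok == -4)
  fprintf(screen, "GPU init failed: library not compiled for GPU\n");
else if (init_ok == -5)
  fprintf(screen, "GPU init failed: double precision unsupported on card\n");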
k_pair_linear_noev.set_function(*(this->pair_program_noev), + "k_table_linear_fast"); + k_pair_spline_noev.set_function(*(this->pair_program_noev), + "k_table_spline_fast"); + k_pair_bitmap_noev.set_function(*(this->pair_program_noev), + "k_table_bitmap_fast"); + #else + k_pair_linear_sel = &k_pair_linear_fast; + k_pair_spline_sel = &k_pair_spline_fast; + k_pair_bitmap_sel = &k_pair_bitmap_fast; + #endif + _compiled_styles = true; // If atom type constants fit in shared memory use fast kernel @@ -228,6 +242,11 @@ void TableT::clear() { k_pair_spline.clear(); k_pair_bitmap_fast.clear(); k_pair_bitmap.clear(); + #if defined(LAL_OCL_EV_JIT) + k_pair_linear_noev.clear(); + k_pair_spline_noev.clear(); + k_pair_bitmap_noev.clear(); + #endif _compiled_styles=false; } @@ -243,19 +262,22 @@ double TableT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TableT::loop(const bool _eflag, const bool _vflag) { +int TableT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - if (_vflag) - vflag=1; - else - vflag=0; + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) { + k_pair_linear_sel = &k_pair_linear_fast; + k_pair_spline_sel = &k_pair_spline_fast; + k_pair_bitmap_sel = &k_pair_bitmap_fast; + } else { + k_pair_linear_sel = &k_pair_linear_noev; + k_pair_spline_sel = &k_pair_spline_noev; + k_pair_bitmap_sel = &k_pair_bitmap_noev; + } + #endif + int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -265,37 +287,37 @@ void TableT::loop(const bool _eflag, const bool _vflag) { this->time_pair.start(); if (shared_types) { if (_tabstyle == LOOKUP) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &tabindex, &coeff2, &coeff3, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &tabindex, &coeff2, &coeff3, &coeff4, &cutsq, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_tablength); } else if (_tabstyle == LINEAR) { - this->k_pair_linear_fast.set_size(GX,BX); - this->k_pair_linear_fast.run(&this->atom->x, &tabindex, &coeff2, - &coeff3, &coeff4, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_linear_sel->set_size(GX,BX); + k_pair_linear_sel->run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } else if (_tabstyle == SPLINE) { - this->k_pair_spline_fast.set_size(GX,BX); - this->k_pair_spline_fast.run(&this->atom->x, &tabindex, &coeff2, - &coeff3, &coeff4, &cutsq, &sp_lj, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_spline_sel->set_size(GX,BX); + k_pair_spline_sel->run(&this->atom->x, &tabindex, &coeff2, + &coeff3, &coeff4, &cutsq, &sp_lj, + &this->nbor->dev_nbor, &this->_nbor_data->begin(), + &this->ans->force, &this->ans->engv, + &eflag, &vflag, &ainum, &nbor_pitch, + 
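/* Compact restatement of the LAL_OCL_EV_JIT pattern above: OpenCL builds the
   pair program twice (with and without energy/virial code), and a plain
   kernel pointer is flipped per call depending on whether the caller asked
   for eflag/vflag. On the CUDA side the same effect comes from the
   compile-time EVFLAG constant instead of a second program. */
#if defined(LAL_OCL_EV_JIT)
  if (eflag || vflag) k_pair_linear_sel = &k_pair_linear_fast;  // EV build
  else                k_pair_linear_sel = &k_pair_linear_noev;  // stripped build
#endif
/* the call site is unchanged: k_pair_linear_sel->set_size(GX,BX); ...->run(...); */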
&this->_threads_per_atom, &_tablength); } else if (_tabstyle == BITMAP) { - this->k_pair_bitmap_fast.set_size(GX,BX); - this->k_pair_bitmap_fast.run(&this->atom->x, &tabindex, &nshiftbits, - &nmask, &coeff2, &coeff3, &coeff4, &cutsq, - &sp_lj, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), &this->ans->force, - &this->ans->engv, &eflag, &vflag, - &ainum, &nbor_pitch, - &this->_threads_per_atom, &_tablength); + k_pair_bitmap_sel->set_size(GX,BX); + k_pair_bitmap_sel->run(&this->atom->x, &tabindex, &nshiftbits, + &nmask, &coeff2, &coeff3, &coeff4, &cutsq, + &sp_lj, &this->nbor->dev_nbor, + &this->_nbor_data->begin(), &this->ans->force, + &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, + &this->_threads_per_atom, &_tablength); } } else { if (_tabstyle == LOOKUP) { @@ -334,6 +356,7 @@ void TableT::loop(const bool _eflag, const bool _vflag) { } } this->time_pair.stop(); + return GX; } template class Table; diff --git a/lib/gpu/lal_table.cu b/lib/gpu/lal_table.cu index 0cf0de2af0..eb29218712 100644 --- a/lib/gpu/lal_table.cu +++ b/lib/gpu/lal_table.cu @@ -58,24 +58,27 @@ __kernel void k_table(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -129,9 +132,9 @@ __kernel void k_table(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_fast(const __global numtyp4 *restrict x_, @@ -153,18 +156,22 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -228,9 +234,9 @@ __kernel void k_table_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- LINEAR ------------------------------------------------- @@ -254,24 +260,27 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - 
virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -329,9 +338,9 @@ __kernel void k_table_linear(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, @@ -353,18 +362,22 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -432,9 +444,9 @@ __kernel void k_table_linear_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- SPLINE ------------------------------------------------- @@ -458,24 +470,27 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) { e = a * coeff3[idx].y + b * coeff3[idx+1].y + @@ -529,7 +544,7 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -540,9 +555,9 @@ __kernel void k_table_spline(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_spline_fast(const __global numtyp4 *x_, @@ -564,19 +579,22 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable < tlm1) { e = a * coeff3[idx].y + b * coeff3[idx+1].y + @@ -639,7 +656,7 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, } energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { 
virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -650,9 +667,9 @@ __kernel void k_table_spline_fast(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } /// ---------------- BITMAP ------------------------------------------------- @@ -678,24 +695,27 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } int tlm1 = tablength - 1; if (ii0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -756,9 +776,9 @@ __kernel void k_table_bitmap(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, @@ -782,18 +802,22 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e = (numtyp)0.0; if (itable <= tlm1) e = coeff3[idx].y + fraction*coeff4[idx].y; energy+=factor_lj*e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -864,7 +887,7 @@ __kernel void k_table_bitmap_fast(const __global numtyp4 *x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_table.h b/lib/gpu/lal_table.h index 38ae012bee..b67a369dad 100644 --- a/lib/gpu/lal_table.h +++ b/lib/gpu/lal_table.h @@ -56,9 +56,10 @@ class Table : public BaseAtomic { double host_memory_usage() const; // ------------------------- DEVICE KERNELS ------------------------- - UCL_Kernel k_pair_linear, k_pair_linear_fast; - UCL_Kernel k_pair_spline, k_pair_spline_fast; - UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast; + UCL_Kernel k_pair_linear, k_pair_linear_fast, k_pair_linear_noev; + UCL_Kernel k_pair_spline, k_pair_spline_fast, k_pair_spline_noev; + UCL_Kernel k_pair_bitmap, k_pair_bitmap_fast, k_pair_bitmap_noev; + UCL_Kernel *k_pair_linear_sel, *k_pair_spline_sel, *k_pair_bitmap_sel; // --------------------------- TYPE DATA -------------------------- @@ -90,7 +91,7 @@ class Table : public BaseAtomic { private: bool _allocated, _compiled_styles; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_table_ext.cpp b/lib/gpu/lal_table_ext.cpp index f067881b88..6237c4d7cd 
100644 --- a/lib/gpu/lal_table_ext.cpp +++ b/lib/gpu/lal_table_ext.cpp @@ -55,7 +55,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, int init_ok=0; if (world_me==0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->world_barrier(); @@ -73,7 +73,7 @@ int table_gpu_init(const int ntypes, double **cutsq, double ***table_coeffs, } if (gpu_rank==i && world_me!=0) init_ok=TBMF.init(ntypes, cutsq, table_coeffs, table_data, - special_lj, inum, nall, 300, maxspecial, cell_size, + special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, tabstyle, ntables, tablength); TBMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_tersoff.cpp b/lib/gpu/lal_tersoff.cpp index 63691a2047..e0e87d9148 100644 --- a/lib/gpu/lal_tersoff.cpp +++ b/lib/gpu/lal_tersoff.cpp @@ -39,7 +39,7 @@ TersoffT::~Tersoff() { template int TersoffT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -52,34 +52,82 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int const double* c, const double* d, const double* h, const double* gamma, const double* beta, const double* powern, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=0; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + if (onetype>=0) spq=powermint[onetype3]; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff,"k_tersoff_repulsive", "k_tersoff_three_center", "k_tersoff_three_end", - "k_tersoff_short_nbor"); + "k_tersoff_short_nbor",onetype,onetype3,spq,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) { + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); + _zetaij_eng.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); + } k_zeta.set_function(*(this->pair_program),"k_tersoff_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes=ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + + // -------------------------------------------------------------------- UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -90,32 +138,29 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int dview[i].w=(numtyp)0; } + // pack coefficients into arrays // pack coefficients into arrays 
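/* Rough reconstruction of the cutsq_pair fill above (the inner loop was
   garbled in transit, so the indexing here is an assumption): for each type
   pair (ii,jj), keep the largest host_cutsq over every three-body parameter
   set the pair can participate in, so the short neighbor list built from
   this per-pair table is safe for all of them. */
for (int ii = 1; ii < ntypes; ii++)
  for (int jj = 1; jj < ntypes; jj++) {
    numtyp maxcut = (numtyp)0;
    for (int kk = 0; kk < nelements; kk++) {
      const int ijkparam = host_elem2param[host_map[ii]][host_map[jj]][kk];
      if ((numtyp)host_cutsq[ijkparam] > maxcut)
        maxcut = (numtyp)host_cutsq[ijkparam];
    }
    host_write[ii * ntypes + jj] = maxcut;
  }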
  ts1.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
-    dview[i].x=static_cast<numtyp>(lam1[i]);
-    dview[i].y=static_cast<numtyp>(lam2[i]);
-    dview[i].z=static_cast<numtyp>(lam3[i]);
-    dview[i].w=static_cast<numtyp>(powermint[i]);
+    dview[i].x=static_cast<numtyp>(lam3[i]);
+    dview[i].y=static_cast<numtyp>(powermint[i]);
+    dview[i].z=static_cast<numtyp>(bigr[i]);
+    dview[i].w=static_cast<numtyp>(bigd[i]);
  }

  ucl_copy(ts1,dview,false);
-  ts1_tex.get_texture(*(this->pair_program),"ts1_tex");
-  ts1_tex.bind_float(ts1,4);

  ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(biga[i]);
-    dview[i].y=static_cast<numtyp>(bigb[i]);
+    dview[i].y=static_cast<numtyp>(lam1[i]);
    dview[i].z=static_cast<numtyp>(bigr[i]);
    dview[i].w=static_cast<numtyp>(bigd[i]);
  }

  ucl_copy(ts2,dview,false);
-  ts2_tex.get_texture(*(this->pair_program),"ts2_tex");
-  ts2_tex.bind_float(ts2,4);

  ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
@@ -127,46 +172,28 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
  }

  ucl_copy(ts3,dview,false);
-  ts3_tex.get_texture(*(this->pair_program),"ts3_tex");
-  ts3_tex.bind_float(ts3,4);

  ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
-    dview[i].x=static_cast<numtyp>(c[i]);
-    dview[i].y=static_cast<numtyp>(d[i]);
+    dview[i].x=static_cast<numtyp>(c[i]*c[i]);
+    dview[i].y=static_cast<numtyp>(d[i]*d[i]);
    dview[i].z=static_cast<numtyp>(h[i]);
    dview[i].w=static_cast<numtyp>(gamma[i]);
  }

  ucl_copy(ts4,dview,false);
-  ts4_tex.get_texture(*(this->pair_program),"ts4_tex");
-  ts4_tex.bind_float(ts4,4);

  ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
  for (int i=0; i<nparams; i++) {
    dview[i].x=static_cast<numtyp>(beta[i]);
    dview[i].y=static_cast<numtyp>(powern[i]);
-    dview[i].z=(numtyp)0;
-    dview[i].w=(numtyp)0;
+    dview[i].z=static_cast<numtyp>(lam2[i]);
+    dview[i].w=static_cast<numtyp>(bigb[i]);
  }

  ucl_copy(ts5,dview,false);
-  ts5_tex.get_texture(*(this->pair_program),"ts5_tex");
-  ts5_tex.bind_float(ts5,4);
-
-  UCL_H_Vec<numtyp> cutsq_view(nparams,*(this->ucl_device),
-                               UCL_WRITE_ONLY);
-  double cutsqmax = 0.0;
-  for (int i=0; i<nparams; i++) {
-    cutsq_view[i]=static_cast<numtyp>(host_cutsq[i]);
-    if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i];
-  }
-  cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY);
-  ucl_copy(cutsq,cutsq_view,false);
-
-  _cutshortsq = static_cast<numtyp>(cutsqmax);

  UCL_H_Vec<int> dview_elem2param(nelements*nelements*nelements,
                                  *(this->ucl_device), UCL_WRITE_ONLY);
@@ -183,17 +210,17 @@ int TersoffT::init(const int ntypes, const int nlocal, const int nall, const int
  ucl_copy(elem2param,dview_elem2param,false);

-  UCL_H_Vec<int> dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY);
+  UCL_H_Vec<int> dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY);
  for (int i = 0; i < ntypes; i++)
    dview_map[i] = host_map[i];

-  map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY);
+  map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY);
  ucl_copy(map,dview_map,false);

  _allocated=true;
  this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+
-    ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+
-    map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes();
+    ts4.row_bytes()+ts5.row_bytes()+map.row_bytes()+
+    elem2param.row_bytes()+_zetaij.row_bytes()+_zetaij_eng.row_bytes();
  return 0;
}

@@ -208,12 +235,16 @@ void TersoffT::clear() {
  ts3.clear();
  ts4.clear();
  ts5.clear();
-  cutsq.clear();
+  cutsq_pair.clear();
  map.clear();
  elem2param.clear();
  _zetaij.clear();
+  _zetaij_eng.clear();

  k_zeta.clear();
+  #if defined(LAL_OCL_EV_JIT)
+  k_zeta_noev.clear();
+  #endif

  this->clear_atomic();
}

@@ -229,75 +260,60 @@ double TersoffT::host_memory_usage() const {
// Calculate energies, forces, and torques
// ---------------------------------------------------------------------------
template <class numtyp, class acctyp>
-void TersoffT::loop(const bool _eflag, const bool
_vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + _zetaij_eng.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + success = success && (_zetaij_eng.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); + // build the short neighbor list + int ainum=this->_ainum; + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + GX=static_cast(ceil(static_cast(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts3, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + &_zetaij_eng, &this->nbor->dev_nbor, &eflag, &this->_ainum, + &nbor_pitch, &this->_threads_per_atom); ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); - BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &evatom); + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts4, &map, + &elem2param, &_nelements, 
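/* Why ts4 now stores c*c and d*d (see the repacking above): the standard
   Tersoff angular term, evaluated once per (i,j,k) triple on the device, is
       g(theta) = gamma * (1 + c^2/d^2 - c^2 / (d^2 + (h - cos(theta))^2))
   so squaring c and d once on the host removes two multiplies from every
   triple. Function and field names below are illustrative. */
numtyp gijk(const numtyp costheta, const numtyp4 ts4_p) {
  const numtyp c2 = ts4_p.x;                 // c*c, precomputed host-side
  const numtyp d2 = ts4_p.y;                 // d*d, precomputed host-side
  const numtyp hcth = ts4_p.z - costheta;    // h - cos(theta)
  return ts4_p.w * ((numtyp)1.0 + c2/d2 - c2 / (d2 + hcth*hcth));
}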
&_nparams, &_zetaij, + &_zetaij_eng, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, + &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &evatom); Answer *end_ans; #ifdef THREE_CONCURRENT @@ -307,24 +323,34 @@ void TersoffT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts4, &map, &elem2param, + &_nelements, &_nparams, &_zetaij, &_zetaij_eng, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts4, &map, &elem2param, + &_nelements, &_nparams, &_zetaij, &_zetaij_eng, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts2, &map, &elem2param, &_nelements, + &_nparams, &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class Tersoff; diff --git a/lib/gpu/lal_tersoff.cu b/lib/gpu/lal_tersoff.cu index b08fddfd6e..03ce68be77 100644 --- a/lib/gpu/lal_tersoff.cu +++ b/lib/gpu/lal_tersoff.cu @@ -18,99 +18,28 @@ #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); -_texture( ts1_tex,float4); -_texture( ts2_tex,float4); -_texture( ts3_tex,float4); -_texture( ts4_tex,float4); -_texture( ts5_tex,float4); #else _texture_2d( pos_tex,int4); -_texture( ts1_tex,int4); -_texture( ts2_tex,int4); -_texture( ts3_tex,int4); -_texture( ts4_tex,int4); -_texture( ts5_tex,int4); #endif #else #define pos_tex x_ -#define ts1_tex ts1 -#define ts2_tex ts2 -#define ts3_tex ts3 -#define ts4_tex ts4 -#define ts5_tex ts5 #endif //#define THREE_CONCURRENT #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int 
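// Not part of the patch: a minimal CUDA sketch of the shuffle-based partial
// sum that replaces the shared-memory tree above when SHUFFLE_AVAIL is set.
// Each group of `width` consecutive lanes cooperates on one atom, and lane 0
// of the group ends up holding the group total. Assumes width is a power of
// two and a group never straddles a warp boundary.
__device__ inline float group_sum(float v, int width) {
  for (int s = width / 2; s > 0; s >>= 1)
    v += __shfl_down_sync(0xffffffffu, v, s, width);
  return v;  // valid in lane 0 of each width-wide group
}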
s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -118,36 +47,168 @@ _texture( ts5_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + 
if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } +#endif +#endif +#ifdef LAL_SIMD_IP_SYNC +#define t_per_atom t_per_atom_in +#else +#define t_per_atom 1 #endif __kernel void k_tersoff_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int t_per_atom_in) { + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - numtyp ijkparam_lam3 = ts1_ijkparam.z; - numtyp ijkparam_powermint = ts1_ijkparam.w; - numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - numtyp ijkparam_bigr = ts2_ijkparam.z; - numtyp ijkparam_bigd = ts2_ijkparam.w; - numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - numtyp ijkparam_c = ts4_ijkparam.x; - numtyp ijkparam_d = ts4_ijkparam.y; - numtyp ijkparam_h = ts4_ijkparam.z; - numtyp ijkparam_gamma = ts4_ijkparam.w; - z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, ijkparam_bigd, - ijkparam_c, ijkparam_d, ijkparam_h, ijkparam_gamma, - rsq1, rsq2, delr1, delr2); + #ifndef ONETYPE + const numtyp4 ts1_ijkparam = ts1[ijkparam]; + const numtyp ijkparam_lam3 = ts1_ijkparam.x; + const int ijkparam_powermint = ts1_ijkparam.y; + const numtyp ijkparam_bigr = ts1_ijkparam.z; + const numtyp ijkparam_bigd = ts1_ijkparam.w; + const numtyp4 ts4_ijkparam = ts4[ijkparam]; + const numtyp ijkparam_c = ts4_ijkparam.x; + const numtyp ijkparam_d = ts4_ijkparam.y; + const numtyp ijkparam_h = ts4_ijkparam.z; + const numtyp ijkparam_gamma = ts4_ijkparam.w; + #endif + z += zeta(ijkparam_powermint, ijkparam_lam3, ijkparam_bigr, + ijkparam_bigd, ijkparam_c, ijkparam_d, ijkparam_h, + ijkparam_gamma, r1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); - numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); - numtyp ijparam_lam2 = ts1_ijparam.y; - numtyp4 ts2_ijparam = ts2[ijparam]; //fetch4(ts2_ijparam,ijparam,ts2_tex); - numtyp ijparam_bigb = ts2_ijparam.y; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; - numtyp4 ts3_ijparam = ts3[ijparam]; //fetch4(ts3_ijparam,ijparam,ts3_tex); - numtyp ijparam_c1 = ts3_ijparam.x; - numtyp ijparam_c2 = ts3_ijparam.y; - numtyp ijparam_c3 = ts3_ijparam.z; - numtyp ijparam_c4 = ts3_ijparam.w; - numtyp4 ts5_ijparam = ts5[ijparam]; //fetch4(ts5_ijparam,ijparam,ts5_tex); - numtyp ijparam_beta = ts5_ijparam.x; - numtyp ijparam_powern = ts5_ijparam.y; + #ifndef 
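// Not part of the patch: the engv layout implied by the ev_stride indexing
// above (an assumption drawn from the ei += ev_stride increments, not a
// documented API). Column k of {energy, v0..v5} for slot i appears to live
// at engv[k * ev_stride + i], so a host-side total is a strided column sum:
double total_energy(const double *engv, int nslots) {
  double e = 0.0;
  for (int i = 0; i < nslots; i++) e += engv[i];  // energy column starts at 0
  return e;
}

double total_virial_xx(const double *engv, int nslots, int ev_stride) {
  double v = 0.0;
  for (int i = 0; i < nslots; i++) v += engv[ev_stride + i];  // first virial column
  return v;
}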
ONETYPE + const numtyp ijparam_bigr = ts1[ijparam].z; + const numtyp ijparam_bigd = ts1[ijparam].w; + const numtyp4 ts3_ijparam = ts3[ijparam]; + const numtyp ijparam_c1 = ts3_ijparam.x; + const numtyp ijparam_c2 = ts3_ijparam.y; + const numtyp ijparam_c3 = ts3_ijparam.z; + const numtyp ijparam_c4 = ts3_ijparam.w; + const numtyp4 ts5_ijparam = ts5[ijparam]; + const numtyp ijparam_beta = ts5_ijparam.x; + const numtyp ijparam_powern = ts5_ijparam.y; + const numtyp ijparam_lam2 = ts5_ijparam.z; + const numtyp ijparam_bigb = ts5_ijparam.w; + #else + const numtyp ijparam_bigr = ijkparam_bigr; + const numtyp ijparam_bigd = ijkparam_bigd; + #endif if (offset_k == 0) { numtyp fpfeng[4]; force_zeta(ijparam_bigb, ijparam_bigr, ijparam_bigd, ijparam_lam2, - ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, ijparam_c3, - ijparam_c4, rsq1, z, eflag, fpfeng); - acctyp4 zij; + ijparam_beta, ijparam_powern, ijparam_c1, ijparam_c2, + ijparam_c3, ijparam_c4, r1, z, eflag, fpfeng); + acctyp2 zij; zij.x = fpfeng[0]; zij.y = fpfeng[1]; - zij.z = fpfeng[2]; - zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; + if (EVFLAG && eflag) zetaij_eng[nbor_j-2*nbor_pitch] = fpfeng[2]; } - } // for nbor } // if ii } __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, - const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom_in, + const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); - __local numtyp4 ts1[SHARED_SIZE]; + local_allocate_store_pair(); + + #ifndef ONETYPE __local numtyp4 ts2[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; + #ifndef ONETYPE + numtyp4 ts2_ijparam = ts2[ijparam]; + const numtyp ijparam_biga = ts2_ijparam.x; + const numtyp ijparam_lam1 = ts2_ijparam.y; + const numtyp ijparam_bigr = ts2_ijparam.z; + const numtyp ijparam_bigd = ts2_ijparam.w; + #endif numtyp feng[2]; - numtyp ijparam_lam1 = ts1[ijparam].x; - numtyp4 ts2_ijparam = ts2[ijparam]; - numtyp ijparam_biga = ts2_ijparam.x; - numtyp ijparam_bigr = ts2_ijparam.z; - numtyp ijparam_bigd = ts2_ijparam.w; repulsive(ijparam_bigr, ijparam_bigd, ijparam_lam1, ijparam_biga, rsq, eflag, feng); @@ -469,9 +538,9 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -480,86 +549,85 @@ __kernel void k_tersoff_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, - const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp 
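// Not part of the patch: a reduced illustration of the ONETYPE scheme used in
// the #ifndef/#else blocks above. When the potential has a single parameter
// set, the build defines ONETYPE to that index and the per-triple table
// gather compiles away. The table names follow the file; the helper itself
// is hypothetical.
template <class V>
inline auto fetch_lam3(const V *ts1, const int *elem2param,
                       int nelements, int itype, int jtype, int ktype) {
#ifdef ONETYPE
  (void)elem2param; (void)nelements; (void)itype; (void)jtype; (void)ktype;
  return ts1[ONETYPE].x;   // constant index: no gather in the inner loop
#else
  const int ijkparam = elem2param[itype * nelements * nelements +
                                  jtype * nelements + ktype];
  return ts1[ijkparam].x;  // generic multi-element lookup
#endif
}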
*restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global acctyp4 *restrict zetaij, + const __global acctyp2 *restrict zetaij, + const __global acctyp *restrict zetaij_e, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; + const int t_per_atom_in, + const int evatom) { + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + + #ifndef ONETYPE __local numtyp4 ts1[SHARED_SIZE]; - __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp2 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { - energy+=zeta_ij.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[nbor_j-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -597,48 +661,45 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; + #ifndef ONETYPE + const numtyp4 ts1_ijkparam = ts1[ijkparam]; + const numtyp lam3 = ts1_ijkparam.x; + const int powermint = ts1_ijkparam.y; + const numtyp bigr = ts1_ijkparam.z; + const numtyp bigd = ts1_ijkparam.w; + const numtyp4 ts4_ijkparam = ts4[ijkparam]; + const numtyp c = ts4_ijkparam.x; + const numtyp d = ts4_ijkparam.y; + const numtyp h = ts4_ijkparam.z; + const numtyp gamma = ts4_ijkparam.w; + #endif numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); numtyp fi[3], fj[3], fk[3]; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); - lam3 = ts1_ijkparam.z; - powermint = ts1_ijkparam.w; - numtyp4 ts2_ijkparam = ts2[ijkparam]; //fetch4(ts2_ijkparam,ijkparam,ts2_tex); - bigr = ts2_ijkparam.z; - bigd = ts2_ijkparam.w; - numtyp4 ts4_ijkparam = ts4[ijkparam]; //fetch4(ts4_ijkparam,ijkparam,ts4_tex); - c = ts4_ijkparam.x; - d = ts4_ijkparam.y; - h = ts4_ijkparam.z; - gamma = ts4_ijkparam.w; - if (vflag>0) - attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); + if (EVFLAG && vflag) + attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, + r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else attractive_fi(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi); @@ -646,7 +707,7 @@ __kernel void 
k_tersoff_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -662,87 +723,90 @@ __kernel void k_tersoff_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, - const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, - const __global acctyp4 *restrict zetaij, + const __global acctyp2 *restrict zetaij, + const __global acctyp *restrict zetaij_e, const __global int * dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); - numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; + const int t_per_atom_in, + const int gpu_nbor) { + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + + #ifndef ONETYPE __local numtyp4 ts1[SHARED_SIZE]; - __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; if (tid0) { - energy+=zeta_ji.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[ijnum-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -823,62 +877,62 @@ __kernel void k_tersoff_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { - energy+=zeta_ji.z*tpainv; + if (EVFLAG && eflag) { + energy+=zetaij_e[ijnum-2*nbor_pitch]*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1052,41 +1099,44 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); - numtyp fi[3], fj[3], fk[3]; - numtyp4 ts1_param, ts2_param, ts4_param; - ts1_param = ts1[jikparam]; //fetch4(ts1_jikparam,jikparam,ts1_tex); - lam3 = ts1_param.z; - powermint = ts1_param.w; - ts2_param = ts2[jikparam]; //fetch4(ts2_jikparam,jikparam,ts2_tex); - bigr = ts2_param.z; - bigd = ts2_param.w; - ts4_param = ts4[jikparam]; //fetch4(ts4_jikparam,jikparam,ts4_tex); - c = ts4_param.x; - d = ts4_param.y; - h = ts4_param.z; - gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, fk); + prefactor_ji, r1, r1inv, r2, r2inv, mdelr1, delr2, fi, fj, + fk); f.x += fj[0]; f.y += fj[1]; f.z += fj[2]; @@ -1098,26 +1148,25 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] 
+= TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); - numtyp prefactor_jk = zeta_jk.y; + numtyp prefactor_jk = zetaij[nbor_k-2*nbor_pitch].y; - int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; - ts1_param = ts1[jkiparam]; //fetch4(ts1_jkiparam,jkiparam,ts1_tex); - lam3 = ts1_param.z; - powermint = ts1_param.w; - ts2_param = ts2[jkiparam]; //fetch4(ts2_jkiparam,jkiparam,ts2_tex); - bigr = ts2_param.z; - bigd = ts2_param.w; + #ifndef ONETYPE + int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+ + itype]; + ts1_param = ts1[jkiparam]; + lam3 = ts1_param.x; + powermint = ts1_param.y; + bigr = ts1_param.z; + bigd = ts1_param.w; ts4_param = ts4[jkiparam]; //fetch4(ts4_jkiparam,jkiparam,ts4_tex); c = ts4_param.x; d = ts4_param.y; h = ts4_param.z; gamma = ts4_param.w; attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, - prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, fk); + prefactor_jk, r2, r2inv, r1, r1inv, delr2, mdelr1, fi, fj, + fk); + #endif f.x += fk[0]; f.y += fk[1]; f.z += fk[2]; @@ -1130,14 +1179,13 @@ __kernel void k_tersoff_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff.h b/lib/gpu/lal_tersoff.h index 51e64c987b..8f99569162 100644 --- a/lib/gpu/lal_tersoff.h +++ b/lib/gpu/lal_tersoff.h @@ -59,41 +59,36 @@ class Tersoff : public BaseThree { // --------------------------- TYPE DATA -------------------------- - /// If atom type constants fit in shared memory, use fast kernels - bool shared_types; - /// Number of atom types - int _lj_types; + int _ntypes; - /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint + /// ts1.x = lam3, ts1.y = powermint, ts1.z = c3, ts1.w = c4 UCL_D_Vec ts1; - /// ts2.x = biga, ts2.y = bigb, ts2.z = bigr, ts2.w = bigd + /// ts2.x = biga, ts2.y = lam1, ts2.z = bigr, ts2.w = bigd UCL_D_Vec ts2; /// ts3.x = c1, ts3.y = c2, ts3.z = c3, ts3.w = c4 UCL_D_Vec ts3; - /// ts4.x = c, ts4.y = d, ts4.z = h, ts4.w = gamma + /// ts4.x = c*c, ts4.y = d*d, ts4.z = h, ts4.w = gamma UCL_D_Vec ts4; - /// ts5.x = beta, ts5.y = powern + /// ts5.x = beta, ts5.y = powern, ts5.z = lam2, ts5.w = bigb UCL_D_Vec ts5; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; /// Per-atom arrays: - /// zetaij.x = force, zetaij.y = prefactor, zetaij.z = evdwl, - /// zetaij.w = zetaij - UCL_D_Vec _zetaij; + /// zetaij.x = force, zetaij.y = prefactor + UCL_D_Vec _zetaij; + UCL_D_Vec _zetaij_eng; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - numtyp _cutshortsq; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff 
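// Not part of the patch: a back-of-envelope sketch of why the header above
// splits zeta_ij storage. Force and prefactor are read by every three-body
// kernel, while the per-pair energy is only read when eflag is set, so one
// 4-wide vector per neighbor becomes a 2-wide vector plus an optional scalar
// array. Counts below assume single-precision acctyp and a made-up pair count.
#include <cstdio>

int main() {
  const long n_pairs = 1000000L;             // hypothetical neighbor count
  const long before = n_pairs * 4 * 4;       // acctyp4: 16 B per pair, always
  const long after_force = n_pairs * 2 * 4;  // acctyp2: 8 B per pair, always
  const long after_eng = n_pairs * 4;        // scalar energy, eflag passes only
  std::printf("force pass: %ld -> %ld bytes (+%ld only when eflag)\n",
              before, after_force, after_eng);
  return 0;
}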
--git a/lib/gpu/lal_tersoff_ext.cpp b/lib/gpu/lal_tersoff_ext.cpp index 749842864f..ac700d014a 100644 --- a/lib/gpu/lal_tersoff_ext.cpp +++ b/lib/gpu/lal_tersoff_ext.cpp @@ -63,7 +63,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int int init_ok=0; if (world_me==0) - init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -84,7 +84,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -99,7 +99,7 @@ int tersoff_gpu_init(const int ntypes, const int inum, const int nall, const int fprintf(screen,"\n"); if (init_ok==0) - TSMF.estimate_gpu_overhead(); + TSMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_tersoff_extra.h b/lib/gpu/lal_tersoff_extra.h index 7ee29751b7..da2568aa1b 100644 --- a/lib/gpu/lal_tersoff_extra.h +++ b/lib/gpu/lal_tersoff_extra.h @@ -55,11 +55,9 @@ ucl_inline numtyp ters_gijk(const numtyp costheta, const numtyp param_h, const numtyp param_gamma) { - const numtyp ters_c = param_c * param_c; - const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - return param_gamma*((numtyp)1.0 + ters_c*ucl_recip(ters_d) - - ters_c *ucl_recip(ters_d + hcth*hcth)); + return param_gamma*((numtyp)1.0 + param_c*ucl_recip(param_d) - + param_c *ucl_recip(param_d + hcth*hcth)); } /* ---------------------------------------------------------------------- */ @@ -68,19 +66,20 @@ ucl_inline numtyp ters_gijk_d(const numtyp costheta, const numtyp param_c, const numtyp param_d, const numtyp param_h, - const numtyp param_gamma) + const numtyp param_gamma, + numtyp *ans_d) { - const numtyp ters_c = param_c * param_c; - const numtyp ters_d = param_d * param_d; const numtyp hcth = param_h - costheta; - const numtyp numerator = (numtyp)-2.0 * ters_c * hcth; - const numtyp denominator = ucl_recip(ters_d + hcth*hcth); - return param_gamma*numerator*denominator*denominator; + const numtyp idhh=ucl_recip(param_d + hcth*hcth); + const numtyp numerator = (numtyp)-2.0 * param_c * hcth; + *ans_d=param_gamma*numerator*idhh*idhh; + return param_gamma*((numtyp)1.0+param_c*ucl_recip(param_d)-param_c*idhh); } /* ---------------------------------------------------------------------- */ -ucl_inline void costheta_d(const numtyp rij_hat[3], +ucl_inline void costheta_d(const numtyp cos_theta, + const numtyp rij_hat[3], const numtyp rij, const numtyp rik_hat[3], const numtyp rik, @@ -89,9 +88,6 @@ ucl_inline void costheta_d(const numtyp rij_hat[3], numtyp *drk) { // first element is derivative wrt Ri, second wrt Rj, third wrt Rk - - numtyp cos_theta = vec3_dot(rij_hat,rik_hat); - vec3_scaleadd(-cos_theta,rij_hat,rik_hat,drj); vec3_scale(ucl_recip(rij),drj,drj); vec3_scaleadd(-cos_theta,rik_hat,rij_hat,drk); @@ -107,7 +103,9 @@ ucl_inline numtyp ters_fc(const numtyp r, const numtyp param_bigd) { if (r < param_bigr-param_bigd) return (numtyp)1.0; + #ifndef ONETYPE if (r > param_bigr+param_bigd) return (numtyp)0.0; + #endif return (numtyp)0.5*((numtyp)1.0 - 
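// For reference, restating the angular functions above in standard Tersoff
// notation (param_c and param_d now carry c^2 and d^2 after the repacking):
//
//   g(\theta)              = \gamma \left[ 1 + \frac{c^2}{d^2}
//                              - \frac{c^2}{d^2 + (h - \cos\theta)^2} \right]
//   \frac{dg}{d\cos\theta} = \frac{-2\,\gamma\,c^2\,(h - \cos\theta)}
//                                 {\left[ d^2 + (h - \cos\theta)^2 \right]^2}
//
// The fused ters_gijk_d() returns g and writes dg/dcos(theta) through ans_d,
// sharing the reciprocal 1/(d^2 + (h - cos theta)^2) between both expressions.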
sin(MY_PI2*(r - param_bigr)/param_bigd)); } @@ -115,24 +113,23 @@ ucl_inline numtyp ters_fc(const numtyp r, ucl_inline numtyp ters_fc_d(const numtyp r, const numtyp param_bigr, - const numtyp param_bigd) + const numtyp param_bigd, + numtyp *ans_d) { - if (r < param_bigr-param_bigd) return (numtyp)0.0; - if (r > param_bigr+param_bigd) return (numtyp)0.0; - return -(MY_PI4/param_bigd) * cos(MY_PI2*(r - param_bigr)/param_bigd); -} - -/* ---------------------------------------------------------------------- */ - -ucl_inline numtyp ters_fa(const numtyp r, - const numtyp param_bigb, - const numtyp param_bigr, - const numtyp param_bigd, - const numtyp param_lam2) -{ - if (r > param_bigr + param_bigd) return (numtyp)0.0; - return -param_bigb * ucl_exp(-param_lam2 * r) * - ters_fc(r,param_bigr,param_bigd); + if (r < param_bigr-param_bigd) { + *ans_d=(numtyp)0.0; + return (numtyp)1.0; + } + #ifndef ONETYPE + if (r > param_bigr+param_bigd) { + *ans_d=(numtyp)0.0; + return (numtyp)0.0; + } + #endif + const numtyp ibigd = ucl_recip(param_bigd); + const numtyp angle = MY_PI2*(r - param_bigr)*ibigd; + *ans_d=-(MY_PI4*ibigd) * cos(angle); + return (numtyp)0.5*((numtyp)1.0 - sin(angle)); } /* ---------------------------------------------------------------------- */ @@ -141,33 +138,17 @@ ucl_inline numtyp ters_fa_d(const numtyp r, const numtyp param_bigb, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_lam2) + const numtyp param_lam2, + numtyp *ans_d) { + #ifndef ONETYPE if (r > param_bigr + param_bigd) return (numtyp)0.0; - return param_bigb * ucl_exp(-param_lam2 * r) * (param_lam2 * - ters_fc(r,param_bigr,param_bigd) - ters_fc_d(r,param_bigr,param_bigd)); -} - -/* ---------------------------------------------------------------------- */ - -ucl_inline numtyp ters_bij(const numtyp zeta, - const numtyp param_beta, - const numtyp param_powern, - const numtyp param_c1, - const numtyp param_c2, - const numtyp param_c3, - const numtyp param_c4) -{ - numtyp tmp = param_beta * zeta; - if (tmp > param_c1) return ucl_rsqrt(tmp); - if (tmp > param_c2) - return ((numtyp)1.0 - ucl_powr(tmp,-param_powern) / - ((numtyp)2.0*param_powern))*ucl_rsqrt(tmp); - if (tmp < param_c4) return (numtyp)1.0; - if (tmp < param_c3) - return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); - return ucl_powr((numtyp)1.0 + ucl_powr(tmp,param_powern), - (numtyp)-1.0/((numtyp)2.0*param_powern)); + #endif + numtyp dfc; + const numtyp fc=ters_fc_d(r,param_bigr,param_bigd,&dfc); + const numtyp blr = param_bigb * ucl_exp(-param_lam2 * r); + *ans_d = blr * (param_lam2 * fc - dfc); + return -blr * fc; } /* ---------------------------------------------------------------------- */ @@ -178,24 +159,35 @@ ucl_inline numtyp ters_bij_d(const numtyp zeta, const numtyp param_c1, const numtyp param_c2, const numtyp param_c3, - const numtyp param_c4) + const numtyp param_c4, + numtyp *ans_d) { - numtyp tmp = param_beta * zeta; - if (tmp > param_c1) - return param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); - if (tmp > param_c2) - return param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * - // error in negligible 2nd term fixed 9/30/2015 - // (1.0 - 0.5*(1.0 + 1.0/(2.0*param->powern)) * - ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 /((numtyp)2.0 * param_powern)) * - ucl_powr(tmp,-param_powern))); - if (tmp < param_c4) return (numtyp)0.0; - if (tmp < param_c3) - return (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); - - numtyp tmp_n = ucl_powr(tmp,param_powern); - return (numtyp)-0.5 * 
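// Not part of the patch: a minimal scalar sketch of the fused smooth cutoff
// above. One call now returns f_c(r) and writes f_c'(r), so callers that
// previously invoked ters_fc() and ters_fc_d() back to back evaluate the
// trigonometry once. MY_PI2/MY_PI4 are taken to be pi/2 and pi/4; the
// r > R + D early-out sits under #ifndef ONETYPE above, presumably because
// the short neighbor list already guarantees r is inside the cutoff there.
#include <cmath>

inline double fc_and_deriv(double r, double R, double D, double *dfc) {
  if (r < R - D) { *dfc = 0.0; return 1.0; }   // fully inside: full strength
  if (r > R + D) { *dfc = 0.0; return 0.0; }   // fully outside: cut off
  const double iD = 1.0 / D;
  const double a  = (M_PI / 2.0) * (r - R) * iD;
  *dfc = -(M_PI / 4.0) * iD * std::cos(a);
  return 0.5 * (1.0 - std::sin(a));
}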
ucl_powr((numtyp)1.0+tmp_n, (numtyp) - - (numtyp)1.0-((numtyp)1.0 / ((numtyp)2.0 * param_powern)))*tmp_n / zeta; + const numtyp tmp = param_beta * zeta; + if (tmp > param_c1) { + *ans_d = param_beta * (numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5); + return ucl_rsqrt(tmp); + } + if (tmp > param_c2) { + const numtyp ptmp = ucl_powr(tmp,-param_powern); + const numtyp i2n = ucl_recip((numtyp)2.0 * param_powern); + *ans_d = param_beta * ((numtyp)-0.5*ucl_powr(tmp,(numtyp)-1.5) * + ((numtyp)1.0 - ((numtyp)1.0 + (numtyp)1.0 * i2n) * + ptmp)); + return ((numtyp)1.0 - ptmp * i2n)*ucl_rsqrt(tmp); + } + if (tmp < param_c4) { + *ans_d = (numtyp)0.0; + return (numtyp)1.0; + } + if (tmp < param_c3) { + *ans_d = (numtyp)-0.5*param_beta * ucl_powr(tmp,param_powern-(numtyp)1.0); + return (numtyp)1.0 - ucl_powr(tmp,param_powern)/((numtyp)2.0*param_powern); + } + const numtyp tmp_n = (numtyp)1.0+ucl_powr(tmp,param_powern); + const numtyp i2n = -ucl_recip((numtyp)2.0*param_powern); + *ans_d = (numtyp)-0.5*ucl_powr(tmp_n,(numtyp)-1.0+i2n)*(tmp_n-(numtyp)1.0)/ + zeta; + return ucl_powr(tmp_n, i2n); } /* ---------------------------------------------------------------------- */ @@ -207,7 +199,7 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -220,25 +212,23 @@ ucl_inline void ters_zetaterm_d(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; @@ -277,7 +267,7 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -288,25 +278,23 @@ ucl_inline void ters_zetaterm_d_fi(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if 
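// For reference, the bond order implemented by the fused ters_bij_d() above
// and its zeta-derivative (general branch; the c1..c4 cases are the usual
// series shortcuts for very large or very small beta*zeta):
//
//   b_{ij} = \bigl( 1 + (\beta\zeta_{ij})^{n} \bigr)^{-1/(2n)}
//   \frac{\partial b_{ij}}{\partial \zeta_{ij}}
//        = -\tfrac{1}{2}\,\bigl( 1 + (\beta\zeta_{ij})^{n} \bigr)^{-1-1/(2n)}
//          \,\frac{(\beta\zeta_{ij})^{n}}{\zeta_{ij}}
//
// which matches ans_d = -0.5 * tmp_n^(-1 + i2n) * (tmp_n - 1) / zeta with
// tmp_n = 1 + (beta*zeta)^n and i2n = -1/(2n) in the code.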
(param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Ri // dri = -dfc*gijk*ex_delr*rik_hat; @@ -327,7 +315,7 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -341,21 +329,20 @@ ucl_inline void ters_zetaterm_d_fj(const numtyp prefactor, fc = ters_fc(rik,param_bigr,param_bigd); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Rj // drj = fc*gijk_d*ex_delr*dcosdrj; @@ -373,7 +360,7 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, const numtyp rik, const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -384,25 +371,23 @@ ucl_inline void ters_zetaterm_d_fk(const numtyp prefactor, numtyp gijk,gijk_d,ex_delr,ex_delr_d,fc,dfc,cos_theta,tmp; numtyp dcosdri[3],dcosdrj[3],dcosdrk[3]; - fc = ters_fc(rik,param_bigr,param_bigd); - dfc = ters_fc_d(rik,param_bigr,param_bigd); + fc = ters_fc_d(rik,param_bigr,param_bigd,&dfc); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) tmp = t*t*t; + if (param_powermint == 3) tmp = t*t*t; else tmp = t; if (tmp > (numtyp)69.0776) ex_delr = (numtyp)1.e30; else if (tmp < (numtyp)-69.0776) ex_delr = (numtyp)0.0; else ex_delr = ucl_exp(tmp); - if ((int)param_powermint == 3) + if (param_powermint == 3) ex_delr_d = (numtyp)3.0*param_lam3*t*t*ex_delr; else ex_delr_d = param_lam3 * ex_delr; cos_theta = vec3_dot(rij_hat,rik_hat); - gijk = ters_gijk(cos_theta,param_c,param_d,param_h,param_gamma); - gijk_d = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma); - costheta_d(rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); + gijk = ters_gijk_d(cos_theta,param_c,param_d,param_h,param_gamma,&gijk_d); + costheta_d(cos_theta,rij_hat,rij,rik_hat,rik,dcosdri,dcosdrj,dcosdrk); // compute the derivative wrt Rk // drk = dfc*gijk*ex_delr*rik_hat; @@ -427,18 +412,17 @@ ucl_inline void repulsive(const numtyp param_bigr, { numtyp r,tmp_fc,tmp_fc_d,tmp_exp; r = ucl_sqrt(rsq); - tmp_fc = ters_fc(r,param_bigr,param_bigd); - tmp_fc_d = ters_fc_d(r,param_bigr,param_bigd); - tmp_exp = 
ucl_exp(-param_lam1 * r); + tmp_fc = ters_fc_d(r,param_bigr,param_bigd,&tmp_fc_d); + tmp_exp = param_biga * ucl_exp(-param_lam1 * r); // fforce - ans[0] = -param_biga*tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); + ans[0] = -tmp_exp*(tmp_fc_d - tmp_fc*param_lam1)*ucl_recip(r); // eng - if (eflag) ans[1] = tmp_fc * param_biga * tmp_exp; + if (EVFLAG && eflag) ans[1] = tmp_fc * tmp_exp; } /* ---------------------------------------------------------------------- */ -ucl_inline numtyp zeta(const numtyp param_powermint, +ucl_inline numtyp zeta(const int param_powermint, const numtyp param_lam3, const numtyp param_bigr, const numtyp param_bigd, @@ -446,20 +430,19 @@ ucl_inline numtyp zeta(const numtyp param_powermint, const numtyp param_d, const numtyp param_h, const numtyp param_gamma, - const numtyp rsqij, + const numtyp rij, const numtyp rsqik, const numtyp4 delrij, const numtyp4 delrik) { - numtyp rij,rik,costheta,arg,ex_delr; + numtyp rik,costheta,arg,ex_delr; - rij = ucl_sqrt(rsqij); rik = ucl_sqrt(rsqik); costheta = (delrij.x*delrik.x + delrij.y*delrik.y + delrij.z*delrik.z) / (rij*rik); numtyp t = param_lam3*(rij-rik); - if ((int)param_powermint == 3) arg = t*t*t; + if (param_powermint == 3) arg = t*t*t; else arg = t; if (arg > (numtyp)69.0776) ex_delr = (numtyp)1.e30; @@ -482,22 +465,19 @@ ucl_inline void force_zeta(const numtyp param_bigb, const numtyp param_c2, const numtyp param_c3, const numtyp param_c4, - const numtyp rsq, + const numtyp r, const numtyp zeta_ij, const int eflag, numtyp fpfeng[4]) { - numtyp r,fa,fa_d,bij; + numtyp fa,fa_d,bij,bij_d; - r = ucl_sqrt(rsq); - fa = ters_fa(r,param_bigb,param_bigr,param_bigd,param_lam2); - fa_d = ters_fa_d(r,param_bigb,param_bigr,param_bigd,param_lam2); - bij = ters_bij(zeta_ij,param_beta,param_powern, - param_c1,param_c2, param_c3, param_c4); - fpfeng[0] = (numtyp)0.5*bij*fa_d * ucl_recip(r); // fforce - fpfeng[1] = (numtyp)-0.5*fa * ters_bij_d(zeta_ij,param_beta, param_powern, - param_c1,param_c2, param_c3, param_c4); // prefactor - if (eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng + fa = ters_fa_d(r,param_bigb,param_bigr,param_bigd,param_lam2,&fa_d); + bij = ters_bij_d(zeta_ij,param_beta,param_powern, + param_c1,param_c2, param_c3, param_c4, &bij_d); + fpfeng[0] = (numtyp)0.5*bij*fa_d*ucl_recip(r); // fforce + fpfeng[1] = (numtyp)-0.5*fa*bij_d; // prefactor + if (EVFLAG && eflag) fpfeng[2] = (numtyp)0.5*bij*fa; // eng } /* ---------------------------------------------------------------------- @@ -508,7 +488,7 @@ ucl_inline void force_zeta(const numtyp param_bigb, ucl_inline void attractive(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -535,7 +515,7 @@ ucl_inline void attractive(const numtyp param_bigr, ucl_inline void attractive_fi(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -560,7 +540,7 @@ ucl_inline void attractive_fi(const numtyp param_bigr, ucl_inline void attractive_fj(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -585,7 +565,7 @@ ucl_inline void attractive_fj(const numtyp param_bigr, ucl_inline void attractive_fk(const numtyp param_bigr, const numtyp param_bigd, - const numtyp param_powermint, + const int 
param_powermint, const numtyp param_lam3, const numtyp param_c, const numtyp param_d, @@ -610,5 +590,3 @@ ucl_inline void attractive_fk(const numtyp param_bigr, #endif - - diff --git a/lib/gpu/lal_tersoff_mod.cpp b/lib/gpu/lal_tersoff_mod.cpp index 2b56991cc6..b7b0fff1b9 100644 --- a/lib/gpu/lal_tersoff_mod.cpp +++ b/lib/gpu/lal_tersoff_mod.cpp @@ -39,7 +39,7 @@ TersoffMT::~TersoffMod() { template int TersoffMT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -52,34 +52,78 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in const double* c5, const double* h, const double* beta, const double* powern, const double* powern_del, const double* ca1, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_mod,"k_tersoff_mod_repulsive", - "k_tersoff_mod_three_center", "k_tersoff_mod_three_end", - "k_tersoff_mod_short_nbor"); + "k_tersoff_mod_three_center", + "k_tersoff_mod_three_end", + "k_tersoff_mod_short_nbor",onetype,onetype3,0,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_mod_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_mod_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes=ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -101,8 +145,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -114,8 +156,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -127,8 +167,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -140,8 +178,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts4,dview,false); - 
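// Not part of the patch: a host-side sketch of how the cutsq_pair table in
// init() above is assembled, reconstructed from the loop fragments (the
// template arguments and comparison operators were mangled in this diff).
// For each atom-type pair the largest squared cutoff over all parameter sets
// involving the mapped elements is kept; this is the value the short-neighbor
// kernel tests against per pair instead of one global _cutshortsq.
#include <vector>
#include <algorithm>

std::vector<double> build_cutsq_pair(const int *map, const int *elem2param,
                                     const double *host_cutsq,
                                     int ntypes, int nelements) {
  std::vector<double> cut(ntypes * ntypes, 0.0);
  for (int i = 1; i < ntypes; i++) {          // LAMMPS types are 1-based
    for (int j = 1; j < ntypes; j++) {
      if (map[i] < 0 || map[j] < 0) continue; // unmapped types (assumption)
      for (int k = 0; k < nelements; k++) {
        const int p = elem2param[map[i] * nelements * nelements +
                                 map[j] * nelements + k];
        cut[i * ntypes + j] = std::max(cut[i * ntypes + j], host_cutsq[p]);
      }
    }
  }
  return cut;
}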
ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -153,20 +189,6 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -183,17 +205,16 @@ int TersoffMT::init(const int ntypes, const int nlocal, const int nall, const in ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); return 0; } @@ -208,12 +229,15 @@ void TersoffMT::clear() { ts3.clear(); ts4.clear(); ts5.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -229,74 +253,54 @@ double TersoffMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffMT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); + // build the short neighbor list + int ainum=this->_ainum; + this->time_pair.start(); + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + 
this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + GX=static_cast(ceil(static_cast(this->_ainum)/ (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &cutsq, + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); - - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - - this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + &this->nbor->dev_nbor,&eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -307,24 +311,34 @@ void TersoffMT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &cutsq, + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &ts5, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + 
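// Not part of the patch: the grid-size arithmetic used repeatedly in loop()
// above, pulled out for clarity. BX, JTHREADS, and KTHREADS are as in the
// kernels; the static_cast targets were stripped from this diff and are
// assumed to be int/double as in the rest of the library.
#include <cmath>

inline int grid_size(int nitems, int block_size, int threads_per_item) {
  // each item (atom) gets threads_per_item threads, so one block covers
  // block_size / threads_per_item items
  return static_cast<int>(std::ceil(static_cast<double>(nitems) /
                                    (block_size / threads_per_item)));
}
// zeta and three-body launches use threads_per_item = JTHREADS * KTHREADS;
// the short-neighbor and pair launches use _threads_per_atom (or 1).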
(BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts1, &ts2, &map, &elem2param, + &_nelements, &_nparams, &this->nbor->dev_nbor, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class TersoffMod; diff --git a/lib/gpu/lal_tersoff_mod.cu b/lib/gpu/lal_tersoff_mod.cu index 0f45653264..44b04c6933 100644 --- a/lib/gpu/lal_tersoff_mod.cu +++ b/lib/gpu/lal_tersoff_mod.cu @@ -18,99 +18,28 @@ #ifndef _DOUBLE_DOUBLE _texture( pos_tex,float4); -_texture( ts1_tex,float4); -_texture( ts2_tex,float4); -_texture( ts3_tex,float4); -_texture( ts4_tex,float4); -_texture( ts5_tex,float4); #else _texture_2d( pos_tex,int4); -_texture( ts1_tex,int4); -_texture( ts2_tex,int4); -_texture( ts3_tex,int4); -_texture( ts4_tex,int4); -_texture( ts5_tex,int4); #endif #else #define pos_tex x_ -#define ts1_tex ts1 -#define ts2_tex ts2 -#define ts3_tex ts3 -#define ts4_tex ts4 -#define ts5_tex ts5 #endif //#define THREE_CONCURRENT #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -118,36 +47,168 @@ _texture( ts5_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, 
t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } - +#endif #endif __kernel void k_tersoff_mod_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -348,9 +390,6 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, ijkparam_c5, rsq1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); @@ -376,7 +415,7 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict 
x_, zij.y = fpfeng[1]; zij.z = fpfeng[2]; zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; } } // for nbor @@ -386,22 +425,20 @@ __kernel void k_tersoff_mod_zeta(const __global numtyp4 *restrict x_, __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom, const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); + local_allocate_store_pair(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; if (tid= cutsq[ijparam]) continue; - numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -470,9 +497,9 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -481,11 +508,9 @@ __kernel void k_tersoff_mod_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, @@ -493,26 +518,24 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, const __global numtyp4 *restrict ts5_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -524,46 +547,37 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } numtyp tpainv = ucl_recip((numtyp)t_per_atom); __syncthreads(); if (ii= 
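
/* Every zeta store and lookup now uses the flat index nbor_j - 2*nbor_pitch in
   place of the old dev_short_nbor/idx bookkeeping. A hedged reading: the
   packed neighbor list reserves two pitch-strided header rows per atom (the
   neighbor count and the list start), so dropping them maps a neighbor-list
   position one-to-one onto a slot in _zetaij. The layout is inferred from the
   constant 2*nbor_pitch, not verified against the neighbor-list header: */
__device__ inline int zeta_slot(int nbor_j, int nbor_pitch) {
  return nbor_j - 2 * nbor_pitch;  // skip the two header rows
}
// writer (k_*_zeta):         zetaij[zeta_slot(nbor_j, nbor_pitch)] = zij;
// reader (k_*_three_center): acctyp4 zeta_ij = zetaij[zeta_slot(nbor_j, nbor_pitch)];
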
cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { + if (EVFLAG && eflag) { energy+=zeta_ij.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -601,14 +611,8 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -643,7 +646,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, numtyp4 ts5_ijkparam = ts5[ijkparam]; //fetch4(ts5_ijkparam,ijkparam,ts5_tex); c5 = ts5_ijkparam.x; h = ts5_ijkparam.y; - if (vflag>0) + if (EVFLAG && vflag) attractive(bigr, bigd, powermint, lam3, h, c1, c2, c3, c4, c5, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else @@ -653,7 +656,7 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -669,10 +672,9 @@ __kernel void k_tersoff_mod_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, @@ -680,27 +682,25 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, const __global numtyp4 *restrict ts5_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c1, c2, c3, c4, c5, h; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -712,23 +712,25 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, ts5[tid]=ts5_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, 
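
/* A pattern repeated in every kernel of this patch: energy and virial are now
   declared uninitialized and only touched under EVFLAG, a compile-time 0/1
   constant baked in when the kernel source is built, so the force-only
   variant drops the accumulators and every "if (EVFLAG && ...)" branch as
   dead code. The guarded initialization, isolated from the hunks above: */
acctyp energy, virial[6];
if (EVFLAG) {                    // compiled out entirely when EVFLAG == 0
  energy = (acctyp)0;
  for (int i = 0; i < 6; i++) virial[i] = (acctyp)0;
}
// ...later, inside the neighbor loop:
// if (EVFLAG && eflag) energy   += e_pair;
// if (EVFLAG && vflag) virial[0] += delx*delx*force;  // and so on for [1..5]
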
virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } - __local int ijnum_shared[BLOCK_PAIR]; + #ifdef LAL_SIMD_IP_SYNC + __local int localk[BLOCK_PAIR]; + #endif __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -833,7 +816,7 @@ __kernel void k_tersoff_mod_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1071,7 +1031,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -1120,10 +1078,7 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[nbor_k-2*nbor_pitch]; numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1155,14 +1110,13 @@ __kernel void k_tersoff_mod_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff_mod.h b/lib/gpu/lal_tersoff_mod.h index 29a561c71d..0baa1307cb 100644 --- a/lib/gpu/lal_tersoff_mod.h +++ b/lib/gpu/lal_tersoff_mod.h @@ -63,7 +63,7 @@ class TersoffMod : public BaseThree { bool shared_types; /// Number of atom types - int _lj_types; + int _ntypes; /// ts1.x = lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint UCL_D_Vec ts1; @@ -76,7 +76,7 @@ class TersoffMod : public BaseThree { /// ts5.x = c5, ts5.y = h UCL_D_Vec ts5; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; @@ -87,13 +87,11 @@ class TersoffMod : public BaseThree { /// zetaij.w = zetaij UCL_D_Vec _zetaij; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex; - numtyp _cutshortsq; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff --git a/lib/gpu/lal_tersoff_mod_ext.cpp b/lib/gpu/lal_tersoff_mod_ext.cpp index cce9df8713..cac284fb70 100644 --- a/lib/gpu/lal_tersoff_mod_ext.cpp +++ b/lib/gpu/lal_tersoff_mod_ext.cpp @@ -63,7 +63,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) 
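
/* The loop() signature change in the header above (void with bool flags
   becomes int with int flags plus bool &success) defines a contract shared by
   all styles in this patch: loop() returns the grid size of the launch that
   writes engv — reused later as the ev_stride between energy and virial
   slots — and reports device-allocation failure through success instead of
   aborting. A hedged sketch of a caller; the real call site lives in the
   BaseThree compute path, which is not part of this patch: */
// int loop(const int eflag, const int vflag, const int evatom, bool &success);
bool ok = true;
const int ev_stride = this->loop(eflag, vflag, evatom, ok);
if (!ok) { /* grow buffers or flag the error instead of crashing mid-step */ }
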
- init_ok=TSMMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, ts_c1, ts_c2, @@ -84,7 +84,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSMMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSMMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, ts_c1, ts_c2, @@ -99,7 +99,7 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, fprintf(screen,"\n"); if (init_ok==0) - TSMMF.estimate_gpu_overhead(); + TSMMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_tersoff_zbl.cpp b/lib/gpu/lal_tersoff_zbl.cpp index 7d254d568d..4456712b0a 100644 --- a/lib/gpu/lal_tersoff_zbl.cpp +++ b/lib/gpu/lal_tersoff_zbl.cpp @@ -39,7 +39,7 @@ TersoffZT::~TersoffZBL() { template int TersoffZT::bytes_per_atom(const int max_nbors) const { - return this->bytes_per_atom_atomic(max_nbors); + return this->bytes_per_atom_atomic(max_nbors)+max_nbors*sizeof(acctyp)*4; } template @@ -59,34 +59,78 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, const double global_a_0, const double global_epsilon_0, const double* host_cutsq) { + int oldparam=-1; + int onetype=-1; + int onetype3=0; + int spq=1; + int mtypes=0; + #ifdef USE_OPENCL + for (int ii=1; ii1) onetype=-1; + #endif + int success; success=this->init_three(nlocal,nall,max_nbors,0,cell_size,gpu_split, _screen,tersoff_zbl,"k_tersoff_zbl_repulsive", - "k_tersoff_zbl_three_center", "k_tersoff_zbl_three_end", - "k_tersoff_zbl_short_nbor"); + "k_tersoff_zbl_three_center", + "k_tersoff_zbl_three_end", + "k_tersoff_zbl_short_nbor",onetype,onetype3,0,1); if (success!=0) return success; int ef_nall=nall; if (ef_nall==0) ef_nall=2000; - _zetaij.alloc(ef_nall*max_nbors,*(this->ucl_device),UCL_READ_WRITE); + if (this->nbor->max_nbors()) + _zetaij.alloc(ef_nall*this->nbor->max_nbors(),*(this->ucl_device), + UCL_READ_WRITE); k_zeta.set_function(*(this->pair_program),"k_tersoff_zbl_zeta"); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.set_function(*(this->pair_program_noev),"k_tersoff_zbl_zeta"); + #else + k_zeta_selt = &k_zeta; + #endif - // If atom type constants fit in shared memory use fast kernel - int lj_types=ntypes; - shared_types=false; - int max_shared_types=this->device->max_shared_types(); - if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) { - lj_types=max_shared_types; - shared_types=true; - } - _lj_types=lj_types; - + _ntypes = ntypes; _nparams = nparams; _nelements = nelements; + UCL_H_Vec host_write(ntypes*ntypes,*(this->ucl_device), + UCL_READ_WRITE); + host_write.zero(); + cutsq_pair.alloc(ntypes*ntypes,*(this->ucl_device),UCL_READ_ONLY); + for (int ii=1; iihost_write[ii*ntypes+jj]) + host_write[ii*ntypes+jj]=host_cutsq[ijkparam]; + } + } + } + ucl_copy(cutsq_pair,host_write,ntypes*ntypes); + UCL_H_Vec dview(nparams,*(this->ucl_device), UCL_WRITE_ONLY); @@ -108,8 +152,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts1,dview,false); - ts1_tex.get_texture(*(this->pair_program),"ts1_tex"); - ts1_tex.bind_float(ts1,4); ts2.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ 
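
/* The init() code above fills the new cutsq_pair table on the host: an
   ntypes*ntypes array holding, for each type pair (i,j), the largest cutoff
   squared over every third element k, so the short-neighbor kernel can prune
   with a single lookup per pair. A reconstructed sketch — the flattened
   i*ne*ne + j*ne + k indexing mirrors what the kernels use, but treating
   host_cutsq as indexed by that parameter id is an assumption: */
host_write.zero();
for (int i = 1; i < ntypes; i++)
  for (int j = 1; j < ntypes; j++)
    if (host_map[i] >= 0 && host_map[j] >= 0)
      for (int k = 0; k < nelements; k++) {
        const int p = host_map[i]*nelements*nelements +
                      host_map[j]*nelements + k;
        if (host_cutsq[p] > host_write[i*ntypes + j])
          host_write[i*ntypes + j] = host_cutsq[p];
      }
ucl_copy(cutsq_pair, host_write, ntypes*ntypes);
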
-121,8 +163,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts2,dview,false); - ts2_tex.get_texture(*(this->pair_program),"ts2_tex"); - ts2_tex.bind_float(ts2,4); ts3.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -134,8 +174,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts3,dview,false); - ts3_tex.get_texture(*(this->pair_program),"ts3_tex"); - ts3_tex.bind_float(ts3,4); ts4.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -147,8 +185,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts4,dview,false); - ts4_tex.get_texture(*(this->pair_program),"ts4_tex"); - ts4_tex.bind_float(ts4,4); ts5.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -160,8 +196,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts5,dview,false); - ts5_tex.get_texture(*(this->pair_program),"ts5_tex"); - ts5_tex.bind_float(ts5,4); ts6.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); @@ -173,20 +207,6 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, } ucl_copy(ts6,dview,false); - ts6_tex.get_texture(*(this->pair_program),"ts6_tex"); - ts6_tex.bind_float(ts6,4); - - UCL_H_Vec cutsq_view(nparams,*(this->ucl_device), - UCL_WRITE_ONLY); - double cutsqmax = 0.0; - for (int i=0; i(host_cutsq[i]); - if (cutsqmax < host_cutsq[i]) cutsqmax = host_cutsq[i]; - } - cutsq.alloc(nparams,*(this->ucl_device),UCL_READ_ONLY); - ucl_copy(cutsq,cutsq_view,false); - - _cutshortsq = static_cast(cutsqmax); UCL_H_Vec dview_elem2param(nelements*nelements*nelements, *(this->ucl_device), UCL_WRITE_ONLY); @@ -203,11 +223,11 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, ucl_copy(elem2param,dview_elem2param,false); - UCL_H_Vec dview_map(lj_types, *(this->ucl_device), UCL_WRITE_ONLY); + UCL_H_Vec dview_map(ntypes, *(this->ucl_device), UCL_WRITE_ONLY); for (int i = 0; i < ntypes; i++) dview_map[i] = host_map[i]; - map.alloc(lj_types,*(this->ucl_device), UCL_READ_ONLY); + map.alloc(ntypes,*(this->ucl_device), UCL_READ_ONLY); ucl_copy(map,dview_map,false); _global_e = global_e; @@ -216,8 +236,8 @@ int TersoffZT::init(const int ntypes, const int nlocal, const int nall, _allocated=true; this->_max_bytes=ts1.row_bytes()+ts2.row_bytes()+ts3.row_bytes()+ - ts4.row_bytes()+ts5.row_bytes()+cutsq.row_bytes()+ - map.row_bytes()+elem2param.row_bytes()+_zetaij.row_bytes(); + ts4.row_bytes()+ts5.row_bytes()+map.row_bytes()+elem2param.row_bytes()+ + _zetaij.row_bytes(); return 0; } @@ -233,12 +253,15 @@ void TersoffZT::clear() { ts4.clear(); ts5.clear(); ts6.clear(); - cutsq.clear(); + cutsq_pair.clear(); map.clear(); elem2param.clear(); _zetaij.clear(); k_zeta.clear(); + #if defined(LAL_OCL_EV_JIT) + k_zeta_noev.clear(); + #endif this->clear_atomic(); } @@ -254,75 +277,54 @@ double TersoffZT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - - // build the short neighbor list - int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - 
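
/* The _zetaij regrow above follows a grow-with-headroom pattern: when the
   required capacity exceeds the current allocation, the buffer is dropped and
   re-allocated 10% larger than strictly needed, with failure funneled into
   the success flag. The same hunk, restated with the static_cast template
   arguments spelled out: */
if (nall * this->nbor->max_nbors() > (int)_zetaij.cols()) {
  int _nmax = static_cast<int>(static_cast<double>(nall) * 1.10);
  _zetaij.clear();
  success = success &&
            (_zetaij.alloc(this->nbor->max_nbors() * _nmax,
                           *(this->ucl_device), UCL_READ_WRITE) == UCL_SUCCESS);
  if (!success) return 0;
}
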
this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); +int TersoffZT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // re-allocate zetaij if necessary int nall = this->_nall; - if (nall*this->_max_nbors > _zetaij.cols()) { + if (nall*this->nbor->max_nbors() > _zetaij.cols()) { int _nmax=static_cast(static_cast(nall)*1.10); - _zetaij.resize(this->_max_nbors*_nmax); + _zetaij.clear(); + success = success && (_zetaij.alloc(this->nbor->max_nbors()*_nmax, + *(this->ucl_device), + UCL_READ_WRITE) == UCL_SUCCESS); + if (!success) return 0; } - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->_ainum)/ - (BX/(JTHREADS*KTHREADS)))); - - this->k_zeta.set_size(GX,BX); - this->k_zeta.run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); - - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); - + // build the short neighbor list + int ainum=this->_ainum; this->time_pair.start(); - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, &ts1, &ts2, &ts6, - &_global_e, &_global_a_0, &_global_epsilon_0, &cutsq, - &map, &elem2param, &_nelements, &_nparams, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, + + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, &cutsq_pair, &_ntypes, + &this->nbor->dev_nbor, &this->nbor->dev_packed, + &ainum, &nbor_pitch, &this->_threads_per_atom); + + #if defined(LAL_OCL_EV_JIT) + if (eflag || vflag) k_zeta_selt = &k_zeta; + else k_zeta_selt = &k_zeta_noev; + #endif + + GX=static_cast(ceil(static_cast(this->_ainum)/ + (BX/(JTHREADS*KTHREADS)))); + k_zeta_selt->set_size(GX,BX); + k_zeta_selt->run(&this->atom->x, &ts1, &ts2, &ts3, &ts4, &ts5, &ts6, + &map, &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &eflag, &this->_ainum, &nbor_pitch, &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &map, + &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); Answer *end_ans; @@ -333,24 +335,35 @@ void TersoffZT::loop(const bool _eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, + 
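
/* The recurring launch-size arithmetic above, written out once: each atom is
   served by t_per_atom cooperating threads (t_per_atom squared, i.e.
   JTHREADS*KTHREADS, for the three-body kernels), so a block of BX threads
   covers BX/t_per_atom atoms and the grid is the ceiling of ainum over that.
   The helper name is illustrative: */
#include <cmath>
static inline int grid_size(int ainum, int BX, int threads_per_item) {
  const int items_per_block = BX / threads_per_item;
  return static_cast<int>(std::ceil(static_cast<double>(ainum) /
                                    items_per_block));
}
// e.g. GX = grid_size(this->_ainum, this->block_pair(), JTHREADS*KTHREADS);
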
this->k_three_end_vatom.run(&this->atom->x, &ts1, &ts2, &ts4, &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, &ts1, &ts2, &ts4, &cutsq, - &map, &elem2param, &_nelements, &_nparams, &_zetaij, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, &ts1, &ts2, &ts4, &map, + &elem2param, &_nelements, &_nparams, &_zetaij, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, &ts1, &ts2, &ts6, &_global_e, &_global_a_0, + &_global_epsilon_0, &map, &elem2param, &_nelements, + &_nparams, &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, + &this->_threads_per_atom, &GX); + this->time_pair.stop(); + return GX; } template class TersoffZBL; diff --git a/lib/gpu/lal_tersoff_zbl.cu b/lib/gpu/lal_tersoff_zbl.cu index f631cab91f..fce1ccc406 100644 --- a/lib/gpu/lal_tersoff_zbl.cu +++ b/lib/gpu/lal_tersoff_zbl.cu @@ -48,72 +48,16 @@ _texture( ts6_tex,int4); #define TWOTHIRD (numtyp)0.66666666666666666667 -#define zeta_idx(nbor_mem, packed_mem, nbor_pitch, n_stride, t_per_atom, \ - i, nbor_j, offset_j, idx) \ - if (nbor_mem==packed_mem) { \ - int jj = (nbor_j-offset_j-2*nbor_pitch)/n_stride; \ - idx = jj*n_stride + i*t_per_atom + offset_j; \ - } else { \ - idx = nbor_j; \ - } +#if (SHUFFLE_AVAIL == 0) -#if (ARCH < 300) - -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ - if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_PAIR]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ - } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ - } \ - } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ - acctyp4 old=ans[ii]; \ - old.x+=f.x; \ - old.y+=f.y; \ - old.z+=f.z; \ - ans[ii]=old; \ - } +#define local_allocate_acc_zeta() \ + __local acctyp red_acc[BLOCK_PAIR]; #define 
acc_zeta(z, tid, t_per_atom, offset) \ if (t_per_atom>1) { \ - __local acctyp red_acc[BLOCK_PAIR]; \ red_acc[tid]=z; \ for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + simdsync(); \ if (offset < s) { \ red_acc[tid] += red_acc[tid+s]; \ } \ @@ -121,36 +65,168 @@ _texture( ts6_tex,int4); z=red_acc[tid]; \ } -#else - #define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ - offset, eflag, vflag, ans, engv) \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ + } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii1) { \ + for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ + z += shfl_down(z, s, t_per_atom); \ + } \ + } + +#if (EVFLAG == 1) + +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ + if (t_per_atom>1) { \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ + } \ + } \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - z += shfl_xor(z, s, t_per_atom); \ - } \ - } - +#endif #endif __kernel void k_tersoff_zbl_short_nbor(const __global numtyp4 *restrict x_, - const __global int * dev_nbor, + const __global numtyp *restrict cutsq_pair, + const int ntypes, __global int * dev_nbor, const __global int * dev_packed, - __global int * dev_short_nbor, - const numtyp _cutshortsq, const int inum, const int nbor_pitch, const int 
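
/* In the SHUFFLE_AVAIL==0 fallback above, acc_zeta keeps its shared-memory
   tree reduction but gains a simdsync() between strides: a lane must never
   read its partner's slot before the partner has written it, a hazard the old
   lock-step assumption hid on hardware without independent lane scheduling.
   CUDA sketch of the same shape, with __syncwarp standing in for simdsync (an
   assumed equivalence) and BLOCK_PAIR as in the patch: */
__shared__ float red_acc[BLOCK_PAIR];
red_acc[tid] = z;
for (unsigned int s = t_per_atom / 2; s > 0; s >>= 1) {
  __syncwarp();                 // barrier between reduction strides
  if (offset < s) red_acc[tid] += red_acc[tid + s];
}
z = red_acc[tid];
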
t_per_atom) { - __local int n_stride; - int tid, ii, offset; - atom_info(t_per_atom,ii,tid,offset); + const int ii=GLOBAL_ID_X; + + #ifdef ONETYPE + const numtyp cutsq=cutsq_pair[ONETYPE]; + #endif if (ii cutsq[ijkparam]) continue; - numtyp4 ts1_ijkparam = ts1[ijkparam]; //fetch4(ts1_ijkparam,ijkparam,ts1_tex); numtyp ijkparam_lam3 = ts1_ijkparam.z; numtyp ijkparam_powermint = ts1_ijkparam.w; @@ -351,9 +408,6 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, rsq1, rsq2, delr1, delr2); } - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; acc_zeta(z, tid, t_per_atom, offset_k); numtyp4 ts1_ijparam = ts1[ijparam]; //fetch4(ts1_ijparam,ijparam,ts1_tex); @@ -384,7 +438,7 @@ __kernel void k_tersoff_zbl_zeta(const __global numtyp4 *restrict x_, zij.y = fpfeng[1]; zij.z = fpfeng[2]; zij.w = z; - zetaij[idx] = zij; + zetaij[nbor_j-2*nbor_pitch] = zij; } } // for nbor @@ -397,22 +451,20 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts6_in, const numtyp global_e, const numtyp global_a_0, const numtyp global_epsilon_0, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, - const int t_per_atom) { - __local int n_stride; - int tid, ii, offset; + const int t_per_atom, const int ev_stride) { + int tid, ii, offset, n_stride; atom_info(t_per_atom,ii,tid,offset); + local_allocate_store_pair(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts6[SHARED_SIZE]; @@ -422,36 +474,28 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, ts6[tid]=ts6_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii= cutsq[ijparam]) continue; - numtyp feng[2]; numtyp ijparam_lam1 = ts1[ijparam].x; numtyp4 ts2_ijparam = ts2[ijparam]; @@ -489,9 +531,9 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, f.y+=dely*force; f.z+=delz*force; - if (eflag>0) + if (EVFLAG && eflag) energy+=feng[1]; - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -500,37 +542,33 @@ __kernel void k_tersoff_zbl_repulsive(const __global numtyp4 *restrict x_, virial[5] += dely*delz*force; } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + store_answers_p(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv,ev_stride); } __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * dev_nbor, - 
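
/* The rewritten short-neighbor kernel above drops the single global
   _cutshortsq in favor of the per-pair cutsq_pair table, plus a ONETYPE fast
   path: when only one type pair is active, ONETYPE is #defined to its
   flattened index and the lookup collapses to one uniform load hoisted out of
   the neighbor loop. A compact sketch — the itype/jtype derivation and the
   trimmed-list write are abbreviated, and float stands in for numtyp: */
__device__ void keep_if_cut(const float *cutsq_pair, int ntypes, int itype,
                            int jtype, float rsq, int j, int *out, int *n) {
#ifdef ONETYPE
  const float cutsq = cutsq_pair[ONETYPE];              // uniform, hoistable
#else
  const float cutsq = cutsq_pair[itype*ntypes + jtype]; // per-pair lookup
#endif
  if (rsq < cutsq) out[(*n)++] = j;   // j survives into the trimmed list
}
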
const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); // offset ranges from 0 to tpa_sq-1 + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -540,46 +578,37 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, ts4[tid]=ts4_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } numtyp tpainv = ucl_recip((numtyp)t_per_atom); __syncthreads(); if (ii= cutsq[ijparam]) continue; numtyp r1 = ucl_sqrt(rsq1); numtyp r1inv = ucl_rsqrt(rsq1); // look up for zeta_ij - // idx to zetaij is shifted by n_stride relative to nbor_j in dev_short_nbor - int idx = nbor_j; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_ij = zetaij[idx]; // fetch(zeta_ij,idx,zeta_tex); + acctyp4 zeta_ij = zetaij[nbor_j-2*nbor_pitch]; numtyp force = zeta_ij.x*tpainv; numtyp prefactor = zeta_ij.y; f.x += delr1[0]*force; f.y += delr1[1]*force; f.z += delr1[2]*force; - if (eflag>0) { + if (EVFLAG && eflag) { energy+=zeta_ij.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += delr1[0]*delr1[0]*mforce; virial[1] += delr1[1]*delr1[1]*mforce; @@ -617,14 +642,8 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, } int nbor_k = nborj_start-offset_j+offset_k; - int k_end = nbor_end; - if (dev_packed==dev_nbor) { - int numk = dev_short_nbor[nbor_k-n_stride]; - k_end = nbor_k+fast_mul(numk,n_stride); - } - - for ( ; nbor_k cutsq[ijkparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -656,7 +674,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, d = ts4_ijkparam.y; h = ts4_ijkparam.z; gamma = ts4_ijkparam.w; - if (vflag>0) + if (EVFLAG && vflag) attractive(bigr, bigd, powermint, lam3, c, d, h, gamma, prefactor, r1, r1inv, r2, r2inv, delr1, delr2, fi, fj, fk); else @@ -666,7 +684,7 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, f.y += fi[1]; f.z += fi[2]; - if (vflag>0) { + if (EVFLAG && vflag) { acctyp v[6]; numtyp pre = (numtyp)2.0; if (evatom==1) pre = TWOTHIRD; @@ -682,37 +700,34 @@ __kernel void k_tersoff_zbl_three_center(const __global numtyp4 *restrict x_, } } // nbor_k } // for nbor_j - - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq, - offset,eflag,vflag,ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,tpa_sq, + offset,eflag,vflag,ans,engv); } __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, const __global numtyp4 *restrict ts1_in, const __global numtyp4 *restrict ts2_in, const __global numtyp4 *restrict ts4_in, - const __global numtyp *restrict cutsq, const __global int *restrict map, const __global int *restrict elem2param, const int nelements, const int nparams, const __global acctyp4 *restrict zetaij, const __global int * 
dev_nbor, - const __global int * dev_packed, const __global int * dev_ilist, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int gpu_nbor) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp lam3, powermint, bigr, bigd, c, d, h, gamma; - int tid, ii, offset; + int tid, ii, offset, n_stride; atom_info(tpa_sq,ii,tid,offset); + local_allocate_store_three(); + __local numtyp4 ts1[SHARED_SIZE]; __local numtyp4 ts2[SHARED_SIZE]; __local numtyp4 ts4[SHARED_SIZE]; @@ -722,23 +737,25 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, ts4[tid]=ts4_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } - __local int ijnum_shared[BLOCK_PAIR]; + #ifdef LAL_SIMD_IP_SYNC + __local int localk[BLOCK_PAIR]; + #endif __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -843,7 +841,7 @@ __kernel void k_tersoff_zbl_three_end(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start ; nbor_k0) { + if (EVFLAG && eflag) { energy+=zeta_ji.z*tpainv; } - if (vflag>0) { + if (EVFLAG && vflag) { numtyp mforce = -force; virial[0] += mdelr1[0]*mdelr1[0]*mforce; virial[1] += mdelr1[1]*mdelr1[1]*mforce; @@ -1072,7 +1047,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, // attractive forces for (nbor_k = nbork_start; nbor_k cutsq[jikparam]) continue; numtyp r2 = ucl_sqrt(rsq2); numtyp r2inv = ucl_rsqrt(rsq2); @@ -1118,10 +1092,7 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, virial[4] += TWOTHIRD*(mdelr1[0]*fj[2] + delr2[0]*fk[2]); virial[5] += TWOTHIRD*(mdelr1[1]*fj[2] + delr2[1]*fk[2]); - // idx to zetaij is shifted by n_stride relative to nbor_k in dev_short_nbor - int idx = nbor_k; - if (dev_packed==dev_nbor) idx -= n_stride; - acctyp4 zeta_jk = zetaij[idx]; // fetch(zeta_jk,idx,zeta_tex); + acctyp4 zeta_jk = zetaij[nbor_k-2*nbor_pitch]; numtyp prefactor_jk = zeta_jk.y; int jkiparam=elem2param[jtype*nelements*nelements+ktype*nelements+itype]; @@ -1150,14 +1121,13 @@ __kernel void k_tersoff_zbl_three_end_vatom(const __global numtyp4 *restrict x_, virial[5] += TWOTHIRD*(delr2[1]*fj[2] + mdelr1[1]*fk[2]); } } // for nbor - - #ifdef THREE_CONCURRENT - store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #else - store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, - eflag,vflag,ans,engv); - #endif } // if ii + #ifdef THREE_CONCURRENT + store_answers(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv); + #else + store_answers_p(f,energy,virial,ii,inum,tid,tpa_sq,offset, + eflag,vflag,ans,engv,NUM_BLOCKS_X); + #endif } diff --git a/lib/gpu/lal_tersoff_zbl.h b/lib/gpu/lal_tersoff_zbl.h index eb03e9fb02..b82b391765 100644 --- a/lib/gpu/lal_tersoff_zbl.h +++ b/lib/gpu/lal_tersoff_zbl.h @@ -65,7 +65,7 @@ class TersoffZBL : public BaseThree { bool shared_types; /// Number of atom types - int _lj_types; + int _ntypes; /// ts1.x = 
lam1, ts1.y = lam2, ts1.z = lam3, ts1.w = powermint UCL_D_Vec ts1; @@ -80,7 +80,7 @@ class TersoffZBL : public BaseThree { /// ts6.x = Z_i, ts6.y = Z_j, ts6.z = ZBLcut, ts6.w = ZBLexpscale UCL_D_Vec ts6; - UCL_D_Vec cutsq; + UCL_D_Vec cutsq_pair; UCL_D_Vec elem2param; UCL_D_Vec map; @@ -91,15 +91,13 @@ class TersoffZBL : public BaseThree { /// zetaij.w = zetaij UCL_D_Vec _zetaij; - UCL_Kernel k_zeta; - UCL_Texture ts1_tex, ts2_tex, ts3_tex, ts4_tex, ts5_tex, ts6_tex; + UCL_Kernel k_zeta, k_zeta_noev, *k_zeta_selt; numtyp _global_e,_global_a_0,_global_epsilon_0; - numtyp _cutshortsq; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; } diff --git a/lib/gpu/lal_tersoff_zbl_ext.cpp b/lib/gpu/lal_tersoff_zbl_ext.cpp index d1a9e090b6..518b535627 100644 --- a/lib/gpu/lal_tersoff_zbl_ext.cpp +++ b/lib/gpu/lal_tersoff_zbl_ext.cpp @@ -70,7 +70,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, int init_ok=0; if (world_me==0) - init_ok=TSZMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSZMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -93,7 +93,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=TSZMF.init(ntypes, inum, nall, 300, cell_size, gpu_split, screen, + init_ok=TSZMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, ts_lam1, ts_lam2, ts_lam3, ts_powermint, ts_biga, ts_bigb, ts_bigr, ts_bigd, @@ -110,7 +110,7 @@ int tersoff_zbl_gpu_init(const int ntypes, const int inum, const int nall, fprintf(screen,"\n"); if (init_ok==0) - TSZMF.estimate_gpu_overhead(); + TSZMF.estimate_gpu_overhead(1); return init_ok; } diff --git a/lib/gpu/lal_ufm.cpp b/lib/gpu/lal_ufm.cpp index a86d07f340..f6a48d4470 100644 --- a/lib/gpu/lal_ufm.cpp +++ b/lib/gpu/lal_ufm.cpp @@ -131,20 +131,9 @@ double UFMT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void UFMT::loop(const bool _eflag, const bool _vflag) { +int UFMT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -152,8 +141,8 @@ void UFMT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &uf1, &uf3, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &uf1, &uf3, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -166,6 +155,7 @@ void UFMT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class UFM; diff --git a/lib/gpu/lal_ufm.cu b/lib/gpu/lal_ufm.cu index 03d1e85bdf..9d6c7b978a 100644 --- a/lib/gpu/lal_ufm.cu +++ b/lib/gpu/lal_ufm.cu 
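
/* The same migration applies to the simple pair styles below (ufm, yukawa):
   loop() takes eflag/vflag as plain ints and returns the grid size GX. The
   bool-to-int conversion boilerplate disappears because the flags are no
   longer binary — the "vflag==2 || eflag==2" tests in the store_answers_p
   macros above suggest 0 = off, 1 = global accumulation, 2 = per-atom
   accumulation, though that encoding is an inference from this patch alone: */
// old:  void loop(const bool _eflag, const bool _vflag);
// new:  int  loop(const int eflag, const int vflag);   // returns GX
int loop_sketch(int ainum, int BX, int tpa) {
  // the grid size doubles as the ev_stride handed back to the base class
  const int GX = static_cast<int>(ceil(static_cast<double>(ainum)/(BX/tpa)));
  // ... set_size(GX,BX) and run the selected kernel ...
  return GX;
}
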
@@ -40,16 +40,19 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { energy += - factor_lj * uf3[mtype].x*log(1.0 - expuf) - uf3[mtype].z; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -95,9 +98,9 @@ __kernel void k_ufm(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, @@ -116,26 +119,29 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, __local numtyp4 uf1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 uf3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) + if (EVFLAG && eflag) uf3[tid]=uf3_in[tid]; } - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii0) { + if (EVFLAG && eflag) { energy += - factor_lj * uf3[mtype].x * log(1.0 - expuf) - uf3[mtype].z; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -181,8 +187,8 @@ __kernel void k_ufm_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_ufm.h b/lib/gpu/lal_ufm.h index 14b96bcc86..390af831ba 100644 --- a/lib/gpu/lal_ufm.h +++ b/lib/gpu/lal_ufm.h @@ -77,7 +77,7 @@ class UFM : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_ufm_ext.cpp b/lib/gpu/lal_ufm_ext.cpp index 12809a28fb..432cbb2e63 100644 --- a/lib/gpu/lal_ufm_ext.cpp +++ b/lib/gpu/lal_ufm_ext.cpp @@ -57,7 +57,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, int init_ok=0; if (world_me==0) init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, - offset, special_lj, inum, nall, 300, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); UFMLMF.device->world_barrier(); @@ -75,7 +75,7 @@ int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, } if (gpu_rank==i && world_me!=0) init_ok=UFMLMF.init(ntypes, cutsq, host_uf1, host_uf2, host_uf3, - offset, special_lj, inum, nall, 300, maxspecial, + offset, special_lj, inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); UFMLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_vashishta.cpp b/lib/gpu/lal_vashishta.cpp index 4af8a0f71c..c343de3f55 100644 --- a/lib/gpu/lal_vashishta.cpp +++ b/lib/gpu/lal_vashishta.cpp @@ 
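
/* The new local_allocate_store_pair() call sits at kernel scope, before the
   "if (ii < inum)" body, because shared-memory scratch must be declared where
   every thread in the block reaches it uniformly. A plausible expansion for
   the no-shuffle build — this is an assumption about a macro defined in the
   shared preprocessor headers, not in this patch: */
#if (SHUFFLE_AVAIL == 0)
#define local_allocate_store_pair() \
  __shared__ acctyp red_acc[6][BLOCK_PAIR]
#else
#define local_allocate_store_pair()   // shuffle build needs no scratch
#endif
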
-50,7 +50,7 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -138,8 +138,6 @@ int VashishtaT::init(const int ntypes, const int nlocal, const int nall, const i dview[i].w=static_cast(r0[i]); } - _cutshortsq = static_cast(r0sqmax); - ucl_copy(param4,dview,false); param4_tex.get_texture(*(this->pair_program),"param4_tex"); param4_tex.bind_float(param4,4); @@ -212,60 +210,33 @@ double VashishtaT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void VashishtaT::loop(const bool _eflag, const bool _vflag, const int evatom) { - // Compute the block size and grid size to keep all cores busy - int BX=this->block_pair(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; +int VashishtaT::loop(const int eflag, const int vflag, const int evatom, + bool &success) { + const int nbor_pitch=this->nbor->nbor_pitch(); // build the short neighbor list int ainum=this->_ainum; - int nbor_pitch=this->nbor->nbor_pitch(); - int GX=static_cast(ceil(static_cast(ainum)/ - (BX/this->_threads_per_atom))); - - this->k_short_nbor.set_size(GX,BX); - this->k_short_nbor.run(&this->atom->x, &this->nbor->dev_nbor, - &this->_nbor_data->begin(), - &this->dev_short_nbor, &_cutshortsq, &ainum, - &nbor_pitch, &this->_threads_per_atom); - - // this->_nbor_data == nbor->dev_packed for gpu_nbor == 0 and tpa > 1 - // this->_nbor_data == nbor->dev_nbor for gpu_nbor == 1 or tpa == 1 - ainum=this->ans->inum(); - nbor_pitch=this->nbor->nbor_pitch(); - GX=static_cast(ceil(static_cast(this->ans->inum())/ - (BX/this->_threads_per_atom))); this->time_pair.start(); - // note that k_pair does not run with the short neighbor list - this->k_pair.set_size(GX,BX); - this->k_pair.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->ans->force, &this->ans->engv, - &eflag, &vflag, &ainum, &nbor_pitch, - &this->_threads_per_atom); + int BX=this->block_pair(); + int GX=static_cast(ceil(static_cast(ainum)/BX)); + this->k_short_nbor.set_size(GX,BX); + this->k_short_nbor.run(&this->atom->x, ¶m4, &map, &elem2param, + &_nelements, &_nparams, &this->nbor->dev_nbor, + &this->nbor->dev_packed, &ainum, &nbor_pitch, + &this->_threads_per_atom); + ainum=this->ans->inum(); BX=this->block_size(); GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/(KTHREADS*JTHREADS)))); - - this->k_three_center.set_size(GX,BX); - this->k_three_center.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->dev_short_nbor, - &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, + this->k_3center_sel->set_size(GX,BX); + this->k_3center_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, + ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->ans->force, + &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &evatom); + Answer *end_ans; #ifdef THREE_CONCURRENT end_ans=this->ans2; @@ -274,23 +245,34 @@ void VashishtaT::loop(const bool 
_eflag, const bool _vflag, const int evatom) { #endif if (evatom!=0) { this->k_three_end_vatom.set_size(GX,BX); - this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_three_end_vatom.run(&this->atom->x, ¶m1, ¶m2, ¶m3, + ¶m4, ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } else { - this->k_three_end.set_size(GX,BX); - this->k_three_end.run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, - &map, &elem2param, &_nelements, - &this->nbor->dev_nbor, &this->_nbor_data->begin(), - &this->nbor->dev_ilist, &this->dev_short_nbor, - &end_ans->force, &end_ans->engv, &eflag, &vflag, &ainum, - &nbor_pitch, &this->_threads_per_atom, &this->_gpu_nbor); + this->k_3end_sel->set_size(GX,BX); + this->k_3end_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, + ¶m5, &map, &elem2param, &_nelements, + &this->nbor->dev_nbor, &this->nbor->three_ilist, + &end_ans->force, &end_ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &this->_threads_per_atom, + &this->_gpu_nbor); } + BX=this->block_pair(); + int GXT=static_cast(ceil(static_cast(this->ans->inum())/ + (BX/this->_threads_per_atom))); + // note that k_pair does not run with the short neighbor list + this->k_sel->set_size(GXT,BX); + this->k_sel->run(&this->atom->x, ¶m1, ¶m2, ¶m3, ¶m4, ¶m5, + &map, &elem2param, &_nelements, &this->nbor->dev_packed, + &this->ans->force, &this->ans->engv, &eflag, &vflag, + &ainum, &nbor_pitch, &GX); + this->time_pair.stop(); + return GX; } template class Vashishta; diff --git a/lib/gpu/lal_vashishta.cu b/lib/gpu/lal_vashishta.cu index da15aaf09a..6c9ba14b4a 100644 --- a/lib/gpu/lal_vashishta.cu +++ b/lib/gpu/lal_vashishta.cu @@ -32,6 +32,14 @@ _texture( param4_tex,int4); _texture( param5_tex,int4); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define param1_tex param1 +#define param2_tex param2 +#define param3_tex param3 +#define param4_tex param4 +#define param5_tex param5 +#endif + #else #define pos_tex x_ #define param1_tex param1 @@ -41,92 +49,167 @@ _texture( param5_tex,int4); #define param5_tex param5 #endif + + #define THIRD (numtyp)0.66666666666666666667 //#define THREE_CONCURRENT -#if (ARCH < 300) +#if (SHUFFLE_AVAIL == 0) -#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, offset, \ - eflag, vflag, ans, engv) \ +#define store_answers_p(f, energy, virial, ii, inum, tid, t_per_atom, \ + offset, eflag, vflag, ans, engv, ev_stride) \ if (t_per_atom>1) { \ - __local acctyp red_acc[6][BLOCK_ELLIPSE]; \ - red_acc[0][tid]=f.x; \ - red_acc[1][tid]=f.y; \ - red_acc[2][tid]=f.z; \ - red_acc[3][tid]=energy; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - if (offset < s) { \ - for (int r=0; r<4; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ + simd_reduce_add3(t_per_atom, red_acc, offset, tid, f.x, f.y, f.z); \ + if (EVFLAG && (vflag==2 || eflag==2)) { \ + if (eflag) { \ + simdsync(); \ + simd_reduce_add1(t_per_atom, red_acc, offset, tid, energy); \ } \ - } \ - f.x=red_acc[0][tid]; \ - f.y=red_acc[1][tid]; \ - f.z=red_acc[2][tid]; \ - energy=red_acc[3][tid]; \ - if (vflag>0) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid]=virial[r]; \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ 
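
/* The __CUDACC_VER_MAJOR__ >= 11 block above reflects that the legacy texture
   reference API behind these *_tex bindings is deprecated as of CUDA 11;
   under nvcc 11+ each tex name is aliased straight to its global-memory
   array, so fetches degrade gracefully to plain loads: */
#if (__CUDACC_VER_MAJOR__ >= 11)
#define param1_tex param1   // fetch4(v,i,param1_tex) becomes a direct read
#define param2_tex param2
#define param3_tex param3
#define param4_tex param4
#define param5_tex param5
#endif
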
- if (offset < s) { \ - for (int r=0; r<6; r++) \ - red_acc[r][tid] += red_acc[r][tid+s]; \ - } \ + if (vflag) { \ + simdsync(); \ + simd_reduce_arr(6, t_per_atom, red_acc, offset, tid, virial); \ } \ - for (int r=0; r<6; r++) \ - virial[r]=red_acc[r][tid]; \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ - ei+=inum; \ - } \ - } \ + if (offset==0 && ii1) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - f.x += shfl_xor(f.x, s, t_per_atom); \ - f.y += shfl_xor(f.y, s, t_per_atom); \ - f.z += shfl_xor(f.z, s, t_per_atom); \ - energy += shfl_xor(energy, s, t_per_atom); \ - } \ - if (vflag>0) { \ - for (unsigned int s=t_per_atom/2; s>0; s>>=1) { \ - for (int r=0; r<6; r++) \ - virial[r] += shfl_xor(virial[r], s, t_per_atom); \ - } \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (vflag==2 || eflag==2) { \ + if (eflag) \ + simd_reduce_add1(t_per_atom,energy); \ + if (vflag) \ + simd_reduce_arr(6, t_per_atom,virial); \ } \ } \ - if (offset==0) { \ - int ei=ii; \ - if (eflag>0) { \ - engv[ei]+=energy*(acctyp)0.5; \ - ei+=inum; \ - } \ - if (vflag>0) { \ - for (int i=0; i<6; i++) { \ - engv[ei]+=virial[i]*(acctyp)0.5; \ + if (offset==0 && ii 1; active_subgs /= vwidth) { \ + if (active_subgs < BLOCK_SIZE_X/simd_size()) __syncthreads(); \ + if (bnum < active_subgs) { \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (voffset==0) red_acc[6][bnum] = energy; \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (voffset==0) \ + for (int r=0; r<6; r++) red_acc[r][bnum]=virial[r]; \ + } \ + } \ + \ + __syncthreads(); \ + if (tid < active_subgs) { \ + if (eflag) energy = red_acc[6][tid]; \ + if (vflag) \ + for (int r = 0; r < 6; r++) virial[r] = red_acc[r][tid]; \ + } else { \ + if (eflag) energy = (acctyp)0; \ + if (vflag) for (int r = 0; r < 6; r++) virial[r] = (acctyp)0; \ + } \ + } \ + \ + if (bnum == 0) { \ + int ei=BLOCK_ID_X; \ + if (eflag) { \ + simd_reduce_add1(vwidth, energy); \ + if (tid==0) { \ + engv[ei]+=energy*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + if (vflag) { \ + simd_reduce_arr(6, vwidth, virial); \ + if (tid==0) { \ + for (int r=0; r<6; r++) { \ + engv[ei]+=virial[r]*(acctyp)0.5; \ + ei+=ev_stride; \ + } \ + } \ + } \ + } \ + } else if (offset==0 && ii1) \ + simd_reduce_add3(t_per_atom, f.x, f.y, f.z); \ + if (offset==0 && ii0) + if (EVFLAG && eflag) energy += (param3_bigh*reta+vc2-vc3-param3_bigw*r6inv-r*param3_dvrc+param3_c0); - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -293,11 +381,10 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, } } } // for nbor - - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii - + const int tid=THREAD_ID_X; + store_answers_p(f,energy,virial,ii,inum,tid,1,0,eflag,vflag,ans,engv, + ev_stride); } #define threebody(delr1x, delr1y, delr1z, eflag, energy) \ @@ -344,9 +431,9 @@ __kernel void k_vashishta(const __global numtyp4 *restrict x_, fky = delr2y*(frad2+csfac2)-delr1y*facang12; \ fkz = delr2z*(frad2+csfac2)-delr1z*facang12; \ \ - if (eflag>0) \ + if (EVFLAG && eflag) \ energy+=facrad; \ - if (vflag>0) { \ + if (EVFLAG && vflag) { \ virial[0] += delr1x*fjx + delr2x*fkx; \ virial[1] += delr1y*fjy + delr2y*fky; \ virial[2] += delr1z*fjz + delr2z*fkz; \ @@ -402,54 +489,45 @@ __kernel void 
k_vashishta_three_center(const __global numtyp4 *restrict x_, const __global int *restrict elem2param, const int nelements, const __global int * dev_nbor, - const __global int * dev_packed, - const __global int * dev_short_nbor, __global acctyp4 *restrict ans, __global acctyp *restrict engv, const int eflag, const int vflag, const int inum, const int nbor_pitch, const int t_per_atom, const int evatom) { - __local int tpa_sq, n_stride; - tpa_sq=fast_mul(t_per_atom,t_per_atom); + int n_stride; + const int tpa_sq=fast_mul(t_per_atom,t_per_atom); numtyp param_gamma_ij, param_r0sq_ij, param_r0_ij, param_gamma_ik, param_r0sq_ik, param_r0_ik; numtyp param_costheta_ijk, param_bigc_ijk, param_bigb_ijk, param_big2b_ijk; int tid, ii, offset; atom_info(tpa_sq,ii,tid,offset); - acctyp energy=(acctyp)0; + local_allocate_store_three(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } __syncthreads(); if (ii { int init(const int ntypes, const int nlocal, const int nall, const int max_nbors, const double cell_size, const double gpu_split, FILE *screen, int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* cutsq, const double* r0, + const double* cutsq, const double* r0, const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -82,13 +82,12 @@ class Vashishta : public BaseThree { UCL_D_Vec elem2param; UCL_D_Vec map; int _nparams,_nelements; - numtyp _cutshortsq; UCL_Texture param1_tex, param2_tex, param3_tex, param4_tex, param5_tex; private: bool _allocated; - void loop(const bool _eflag, const bool _vflag, const int evatom); + int loop(const int eflag, const int vflag, const int evatom, bool &success); }; diff --git a/lib/gpu/lal_vashishta_ext.cpp b/lib/gpu/lal_vashishta_ext.cpp index 56dfd8a0ff..ecbdefed19 100644 --- a/lib/gpu/lal_vashishta_ext.cpp +++ b/lib/gpu/lal_vashishta_ext.cpp @@ -32,7 +32,7 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i const double* gamma, const double* eta, const double* lam1inv, const double* lam4inv, const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, + const double* dvrc, const double* big6w, const double* heta, const double* bigh, const double* bigw, const double* c0, const double* costheta, const double* bigb, @@ -63,10 +63,10 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i int init_ok=0; if (world_me==0) - init_ok=VashishtaMF.init(ntypes, inum, nall, 500, cell_size, gpu_split, screen, + init_ok=VashishtaMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - cutsq, r0, gamma, eta, lam1inv, - lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, + cutsq, r0, gamma, eta, lam1inv, + lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, c0, costheta, bigb, big2b, bigc); VashishtaMF.device->world_barrier(); @@ -83,10 +83,10 @@ int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const i fflush(screen); } if (gpu_rank==i && world_me!=0) - init_ok=VashishtaMF.init(ntypes, inum, nall, 500, 
cell_size, gpu_split, screen, + init_ok=VashishtaMF.init(ntypes, inum, nall, max_nbors, cell_size, gpu_split, screen, host_map, nelements, host_elem2param, nparams, - cutsq, r0, gamma, eta, lam1inv, - lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, + cutsq, r0, gamma, eta, lam1inv, + lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, c0, costheta, bigb, big2b, bigc); VashishtaMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_yukawa.cpp b/lib/gpu/lal_yukawa.cpp index 453139e537..707f60f071 100644 --- a/lib/gpu/lal_yukawa.cpp +++ b/lib/gpu/lal_yukawa.cpp @@ -109,20 +109,9 @@ double YukawaT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void YukawaT::loop(const bool _eflag, const bool _vflag) { +int YukawaT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -130,8 +119,8 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff, &_kappa, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff, &_kappa, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, @@ -144,6 +133,7 @@ void YukawaT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class Yukawa; diff --git a/lib/gpu/lal_yukawa.cu b/lib/gpu/lal_yukawa.cu index 62bc013dc6..6ebd2dc06d 100644 --- a/lib/gpu/lal_yukawa.cu +++ b/lib/gpu/lal_yukawa.cu @@ -38,22 +38,25 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -104,9 +107,9 @@ __kernel void k_yukawa(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, @@ -124,25 +127,28 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x*screening*rinv; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG 
&& vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -193,8 +199,8 @@ __kernel void k_yukawa_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_yukawa.h b/lib/gpu/lal_yukawa.h index 7d638d760e..51871a9728 100644 --- a/lib/gpu/lal_yukawa.h +++ b/lib/gpu/lal_yukawa.h @@ -72,7 +72,7 @@ class Yukawa : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_yukawa_colloid.cpp b/lib/gpu/lal_yukawa_colloid.cpp index 46d4d64328..a447bb3889 100644 --- a/lib/gpu/lal_yukawa_colloid.cpp +++ b/lib/gpu/lal_yukawa_colloid.cpp @@ -133,10 +133,25 @@ double YukawaColloidT::host_memory_usage() const { template void YukawaColloidT::compute(const int f_ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, - int *numj, int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *rad) { + int *numj, int **firstneigh, const bool eflag_in, + const bool vflag_in, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *rad) { this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); // ------------------- Resize rad array -------------------------- @@ -177,8 +192,8 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, this->atom->add_x_data(host_x,host_type); this->add_rad_data(); - this->loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom,ilist); + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,ilist,red_blocks); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); } @@ -187,14 +202,28 @@ void YukawaColloidT::compute(const int f_ago, const int inum_full, // Reneighbor on GPU and then compute per-atom densities // --------------------------------------------------------------------------- template -int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, +int** YukawaColloidT::compute(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, + tagint **special, const bool eflag_in, const bool vflag_in, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *rad) { this->acc_timers(); + int eflag, vflag; + if (eatom) eflag=2; + else if (eflag_in) eflag=1; + else eflag=0; + if (vatom) vflag=2; + else if (vflag_in) vflag=1; + else vflag=0; + + #ifdef LAL_NO_BLOCK_REDUCE + if (eflag) eflag=2; + if (vflag) vflag=2; + #endif + + this->set_kernel(eflag,vflag); // ------------------- Resize rad array ---------------------------- @@ -240,8 +269,8 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall 
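The compute() wrappers above translate the caller's booleans into the integer flag convention the reworked kernels consume: 0 skips accumulation, 1 requests global energy/virial via the block reduction, and 2 requests per-atom tallies; building with LAL_NO_BLOCK_REDUCE promotes any nonzero flag to 2. A minimal Python sketch of that mapping (function name hypothetical):

```python
def encode_ev_flags(eflag_in, vflag_in, eatom, vatom, no_block_reduce=False):
    """Mirror the flag setup in YukawaColloidT::compute (sketch):
    0 = skip, 1 = global accumulation (block reduce), 2 = per-atom."""
    eflag = 2 if eatom else (1 if eflag_in else 0)
    vflag = 2 if vatom else (1 if vflag_in else 0)
    if no_block_reduce:  # LAL_NO_BLOCK_REDUCE build option
        eflag = 2 if eflag else 0
        vflag = 2 if vflag else 0
    return eflag, vflag

# global energy only, per-atom virial:
assert encode_ev_flags(True, True, False, True) == (1, 2)
```

The resulting pair of flags is handed to set_kernel(eflag,vflag), which picks the kernel variant that k_pair_sel then launches.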
*ilist=this->nbor->host_ilist.begin(); *jnum=this->nbor->host_acc.begin(); - this->loop(eflag,vflag); - this->ans->copy_answers(eflag,vflag,eatom,vatom); + const int red_blocks=this->loop(eflag,vflag); + this->ans->copy_answers(eflag_in,vflag_in,eatom,vatom,red_blocks); this->device->add_ans_object(this->ans); this->hd_balancer.stop_timer(); @@ -252,20 +281,9 @@ int** YukawaColloidT::compute(const int ago, const int inum_full, const int nall // Calculate per-atom energies and forces // --------------------------------------------------------------------------- template -void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { +int YukawaColloidT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -273,8 +291,8 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { int nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &c_rad, &coeff, &sp_lj, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &c_rad, &coeff, &sp_lj, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); @@ -286,6 +304,7 @@ void YukawaColloidT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom, &_kappa); } this->time_pair.stop(); + return GX; } template class YukawaColloid; diff --git a/lib/gpu/lal_yukawa_colloid.cu b/lib/gpu/lal_yukawa_colloid.cu index 30b458fec7..847ffa6d80 100644 --- a/lib/gpu/lal_yukawa_colloid.cu +++ b/lib/gpu/lal_yukawa_colloid.cu @@ -24,6 +24,10 @@ _texture_2d( pos_tex,int4); _texture( rad_tex,int2); #endif +#if (__CUDACC_VER_MAJOR__ >= 11) +#define rad_tex rad_ +#endif + #else #define pos_tex x_ #define rad_tex rad_ @@ -45,22 +49,25 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, atom_info(t_per_atom,ii,tid,offset); __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + sp_lj[0]=sp_lj_in[0]; sp_lj[1]=sp_lj_in[1]; sp_lj[2]=sp_lj_in[2]; sp_lj[3]=sp_lj_in[3]; - acctyp energy=(acctyp)0; acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x/kappa * screening; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -113,9 +120,9 @@ __kernel void k_yukawa_colloid(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, @@ -134,25 +141,28 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp sp_lj[4]; + int n_stride; + local_allocate_store_pair(); + if (tid<4) 
sp_lj[tid]=sp_lj_in[tid]; if (tid0) { + if (EVFLAG && eflag) { numtyp e=coeff[mtype].x/kappa * screening; energy+=factor_lj*(e-coeff[mtype].y); } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -205,8 +215,7 @@ __kernel void k_yukawa_colloid_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } - diff --git a/lib/gpu/lal_yukawa_colloid.h b/lib/gpu/lal_yukawa_colloid.h index 607bc42321..a08248dd3a 100644 --- a/lib/gpu/lal_yukawa_colloid.h +++ b/lib/gpu/lal_yukawa_colloid.h @@ -114,7 +114,7 @@ class YukawaColloid : public BaseAtomic { private: bool _shared_view; bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_yukawa_colloid_ext.cpp b/lib/gpu/lal_yukawa_colloid_ext.cpp index 988d33bdd6..db86f91689 100644 --- a/lib/gpu/lal_yukawa_colloid_ext.cpp +++ b/lib/gpu/lal_yukawa_colloid_ext.cpp @@ -55,7 +55,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, int init_ok=0; if (world_me==0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->world_barrier(); @@ -73,7 +73,7 @@ int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, } if (gpu_rank==i && world_me!=0) init_ok=YKCOLLMF.init(ntypes, cutsq, host_a, host_offset, special_lj, - inum, nall, 300, maxspecial, cell_size, gpu_split, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen, kappa); YKCOLLMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_yukawa_ext.cpp b/lib/gpu/lal_yukawa_ext.cpp index 995694bdfd..cf2bf89e3d 100644 --- a/lib/gpu/lal_yukawa_ext.cpp +++ b/lib/gpu/lal_yukawa_ext.cpp @@ -55,7 +55,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, int init_ok=0; if (world_me==0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); YKMF.device->world_barrier(); @@ -73,7 +73,7 @@ int yukawa_gpu_init(const int ntypes, double **cutsq, double kappa, } if (gpu_rank==i && world_me!=0) init_ok=YKMF.init(ntypes, cutsq, kappa, host_a, offset, special_lj, - inum, nall, 300, maxspecial, cell_size, + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); YKMF.device->gpu_barrier(); diff --git a/lib/gpu/lal_zbl.cpp b/lib/gpu/lal_zbl.cpp index 2bf3369174..885f6f10bb 100644 --- a/lib/gpu/lal_zbl.cpp +++ b/lib/gpu/lal_zbl.cpp @@ -118,20 +118,9 @@ double ZBLT::host_memory_usage() const { // Calculate energies, forces, and torques // --------------------------------------------------------------------------- template -void ZBLT::loop(const bool _eflag, const bool _vflag) { +int ZBLT::loop(const int eflag, const int vflag) { // Compute the block size and grid size to keep all cores busy const int BX=this->block_size(); - int eflag, vflag; - if (_eflag) - eflag=1; - else - eflag=0; - - if (_vflag) - vflag=1; - else - vflag=0; - int GX=static_cast(ceil(static_cast(this->ans->inum())/ (BX/this->_threads_per_atom))); @@ -139,8 +128,8 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { int 
nbor_pitch=this->nbor->nbor_pitch(); this->time_pair.start(); if (shared_types) { - this->k_pair_fast.set_size(GX,BX); - this->k_pair_fast.run(&this->atom->x, &coeff1, &coeff2, &coeff3, + this->k_pair_sel->set_size(GX,BX); + this->k_pair_sel->run(&this->atom->x, &coeff1, &coeff2, &coeff3, &_cut_globalsq, &_cut_innersq, &_cut_inner, &this->nbor->dev_nbor, &this->_nbor_data->begin(), &this->ans->force, &this->ans->engv, &eflag, &vflag, @@ -154,6 +143,7 @@ void ZBLT::loop(const bool _eflag, const bool _vflag) { &ainum, &nbor_pitch, &this->_threads_per_atom); } this->time_pair.stop(); + return GX; } template class ZBL; diff --git a/lib/gpu/lal_zbl.cu b/lib/gpu/lal_zbl.cu index 2539c0ddd7..09e1b4f6bb 100644 --- a/lib/gpu/lal_zbl.cu +++ b/lib/gpu/lal_zbl.cu @@ -95,17 +95,20 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, int tid, ii, offset; atom_info(t_per_atom,ii,tid,offset); - acctyp energy=(acctyp)0; + int n_stride; + local_allocate_store_pair(); + acctyp4 f; f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0; - acctyp virial[6]; - for (int i=0; i<6; i++) - virial[i]=(acctyp)0; + acctyp energy, virial[6]; + if (EVFLAG) { + energy=(acctyp)0; + for (int i=0; i<6; i++) virial[i]=(acctyp)0; + } if (ii0) { + if (EVFLAG && eflag) { numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z); e += coeff3[mtype].z; @@ -151,7 +154,7 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, } energy+=e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -162,9 +165,9 @@ __kernel void k_zbl(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, @@ -186,25 +189,28 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; __local numtyp4 coeff3[MAX_SHARED_TYPES*MAX_SHARED_TYPES]; + int n_stride; + local_allocate_store_pair(); + if (tid0) { + if (EVFLAG && eflag) { numtyp e=e_zbl(r, coeff2[mtype].x, coeff2[mtype].y, coeff2[mtype].z, coeff2[mtype].w, coeff1[mtype].z); e += coeff3[mtype].z; @@ -251,7 +257,7 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, } energy+=e; } - if (vflag>0) { + if (EVFLAG && vflag) { virial[0] += delx*delx*force; virial[1] += dely*dely*force; virial[2] += delz*delz*force; @@ -262,8 +268,8 @@ __kernel void k_zbl_fast(const __global numtyp4 *restrict x_, } } // for nbor - store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, - ans,engv); } // if ii + store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, + ans,engv); } diff --git a/lib/gpu/lal_zbl.h b/lib/gpu/lal_zbl.h index e205d326c6..af4f1b2eac 100644 --- a/lib/gpu/lal_zbl.h +++ b/lib/gpu/lal_zbl.h @@ -76,7 +76,7 @@ class ZBL : public BaseAtomic { private: bool _allocated; - void loop(const bool _eflag, const bool _vflag); + int loop(const int eflag, const int vflag); }; } diff --git a/lib/gpu/lal_zbl_ext.cpp b/lib/gpu/lal_zbl_ext.cpp index f15e814a50..ee7794af2d 100644 --- a/lib/gpu/lal_zbl_ext.cpp +++ b/lib/gpu/lal_zbl_ext.cpp @@ -58,7 +58,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, 
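Each loop() now returns its grid size, which compute() forwards to copy_answers() as red_blocks so the host side knows how many per-block partial energy/virial sums may still need reducing. The grid arithmetic, common to all the pair styles touched here, in a short sketch:

```python
import math

def grid_size(inum, block_size, t_per_atom):
    """Blocks needed when each atom gets t_per_atom threads and each
    block holds block_size threads (sketch of the loop() computation)."""
    atoms_per_block = block_size // t_per_atom
    return math.ceil(inum / atoms_per_block)

# 10000 atoms, 128-thread blocks, 4 threads per atom -> 313 blocks,
# returned as red_blocks for ans->copy_answers()
print(grid_size(10000, 128, 4))
```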
host_sw3, host_sw4, host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze, cut_globalsq, cut_innersq, cut_inner, - inum, nall, 300, maxspecial, cell_size, gpu_split, screen); + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); ZBLMF.device->world_barrier(); if (message) @@ -77,7 +77,7 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, init_ok=ZBLMF.init(ntypes, cutsq, host_sw1, host_sw2, host_sw3, host_sw4, host_sw5, host_d1a, host_d2a, host_d3a, host_d4a, host_zze, cut_globalsq, cut_innersq, cut_inner, - inum, nall, 300, maxspecial, cell_size, gpu_split, screen); + inum, nall, max_nbors, maxspecial, cell_size, gpu_split, screen); ZBLMF.device->gpu_barrier(); if (message) diff --git a/python/lammps/core.py b/python/lammps/core.py index d1bc7bc138..e13bf9585b 100644 --- a/python/lammps/core.py +++ b/python/lammps/core.py @@ -286,15 +286,16 @@ class lammps(object): self.lib.lammps_fix_external_set_energy_global = [c_void_p, c_char_p, c_double] self.lib.lammps_fix_external_set_virial_global = [c_void_p, c_char_p, POINTER(c_double)] - # detect if Python is using version of mpi4py that can pass a communicator - + # detect if Python is using a version of mpi4py that can pass communicators + # only needed if LAMMPS has been compiled with MPI support. self.has_mpi4py = False - try: - from mpi4py import __version__ as mpi4py_version - # tested to work with mpi4py versions 2 and 3 - self.has_mpi4py = mpi4py_version.split('.')[0] in ['2','3'] - except: - pass + if self.has_mpi_support: + try: + from mpi4py import __version__ as mpi4py_version + # tested to work with mpi4py versions 2 and 3 + self.has_mpi4py = mpi4py_version.split('.')[0] in ['2','3'] + except: + pass # if no ptr provided, create an instance of LAMMPS # don't know how to pass an MPI communicator from PyPar @@ -307,23 +308,27 @@ class lammps(object): if not ptr: - # with mpi4py v2, can pass MPI communicator to LAMMPS + # with mpi4py v2+, we can pass MPI communicators to LAMMPS # need to adjust for type of MPI communicator object # allow for int (like MPICH) or void* (like OpenMPI) - if self.has_mpi4py and self.has_mpi_support: + if self.has_mpi_support and self.has_mpi4py: from mpi4py import MPI self.MPI = MPI if comm: - if not self.has_mpi4py: - raise Exception('Python mpi4py version is not 2 or 3') if not self.has_mpi_support: raise Exception('LAMMPS not compiled with real MPI library') + if not self.has_mpi4py: + raise Exception('Python mpi4py version is not 2 or 3') if self.MPI._sizeof(self.MPI.Comm) == sizeof(c_int): MPI_Comm = c_int else: MPI_Comm = c_void_p + # Detect whether LAMMPS and mpi4py definitely use different MPI libs + if sizeof(MPI_Comm) != self.lib.lammps_config_has_mpi_support(): + raise Exception('Inconsistent MPI library in LAMMPS and mpi4py') + narg = 0 cargs = None if cmdargs: @@ -1612,7 +1617,7 @@ class lammps(object): def get_neighlist(self, idx): """Returns an instance of :class:`NeighList` which wraps access to the neighbor list with the given index - See :py:meth:`lammps.numpy.get_neighlist() ` if you want to use + See :py:meth:`lammps.numpy.get_neighlist() ` if you want to use NumPy arrays instead of ``c_int`` pointers. 
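With the reordering above, mpi4py is only probed when the LAMMPS library itself reports MPI support, and a mismatch between the communicator handle sizes of the two MPI stacks is rejected up front. A hedged usage sketch (assumes an MPI-enabled build of the library plus mpi4py 2 or 3):

```python
from mpi4py import MPI
from lammps import lammps

# split the world and hand LAMMPS only this rank's sub-communicator;
# the constructor raises if the library lacks MPI support or if the
# mpi4py and LAMMPS MPI libraries are inconsistent
comm = MPI.COMM_WORLD.Split(color=MPI.COMM_WORLD.rank % 2, key=0)
lmp = lammps(comm=comm)
lmp.command("units lj")
lmp.close()
```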
:param idx: index of neighbor list diff --git a/python/lammps/pylammps.py b/python/lammps/pylammps.py index 47a2a5a6ab..4bba9f5e94 100644 --- a/python/lammps/pylammps.py +++ b/python/lammps/pylammps.py @@ -400,6 +400,7 @@ class PyLammps(object): self.lmp = lammps(name=name,cmdargs=cmdargs,ptr=None,comm=comm) print("LAMMPS output is captured by PyLammps wrapper") self._cmd_history = [] + self._enable_cmd_history = False self.runs = [] def __del__(self): @@ -434,6 +435,24 @@ class PyLammps(object): """ self.lmp.file(file) + @property + def enable_cmd_history(self): + """ + :getter: Return whether command history is saved + :setter: Set if command history should be saved + :type: bool + """ + return self._enable_cmd_history + + @enable_cmd_history.setter + def enable_cmd_history(self, value): + """ + :getter: Return whether command history is saved + :setter: Set if command history should be saved + :type: bool + """ + self._enable_cmd_history = (value == True) + def write_script(self, filepath): """ Write LAMMPS script file containing all commands executed up until now @@ -445,18 +464,28 @@ class PyLammps(object): for cmd in self._cmd_history: print(cmd, file=f) + def clear_cmd_history(self): + """ + Clear LAMMPS command history up to this point + """ + self._cmd_history = [] + def command(self, cmd): """ Execute LAMMPS command - All commands executed will be stored in a command history which can be - written to a file using :py:meth:`PyLammps.write_script()` + If :py:attr:`PyLammps.enable_cmd_history` is set to ``True``, commands executed + will be recorded. The entire command history can be written to a file using + :py:meth:`PyLammps.write_script()`. To clear the command history, use + :py:meth:`PyLammps.clear_cmd_history()`. :param cmd: command string that should be executed :type: cmd: string """ self.lmp.command(cmd) - self._cmd_history.append(cmd) + + if self.enable_cmd_history: + self._cmd_history.append(cmd) def run(self, *args, **kwargs): """ diff --git a/src/GPU/Install.sh b/src/GPU/Install.sh index 1fefb01d42..49b7eeda57 100755 --- a/src/GPU/Install.sh +++ b/src/GPU/Install.sh @@ -30,6 +30,16 @@ action () { action fix_gpu.cpp action fix_gpu.h +action fix_nve_gpu.h +action fix_nve_gpu.cpp +action fix_nh_gpu.h +action fix_nh_gpu.cpp +action fix_nvt_gpu.h +action fix_nvt_gpu.cpp +action fix_npt_gpu.h +action fix_npt_gpu.cpp +action fix_nve_asphere_gpu.h fix_nve_asphere.h +action fix_nve_asphere_gpu.cpp fix_nve_asphere.cpp action gpu_extra.h action pair_beck_gpu.cpp action pair_beck_gpu.h @@ -83,6 +93,8 @@ action pair_lj96_cut_gpu.cpp action pair_lj96_cut_gpu.h action pair_lj_charmm_coul_long_gpu.cpp pair_lj_charmm_coul_long.cpp action pair_lj_charmm_coul_long_gpu.h pair_lj_charmm_coul_long.cpp +action pair_lj_charmm_coul_charmm_gpu.cpp pair_lj_charmm_coul_charmm.cpp +action pair_lj_charmm_coul_charmm_gpu.h pair_lj_charmm_coul_charmm.cpp action pair_lj_class2_coul_long_gpu.cpp pair_lj_class2_coul_long.cpp action pair_lj_class2_coul_long_gpu.h pair_lj_class2_coul_long.cpp action pair_lj_class2_gpu.cpp pair_lj_class2.cpp diff --git a/src/GPU/fix_gpu.cpp b/src/GPU/fix_gpu.cpp index 8f88dfd61d..8297c338a5 100644 --- a/src/GPU/fix_gpu.cpp +++ b/src/GPU/fix_gpu.cpp @@ -15,6 +15,7 @@ #include #include "atom.h" +#include "comm.h" #include "force.h" #include "pair.h" #include "pair_hybrid.h" @@ -31,21 +32,28 @@ #include "citeme.h" #include "error.h" +#if (LAL_USE_OMP == 1) +#include +#endif using namespace LAMMPS_NS; using namespace FixConst; enum{GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH}; -extern 
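Command recording in PyLammps is now opt-in: enable_cmd_history defaults to False, command() only appends to the history when it is enabled, and clear_cmd_history() resets the buffer. A short usage sketch:

```python
from lammps import PyLammps

L = PyLammps()
L.enable_cmd_history = True   # opt in; recording is now off by default
L.units("lj")                 # attribute-style calls go through command()
L.atom_style("atomic")
L.write_script("in.replay")   # dumps the two recorded commands
L.clear_cmd_history()         # empty the history again
```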
int lmp_init_device(MPI_Comm world, MPI_Comm replica, - const int first_gpu, const int last_gpu, - const int gpu_mode, const double particle_split, - const int nthreads, const int t_per_atom, - const double cell_size, char *opencl_flags, +extern int lmp_init_device(MPI_Comm world, MPI_Comm replica, const int ngpu, + const int first_gpu_id, const int gpu_mode, + const double particle_split, const int t_per_atom, + const double cell_size, char *opencl_args, + const int ocl_platform, char *device_type_flags, const int block_pair); extern void lmp_clear_device(); extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); + double **vatom, double *virial, double &ecoul, + int &err_flag); +extern double lmp_gpu_update_bin_size(const double subx, const double suby, + const double subz, const int nlocal, + const double cut); static const char cite_gpu_package[] = "GPU package (short-range, long-range and three-body potentials):\n\n" @@ -105,22 +113,27 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : if (narg < 4) error->all(FLERR,"Illegal package gpu command"); + // If ngpu is 0, autoset ngpu to the number of devices per node matching + // best device int ngpu = atoi(arg[3]); - if (ngpu <= 0) error->all(FLERR,"Illegal package gpu command"); - int first_gpu = 0; - int last_gpu = ngpu-1; + if (ngpu < 0) error->all(FLERR,"Illegal package gpu command"); + + // Negative value indicate GPU package should find the best device ID + int first_gpu_id = -1; // options _gpu_mode = GPU_NEIGH; _particle_split = 1.0; - int nthreads = 1; + int nthreads = 0; int newtonflag = 0; int threads_per_atom = -1; double binsize = 0.0; - char *opencl_flags = nullptr; + char *opencl_args = nullptr; int block_pair = -1; int pair_only_flag = 0; + int ocl_platform = -1; + char *device_type_flags = nullptr; int iarg = 4; while (iarg < narg) { @@ -149,22 +162,25 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : error->all(FLERR,"Illegal package GPU command"); iarg += 2; } else if (strcmp(arg[iarg],"gpuID") == 0) { - if (iarg+3 > narg) error->all(FLERR,"Illegal package gpu command"); - first_gpu = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - last_gpu = utils::inumeric(FLERR,arg[iarg+2],false,lmp); - iarg += 3; + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + first_gpu_id = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; } else if (strcmp(arg[iarg],"tpa") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); threads_per_atom = utils::inumeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; - } else if (strcmp(arg[iarg],"nthreads") == 0) { + } else if (strcmp(arg[iarg],"omp") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); nthreads = utils::inumeric(FLERR,arg[iarg+1],false,lmp); - if (nthreads < 1) error->all(FLERR,"Illegal fix GPU command"); + if (nthreads < 0) error->all(FLERR,"Illegal fix GPU command"); iarg += 2; - } else if (strcmp(arg[iarg],"device") == 0) { + } else if (strcmp(arg[iarg],"platform") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); - opencl_flags = arg[iarg+1]; + ocl_platform = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + iarg += 2; + } else if (strcmp(arg[iarg],"device_type") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + device_type_flags = arg[iarg+1]; iarg += 2; } else if (strcmp(arg[iarg],"blocksize") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); @@ -176,12 +192,21 
@@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : else if (strcmp(arg[iarg+1],"on") == 0) pair_only_flag = 1; else error->all(FLERR,"Illegal package gpu command"); iarg += 2; + } else if (strcmp(arg[iarg],"ocl_args") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package gpu command"); + opencl_args = arg[iarg+1]; + iarg += 2; } else error->all(FLERR,"Illegal package gpu command"); } - #ifndef _OPENMP + #if (LAL_USE_OMP == 0) if (nthreads > 1) error->all(FLERR,"No OpenMP support compiled in"); + #else + if (nthreads > 0) { + omp_set_num_threads(nthreads); + comm->nthreads = nthreads; + } #endif // set newton pair flag @@ -207,10 +232,11 @@ FixGPU::FixGPU(LAMMPS *lmp, int narg, char **arg) : // change binsize default (0.0) to -1.0 used by GPU lib if (binsize == 0.0) binsize = -1.0; - int gpu_flag = lmp_init_device(universe->uworld, world, first_gpu, last_gpu, - _gpu_mode, _particle_split, nthreads, - threads_per_atom, binsize, opencl_flags, - block_pair); + _binsize = binsize; + int gpu_flag = lmp_init_device(universe->uworld, world, ngpu, first_gpu_id, + _gpu_mode, _particle_split, threads_per_atom, + binsize, opencl_args, ocl_platform, + device_type_flags, block_pair); GPU_EXTRA::check_flag(gpu_flag,error,world); } @@ -296,9 +322,15 @@ void FixGPU::post_force(int /* vflag */) timer->stamp(); double lvirial[6]; for (int i = 0; i < 6; i++) lvirial[i] = 0.0; + int err_flag; double my_eng = lmp_gpu_forces(atom->f, atom->torque, force->pair->eatom, force->pair->vatom, lvirial, - force->pair->eng_coul); + force->pair->eng_coul, err_flag); + if (err_flag) { + if (err_flag==1) + error->one(FLERR, + "Too many neighbors on GPU. Use neigh_modify one to increase limit."); + } force->pair->eng_vdwl += my_eng; force->pair->virial[0] += lvirial[0]; @@ -335,3 +367,12 @@ double FixGPU::memory_usage() return bytes; } +double FixGPU::binsize(const double subx, const double suby, + const double subz, const int nlocal, + const double cut) { + if (_binsize > 0.0) return _binsize; + else if (_gpu_mode == GPU_FORCE || comm->cutghostuser) + return cut * 0.5; + else + return lmp_gpu_update_bin_size(subx, suby, subz, nlocal, cut); +} diff --git a/src/GPU/fix_gpu.h b/src/GPU/fix_gpu.h index ba0b4c83cb..29a0907915 100644 --- a/src/GPU/fix_gpu.h +++ b/src/GPU/fix_gpu.h @@ -37,10 +37,14 @@ class FixGPU : public Fix { void post_force_respa(int, int, int); double memory_usage(); + double binsize(const double subx, const double suby, + const double subz, const int nlocal, const double cut); + private: int _gpu_mode; int _nlevels_respa; double _particle_split; + double _binsize; }; } @@ -78,4 +82,11 @@ E: Cannot use neigh_modify exclude with GPU neighbor builds This is a current limitation of the GPU implementation in LAMMPS. +E: Too many neighbors on GPU. Use neigh_modify one to increase limit. + +The expected maximum number of neighbors is determined in the GPU package +automatically. This error means the actual number of neighbors is exceeding +the expected value. Use neigh_modify one command to increase GPU allocations +(e.g. doubling this value doubles the GPU allocation). 
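Together with the parsing changes above, a package command might now look like the sketch below (driven through the Python wrapper; the option values are illustrative and require a GPU-enabled build):

```python
from lammps import lammps

lmp = lammps()
# ngpu 0 lets the library auto-select all matching devices per node;
# "omp 4" replaces the old "nthreads", "platform" picks the OpenCL platform
lmp.command("package gpu 0 omp 4 platform 0")
# if a run later aborts with "Too many neighbors on GPU", enlarge the
# per-atom neighbor allocation (page must stay >= 10x one)
lmp.command("neigh_modify one 4000 page 40000")
lmp.close()
```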
+ */ diff --git a/src/GPU/fix_nh_gpu.cpp b/src/GPU/fix_nh_gpu.cpp new file mode 100644 index 0000000000..8b57289a50 --- /dev/null +++ b/src/GPU/fix_nh_gpu.cpp @@ -0,0 +1,552 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nh_gpu.h" + +#include "atom.h" +#include "domain.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "modify.h" +#include "neighbor.h" +#include "update.h" + +#include +#include + +using namespace LAMMPS_NS; +using namespace FixConst; + +#define TILTMAX 1.5 + +enum{NOBIAS,BIAS}; +enum{ISO,ANISO,TRICLINIC}; + +typedef struct { double x,y,z; } dbl3_t; + +/* ---------------------------------------------------------------------- + NVT,NPH,NPT integrators for improved Nose-Hoover equations of motion + ---------------------------------------------------------------------- */ + +FixNHGPU::FixNHGPU(LAMMPS *lmp, int narg, char **arg) : + FixNH(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal3 = 0; + _nlocal_max = 0; +} + +/* ---------------------------------------------------------------------- */ + +FixNHGPU::~FixNHGPU() +{ +} + +/* ---------------------------------------------------------------------- */ + +void FixNHGPU::setup(int vflag) +{ + FixNH::setup(vflag); + if (strstr(update->integrate_style,"respa")) + _respa_on = 1; + else + _respa_on = 0; + reset_dt(); +} + +/* ---------------------------------------------------------------------- + change box size + remap all atoms or dilate group atoms depending on allremap flag + if rigid bodies exist, scale rigid body centers-of-mass +------------------------------------------------------------------------- */ + +void FixNHGPU::remap() +{ + if (_respa_on) { FixNH::remap(); return; } + + double oldlo,oldhi; + double expfac; + + dbl3_t * _noalias const x = (dbl3_t *) atom->x[0]; + int *mask = atom->mask; + int nlocal = atom->nlocal; + double *h = domain->h; + + // omega is not used, except for book-keeping + + for (int i = 0; i < 6; i++) omega[i] += dto*omega_dot[i]; + + // convert pertinent atoms and rigid bodies to lamda coords + const double hi0 = domain->h_inv[0]; + const double hi1 = domain->h_inv[1]; + const double hi2 = domain->h_inv[2]; + const double hi3 = domain->h_inv[3]; + const double hi4 = domain->h_inv[4]; + const double hi5 = domain->h_inv[5]; + const double b0 = domain->boxlo[0]; + const double b1 = domain->boxlo[1]; + const double b2 = domain->boxlo[2]; + + if (allremap) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + 
hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & dilate_group_bit) { + const double d0 = x[i].x - b0; + const double d1 = x[i].y - b1; + const double d2 = x[i].z - b2; + x[i].x = hi0*d0 + hi5*d1 + hi4*d2; + x[i].y = hi1*d1 + hi3*d2; + x[i].z = hi2*d2; + } + } + } + + if (nrigid) + for (int i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(0); + + // reset global and local box to new size/shape + + // this operation corresponds to applying the + // translate and scale operations + // corresponding to the solution of the following ODE: + // + // h_dot = omega_dot * h + // + // where h_dot, omega_dot and h are all upper-triangular + // 3x3 tensors. In Voigt notation, the elements of the + // RHS product tensor are: + // h_dot = [0*0, 1*1, 2*2, 1*3+3*2, 0*4+5*3+4*2, 0*5+5*1] + // + // Ordering of operations preserves time symmetry. + + double dto2 = dto/2.0; + double dto4 = dto/4.0; + double dto8 = dto/8.0; + + // off-diagonal components, first half + + if (pstyle == TRICLINIC) { + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + if (p_flag[3]) { + expfac = exp(dto4*omega_dot[1]); + h[3] *= expfac; + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= expfac; + } + + if (p_flag[5]) { + expfac = exp(dto4*omega_dot[0]); + h[5] *= expfac; + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= expfac; + } + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + } + + // scale diagonal components + // scale tilt factors with cell, if set + + if (p_flag[0]) { + oldlo = domain->boxlo[0]; + oldhi = domain->boxhi[0]; + expfac = exp(dto*omega_dot[0]); + domain->boxlo[0] = (oldlo-fixedpoint[0])*expfac + fixedpoint[0]; + domain->boxhi[0] = (oldhi-fixedpoint[0])*expfac + fixedpoint[0]; + } + + if (p_flag[1]) { + oldlo = domain->boxlo[1]; + oldhi = domain->boxhi[1]; + expfac = exp(dto*omega_dot[1]); + domain->boxlo[1] = (oldlo-fixedpoint[1])*expfac + fixedpoint[1]; + domain->boxhi[1] = (oldhi-fixedpoint[1])*expfac + fixedpoint[1]; + if (scalexy) h[5] *= expfac; + } + + if (p_flag[2]) { + oldlo = domain->boxlo[2]; + oldhi = domain->boxhi[2]; + expfac = exp(dto*omega_dot[2]); + domain->boxlo[2] = (oldlo-fixedpoint[2])*expfac + fixedpoint[2]; + domain->boxhi[2] = (oldhi-fixedpoint[2])*expfac + fixedpoint[2]; + if (scalexz) h[4] *= expfac; + if (scaleyz) h[3] *= expfac; + } + + // off-diagonal components, second half + + if (pstyle == TRICLINIC) { + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + if (p_flag[3]) { + expfac = exp(dto4*omega_dot[1]); + h[3] *= expfac; + h[3] += dto2*(omega_dot[3]*h[2]); + h[3] *= expfac; + } + + if (p_flag[5]) { + expfac = exp(dto4*omega_dot[0]); + h[5] *= expfac; + h[5] += dto2*(omega_dot[5]*h[1]); + h[5] *= expfac; + } + + if (p_flag[4]) { + expfac = exp(dto8*omega_dot[0]); + h[4] *= expfac; + h[4] += dto4*(omega_dot[5]*h[3]+omega_dot[4]*h[2]); + h[4] *= expfac; + } + + } + + domain->yz = h[3]; + domain->xz = h[4]; + domain->xy = h[5]; + + // tilt factor to cell length ratio can not exceed TILTMAX in one step + + if (domain->yz < -TILTMAX*domain->yprd || + 
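The Voigt shorthand in the comment above expands to the following cell-update ODE, which remap() integrates over one barostat substep dto (a restatement of the comment, not new math):

```latex
% \dot{h} = \dot{\omega}\, h with h and \dot{\omega} upper triangular,
% Voigt order (0:xx, 1:yy, 2:zz, 3:yz, 4:xz, 5:xy):
\begin{align*}
  \dot h_0 &= \dot\omega_0 h_0,  &  \dot h_3 &= \dot\omega_1 h_3 + \dot\omega_3 h_2, \\
  \dot h_1 &= \dot\omega_1 h_1,  &  \dot h_4 &= \dot\omega_0 h_4 + \dot\omega_5 h_3 + \dot\omega_4 h_2, \\
  \dot h_2 &= \dot\omega_2 h_2,  &  \dot h_5 &= \dot\omega_0 h_5 + \dot\omega_5 h_1.
\end{align*}
% The diagonal equations have the exact solution
%   h_i(t + dto) = e^{\dot\omega_i\, dto}\, h_i(t),
% which is the expfac scaling in the code; the off-diagonal terms are
% split over dto/2, dto/4 and dto/8 so the update stays time reversible.
```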
domain->yz > TILTMAX*domain->yprd || + domain->xz < -TILTMAX*domain->xprd || + domain->xz > TILTMAX*domain->xprd || + domain->xy < -TILTMAX*domain->xprd || + domain->xy > TILTMAX*domain->xprd) + error->all(FLERR,"Fix npt/nph has tilted box too far in one step - " + "periodic cell is too far from equilibrium state"); + + domain->set_global_box(); + domain->set_local_box(); + + // convert pertinent atoms and rigid bodies back to box coords + const double h0 = domain->h[0]; + const double h1 = domain->h[1]; + const double h2 = domain->h[2]; + const double h3 = domain->h[3]; + const double h4 = domain->h[4]; + const double h5 = domain->h[5]; + const double nb0 = domain->boxlo[0]; + const double nb1 = domain->boxlo[1]; + const double nb2 = domain->boxlo[2]; + + if (allremap) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & dilate_group_bit) { + x[i].x = h0*x[i].x + h5*x[i].y + h4*x[i].z + nb0; + x[i].y = h1*x[i].y + h3*x[i].z + nb1; + x[i].z = h2*x[i].z + nb2; + } + } + } + + if (nrigid) + for (int i = 0; i < nrigid; i++) + modify->fix[rfix[i]]->deform(1); +} + +/* ---------------------------------------------------------------------- + 2nd half of Verlet update +------------------------------------------------------------------------- */ + +void FixNHGPU::final_integrate() { + if (neighbor->ago == 0 && _respa_on == 0) reset_dt(); + FixNH::final_integrate(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNHGPU::reset_dt() +{ + if (_respa_on) { FixNH::reset_dt(); return; } + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + dthalf = 0.5 * update->dt; + dt4 = 0.25 * update->dt; + dt8 = 0.125 * update->dt; + dto = dthalf; + + if (pstat_flag) + pdrag_factor = 1.0 - (update->dt * p_freq_max * drag / nc_pchain); + + if (tstat_flag) + tdrag_factor = 1.0 - (update->dt * t_freq * drag / nc_tchain); + + const int * const mask = atom->mask; + const int nlocal = (igroup == atom->firstgroup) ? 
atom->nfirst : + atom->nlocal; + + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nh_gpu:dtfm"); + } + + _nlocal3 = nlocal * 3; + + if (igroup == 0) { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = 0; + for (int i = 0; i < nlocal; i++) { + const double dtfir = dtf / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = 0; + for (int i = 0; i < nlocal; i++) { + const double dtfim = dtf / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } + } + } else { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = 0; + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + const double dtfir = dtf / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = 0; + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) { + const double dtfim = dtf / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step barostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nh_v_press() +{ + if (pstyle == TRICLINIC || which == BIAS || _respa_on) { + FixNH::nh_v_press(); + return; + } + + dbl3_t * _noalias const v = (dbl3_t *)atom->v[0]; + int *mask = atom->mask; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + double f0 = exp(-dt4*(omega_dot[0]+mtk_term2)); + double f1 = exp(-dt4*(omega_dot[1]+mtk_term2)); + double f2 = exp(-dt4*(omega_dot[2]+mtk_term2)); + f0 *= f0; + f1 *= f1; + f2 *= f2; + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; + } + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < nlocal; i++) { + if (mask[i] & groupbit) { + v[i].x *= f0; + v[i].y *= f1; + v[i].z *= f2; + } + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step update of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nve_v() +{ + if (_respa_on) { FixNH::nve_v(); return; } + + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + v[i] += _dtfm[i] * f[i]; +} + +/* ---------------------------------------------------------------------- + perform full-step update of positions +-----------------------------------------------------------------------*/ + +void FixNHGPU::nve_x() +{ + if (_respa_on) { FixNH::nve_x(); 
return; } + + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + + // x update by full step only for atoms in group + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + x[i] += dtv * v[i]; + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) { + if (_dtfm[i] != 0.0) + x[i] += dtv * v[i]; + } + } +} + +/* ---------------------------------------------------------------------- + perform half-step thermostat scaling of velocities +-----------------------------------------------------------------------*/ + +void FixNHGPU::nh_v_temp() +{ + if (which == BIAS || _respa_on) { + FixNH::nh_v_temp(); + return; + } + + double * _noalias const v = atom->v[0]; + + if (igroup == 0) { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) + v[i] *= factor_eta; + } else { + #if (LAL_USE_OMP == 1) && (LAL_USE_OMP_SIMD == 1) + #pragma omp parallel for simd schedule(static) + #elif (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = 0; i < _nlocal3; i++) { + if (_dtfm[i] != 0.0) + v[i] *= factor_eta; + } + } +} + +double FixNHGPU::memory_usage() +{ + return FixNH::memory_usage() + _nlocal_max * 3 * sizeof(double); +} diff --git a/src/GPU/fix_nh_gpu.h b/src/GPU/fix_nh_gpu.h new file mode 100644 index 0000000000..edd210e813 --- /dev/null +++ b/src/GPU/fix_nh_gpu.h @@ -0,0 +1,164 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifndef LMP_FIX_NH_GPU_H +#define LMP_FIX_NH_GPU_H + +#include "fix_nh.h" + +namespace LAMMPS_NS { + +class FixNHGPU : public FixNH { + public: + FixNHGPU(class LAMMPS *, int, char **); + virtual ~FixNHGPU(); + virtual void setup(int vflag); + void reset_dt(); + virtual void final_integrate(); + virtual double memory_usage(); + + protected: + double *_dtfm; + int _nlocal3, _nlocal_max, _respa_on; + + virtual void remap(); + virtual void nve_x(); + virtual void nve_v(); + virtual void nh_v_press(); + virtual void nh_v_temp(); +}; + +} + +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +E: Target temperature for fix nvt/npt/nph cannot be 0.0 + +Self-explanatory. 
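reset_dt() above replaces FixNH's per-step mass and group tests with one flat 3*nlocal array, so nve_v() and nve_x() reduce to branch-free SIMD loops; entries for atoms outside the group are zero, which doubles as the group mask in nve_x(). A NumPy sketch of the same precomputation (names hypothetical):

```python
import numpy as np

def build_dtfm(dtf, mass, in_group):
    """Flattened per-component dtf/m, zeroed outside the group
    (sketch of FixNHGPU::reset_dt)."""
    per_atom = np.where(in_group, dtf / mass, 0.0)
    return np.repeat(per_atom, 3)        # x, y, z share one factor

def nve_v(v3, f3, dtfm):                 # FixNHGPU::nve_v
    v3 += dtfm * f3

def nve_x(x3, v3, dtv, dtfm):            # FixNHGPU::nve_x, group via dtfm != 0
    moving = dtfm != 0.0
    x3[moving] += dtv * v3[moving]

dtfm = build_dtfm(0.005, np.array([1.0, 2.0]), np.array([True, False]))
assert dtfm.tolist() == [0.005, 0.005, 0.005, 0.0, 0.0, 0.0]
```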
+ +E: Invalid fix nvt/npt/nph command for a 2d simulation + +Cannot control z dimension in a 2d model. + +E: Fix nvt/npt/nph dilate group ID does not exist + +Self-explanatory. + +E: Invalid fix nvt/npt/nph command pressure settings + +If multiple dimensions are coupled, those dimensions must be +specified. + +E: Cannot use fix nvt/npt/nph on a non-periodic dimension + +When specifying a diagonal pressure component, the dimension must be +periodic. + +E: Cannot use fix nvt/npt/nph on a 2nd non-periodic dimension + +When specifying an off-diagonal pressure component, the 2nd of the two +dimensions must be periodic. E.g. if the xy component is specified, +then the y dimension must be periodic. + +E: Cannot use fix nvt/npt/nph with yz scaling when z is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with xz scaling when z is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with xy scaling when y is non-periodic dimension + +The 2nd dimension in the barostatted tilt factor must be periodic. + +E: Cannot use fix nvt/npt/nph with both yz dynamics and yz scaling + +Self-explanatory. + +E: Cannot use fix nvt/npt/nph with both xz dynamics and xz scaling + +Self-explanatory. + +E: Cannot use fix nvt/npt/nph with both xy dynamics and xy scaling + +Self-explanatory. + +E: Can not specify Pxy/Pxz/Pyz in fix nvt/npt/nph with non-triclinic box + +Only triclinic boxes can be used with off-diagonal pressure components. +See the region prism command for details. + +E: Invalid fix nvt/npt/nph pressure settings + +Settings for coupled dimensions must be the same. + +E: Fix nvt/npt/nph damping parameters must be > 0.0 + +Self-explanatory. + +E: Cannot use fix npt and fix deform on same component of stress tensor + +This would be changing the same box dimension twice. + +E: Temperature ID for fix nvt/npt does not exist + +Self-explanatory. + +E: Pressure ID for fix npt/nph does not exist + +Self-explanatory. + +E: Fix npt/nph has tilted box too far in one step - periodic cell is too far from equilibrium state + +Self-explanatory. The change in the box tilt is too extreme +on a short timescale. + +E: Could not find fix_modify temperature ID + +The compute ID for computing temperature does not exist. + +E: Fix_modify temperature ID does not compute temperature + +The compute ID assigned to the fix must compute temperature. + +W: Temperature for fix modify is not for group all + +The temperature compute is being used with a pressure calculation +which does operate on group all, so this may be inconsistent. + +E: Pressure ID for fix modify does not exist + +Self-explanatory. + +E: Could not find fix_modify pressure ID + +The compute ID for computing pressure does not exist. + +E: Fix_modify pressure ID does not compute pressure + +The compute ID assigned to the fix must compute pressure. + +*/ diff --git a/src/GPU/fix_npt_gpu.cpp b/src/GPU/fix_npt_gpu.cpp new file mode 100644 index 0000000000..2ba0be29e0 --- /dev/null +++ b/src/GPU/fix_npt_gpu.cpp @@ -0,0 +1,68 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. 
Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <cstring>
+#include "fix_npt_gpu.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+FixNPTGPU::FixNPTGPU(LAMMPS *lmp, int narg, char **arg) :
+  FixNHGPU(lmp, narg, arg)
+{
+  if (!tstat_flag)
+    error->all(FLERR,"Temperature control must be used with fix npt/gpu");
+  if (!pstat_flag)
+    error->all(FLERR,"Pressure control must be used with fix npt/gpu");
+
+  // create a new compute temp style
+  // id = fix-ID + temp
+  // compute group = all since pressure is always global (group all)
+  // and thus its KE/temperature contribution should use group all
+
+  int n = strlen(id) + 6;
+  id_temp = new char[n];
+  strcpy(id_temp,id);
+  strcat(id_temp,"_temp");
+
+  char **newarg = new char*[3];
+  newarg[0] = id_temp;
+  newarg[1] = (char *) "all";
+  newarg[2] = (char *) "temp";
+
+  modify->add_compute(3,newarg);
+  delete [] newarg;
+  tcomputeflag = 1;
+
+  // create a new compute pressure style
+  // id = fix-ID + press, compute group = all
+  // pass id_temp as 4th arg to pressure constructor
+
+  n = strlen(id) + 7;
+  id_press = new char[n];
+  strcpy(id_press,id);
+  strcat(id_press,"_press");
+
+  newarg = new char*[4];
+  newarg[0] = id_press;
+  newarg[1] = (char *) "all";
+  newarg[2] = (char *) "pressure";
+  newarg[3] = id_temp;
+  modify->add_compute(4,newarg);
+  delete [] newarg;
+  pcomputeflag = 1;
+}
diff --git a/src/GPU/fix_npt_gpu.h b/src/GPU/fix_npt_gpu.h
new file mode 100644
index 0000000000..2684935fe5
--- /dev/null
+++ b/src/GPU/fix_npt_gpu.h
@@ -0,0 +1,52 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(npt/gpu,FixNPTGPU)
+
+#else
+
+#ifndef LMP_FIX_NPT_GPU_H
+#define LMP_FIX_NPT_GPU_H
+
+#include "fix_nh_gpu.h"
+
+namespace LAMMPS_NS {
+
+class FixNPTGPU : public FixNHGPU {
+ public:
+  FixNPTGPU(class LAMMPS *, int, char **);
+  ~FixNPTGPU() {}
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Temperature control must be used with fix npt/gpu
+
+Self-explanatory.
+
+E: Pressure control must be used with fix npt/gpu
+
+Self-explanatory.
+ +*/ diff --git a/src/GPU/fix_nve_asphere_gpu.cpp b/src/GPU/fix_nve_asphere_gpu.cpp new file mode 100644 index 0000000000..bf6cfda67d --- /dev/null +++ b/src/GPU/fix_nve_asphere_gpu.cpp @@ -0,0 +1,440 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nve_asphere_gpu.h" + +#include "atom.h" +#include "atom_vec_ellipsoid.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "neighbor.h" +#include "update.h" +#include +#if (LAL_USE_OMP == 1) +#include +#endif + +using namespace LAMMPS_NS; +using namespace FixConst; + +#define INERTIA 0.2 // moment of inertia prefactor for ellipsoid + +#define ME_qnormalize(q) \ +{ \ + double norm = 1.0 / \ + sqrt(q##_w*q##_w + q##_i*q##_i + q##_j*q##_j + q##_k*q##_k); \ + q##_w *= norm; \ + q##_i *= norm; \ + q##_j *= norm; \ + q##_k *= norm; \ +} + +#define ME_mq_to_omega(m, quat, moments_0, moments_1, moments_2, w) \ +{ \ + double wbody_0, wbody_1, wbody_2; \ + double rot_0, rot_1, rot_2, rot_3, rot_4, rot_5, rot_6, rot_7, rot_8; \ + \ + double w2 = quat##_w * quat##_w; \ + double i2 = quat##_i * quat##_i; \ + double j2 = quat##_j * quat##_j; \ + double k2 = quat##_k * quat##_k; \ + double twoij = 2.0 * quat##_i * quat##_j; \ + double twoik = 2.0 * quat##_i * quat##_k; \ + double twojk = 2.0 * quat##_j * quat##_k; \ + double twoiw = 2.0 * quat##_i * quat##_w; \ + double twojw = 2.0 * quat##_j * quat##_w; \ + double twokw = 2.0 * quat##_k * quat##_w; \ + \ + rot##_0 = w2 + i2 - j2 - k2; \ + rot##_1 = twoij - twokw; \ + rot##_2 = twojw + twoik; \ + \ + rot##_3 = twoij + twokw; \ + rot##_4 = w2 - i2 + j2 - k2; \ + rot##_5 = twojk - twoiw; \ + \ + rot##_6 = twoik - twojw; \ + rot##_7 = twojk + twoiw; \ + rot##_8 = w2 - i2 - j2 + k2; \ + \ + wbody_0 = rot##_0*m##_0 + rot##_3*m##_1 + rot##_6*m##_2; \ + wbody_1 = rot##_1*m##_0 + rot##_4*m##_1 + rot##_7*m##_2; \ + wbody_2 = rot##_2*m##_0 + rot##_5*m##_1 + rot##_8*m##_2; \ + \ + wbody_0 *= moments_0; \ + wbody_1 *= moments_1; \ + wbody_2 *= moments_2; \ + \ + w##_0 = rot##_0*wbody_0 + rot##_1*wbody_1 + rot##_2*wbody_2; \ + w##_1 = rot##_3*wbody_0 + rot##_4*wbody_1 + rot##_5*wbody_2; \ + w##_2 = rot##_6*wbody_0 + rot##_7*wbody_1 + rot##_8*wbody_2; \ +} + +#define ME_omega_richardson(dtf,dtq,angmomin,quatin,torque,i0,i1,i2) \ +{ \ + angmomin[0] += dtf * torque[0]; \ + double angmom_0 = angmomin[0]; \ + angmomin[1] += dtf * torque[1]; \ + double angmom_1 = angmomin[1]; \ + angmomin[2] += dtf * torque[2]; \ + double angmom_2 = angmomin[2]; \ + \ + double quat_w = quatin[0]; \ + double quat_i = quatin[1]; \ + double quat_j = quatin[2]; \ + double quat_k = quatin[3]; \ + \ + double omega_0, omega_1, omega_2; \ + ME_mq_to_omega(angmom,quat,i0,i1,i2,omega); \ + \ + double wq_0, 
wq_1, wq_2, wq_3; \ + wq_0 = -omega_0*quat_i - omega_1*quat_j - omega_2*quat_k; \ + wq_1 = quat_w*omega_0 + omega_1*quat_k - omega_2*quat_j; \ + wq_2 = quat_w*omega_1 + omega_2*quat_i - omega_0*quat_k; \ + wq_3 = quat_w*omega_2 + omega_0*quat_j - omega_1*quat_i; \ + \ + double qfull_w, qfull_i, qfull_j, qfull_k; \ + qfull_w = quat_w + dtq * wq_0; \ + qfull_i = quat_i + dtq * wq_1; \ + qfull_j = quat_j + dtq * wq_2; \ + qfull_k = quat_k + dtq * wq_3; \ + ME_qnormalize(qfull); \ + \ + double qhalf_w, qhalf_i, qhalf_j, qhalf_k; \ + qhalf_w = quat_w + 0.5*dtq * wq_0; \ + qhalf_i = quat_i + 0.5*dtq * wq_1; \ + qhalf_j = quat_j + 0.5*dtq * wq_2; \ + qhalf_k = quat_k + 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + ME_mq_to_omega(angmom,qhalf,i0,i1,i2,omega); \ + wq_0 = -omega_0*qhalf_i - omega_1*qhalf_j - omega_2*qhalf_k; \ + wq_1 = qhalf_w*omega_0 + omega_1*qhalf_k - omega_2*qhalf_j; \ + wq_2 = qhalf_w*omega_1 + omega_2*qhalf_i - omega_0*qhalf_k; \ + wq_3 = qhalf_w*omega_2 + omega_0*qhalf_j - omega_1*qhalf_i; \ + \ + qhalf_w += 0.5*dtq * wq_0; \ + qhalf_i += 0.5*dtq * wq_1; \ + qhalf_j += 0.5*dtq * wq_2; \ + qhalf_k += 0.5*dtq * wq_3; \ + ME_qnormalize(qhalf); \ + \ + quat_w = 2.0*qhalf_w - qfull_w; \ + quat_i = 2.0*qhalf_i - qfull_i; \ + quat_j = 2.0*qhalf_j - qfull_j; \ + quat_k = 2.0*qhalf_k - qfull_k; \ + ME_qnormalize(quat); \ + \ + quatin[0] = quat_w; \ + quatin[1] = quat_i; \ + quatin[2] = quat_j; \ + quatin[3] = quat_k; \ +} + +/* ---------------------------------------------------------------------- */ + +FixNVEAsphereGPU::FixNVEAsphereGPU(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal_max = 0; + _inertia0 = 0; + _inertia1 = 0; + _inertia2 = 0; +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::init() +{ + avec = (AtomVecEllipsoid *) atom->style_match("ellipsoid"); + if (!avec) + error->all(FLERR,"Compute nve/asphere requires atom style ellipsoid"); + + // check that all particles are finite-size ellipsoids + // no point particles allowed, spherical is OK + + int *ellipsoid = atom->ellipsoid; + int *mask = atom->mask; + int nlocal = atom->nlocal; + + for (int i = 0; i < nlocal; i++) + if (mask[i] & groupbit) + if (ellipsoid[i] < 0) + error->one(FLERR,"Fix nve/asphere requires extended particles"); + + FixNVE::init(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::setup(int vflag) +{ + FixNVE::setup(vflag); + reset_dt(); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::initial_integrate(int /*vflag*/) +{ + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + int *mask = atom->mask; + + double **angmom = atom->angmom; + double **torque = atom->torque; + int nlocal = atom->nlocal; + if (igroup == atom->firstgroup) nlocal = atom->nfirst; + + // set timestep here since dt may have changed or come via rRESPA + + dtq = 0.5 * dtv; + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const 
int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + + // update angular momentum by 1/2 step + if (igroup == 0) { + #if (LAL_USE_OMP_SIMD == 1) + // Workaround for compiler bug + #ifdef __INTEL_COMPILER + #pragma simd + #else + #pragma omp simd + #endif + #endif + for (int i = ifrom; i < ito; i++) { + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], _inertia0[i], + _inertia1[i], _inertia2[i]); + } + } else { + #if (LAL_USE_OMP_SIMD == 1) + // Workaround for compiler bug + #ifdef __INTEL_COMPILER + #pragma simd + #else + #pragma omp simd + #endif + #endif + for (int i = ifrom; i < ito; i++) { + if (mask[i] & groupbit) { + double *quat = bonus[ellipsoid[i]].quat; + ME_omega_richardson(dtf, dtq, angmom[i], quat, torque[i], + _inertia0[i], _inertia1[i], _inertia2[i]); + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEAsphereGPU::final_integrate() +{ + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + double * _noalias const angmom = atom->angmom[0]; + const double * _noalias const torque = atom->torque[0]; + + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + + if (neighbor->ago == 0) { + if (nlocal > _nlocal_max) { + if (_nlocal_max) { + memory->destroy(_dtfm); + memory->destroy(_inertia0); + memory->destroy(_inertia1); + memory->destroy(_inertia2); + } + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0"); + memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1"); + memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2"); + } + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + + double dtfo; + if (neighbor->ago == 0) dtfo = reset_dt_omp(ifrom, ito, tid); + else dtfo = dtf; + + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + angmom[i] += dtfo * torque[i]; + } + } +} + +void FixNVEAsphereGPU::reset_dt() { + const int nlocal = (igroup == atom->firstgroup) ?
atom->nfirst : + atom->nlocal; + + if (nlocal > _nlocal_max) { + if (_nlocal_max) { + memory->destroy(_dtfm); + memory->destroy(_inertia0); + memory->destroy(_inertia1); + memory->destroy(_inertia2); + } + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + memory->create(_inertia0, _nlocal_max * 3, "fix_nve_gpu:inertia0"); + memory->create(_inertia1, _nlocal_max * 3, "fix_nve_gpu:inertia1"); + memory->create(_inertia2, _nlocal_max * 3, "fix_nve_gpu:inertia2"); + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = nlocal; + #endif + reset_dt_omp(ifrom, ito, tid); + } +} + +double FixNVEAsphereGPU::reset_dt_omp(const int ifrom, const int ito, + const int tid) { + AtomVecEllipsoid::Bonus *bonus = avec->bonus; + int *ellipsoid = atom->ellipsoid; + const int * const mask = atom->mask; + + const double dtfo = 0.5 * update->dt * force->ftm2v; + if (tid == 0) { + dtv = update->dt; + dtf = dtfo; + } + + if (igroup == 0) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; + } + } else { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + if (mask[i] & groupbit) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + double *shape = bonus[ellipsoid[i]].shape; + double idot = INERTIA*rmass[i] * (shape[1]*shape[1]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia0[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[2]*shape[2]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia1[i] = idot; + idot = INERTIA*rmass[i] * (shape[0]*shape[0]+shape[1]*shape[1]); + if (idot != 0.0) idot = 1.0 / idot; + _inertia2[i] = idot; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } + return dtfo; +} + +double FixNVEAsphereGPU::memory_usage() +{ + return FixNVE::memory_usage() + _nlocal_max * 12 * sizeof(double); +} + diff --git a/src/GPU/fix_nve_asphere_gpu.h b/src/GPU/fix_nve_asphere_gpu.h new file mode 100644 index 0000000000..3c67e0e024 --- /dev/null +++ b/src/GPU/fix_nve_asphere_gpu.h @@ -0,0 +1,63 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License.
+ + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/asphere/gpu,FixNVEAsphereGPU) + +#else + +#ifndef LMP_FIX_NVE_ASPHERE_GPU_H +#define LMP_FIX_NVE_ASPHERE_GPU_H + +#include "fix_nve.h" + +namespace LAMMPS_NS { + +class FixNVEAsphereGPU : public FixNVE { + public: + FixNVEAsphereGPU(class LAMMPS *, int, char **); + void init(); + void setup(int vflag); + void initial_integrate(int); + void final_integrate(); + void reset_dt(); + virtual double memory_usage(); + + private: + double reset_dt_omp(const int, const int, const int); + double *_dtfm, *_inertia0, *_inertia1, *_inertia2; + int _nlocal_max; + double dtq; + class AtomVecEllipsoid *avec; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Compute nve/asphere requires atom style ellipsoid + +Self-explanatory. + +E: Fix nve/asphere requires extended particles + +This fix can only be used for particles with a shape setting. + +*/ diff --git a/src/GPU/fix_nve_gpu.cpp b/src/GPU/fix_nve_gpu.cpp new file mode 100644 index 0000000000..c3dd5b6ae2 --- /dev/null +++ b/src/GPU/fix_nve_gpu.cpp @@ -0,0 +1,291 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. 
Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#include "fix_nve_gpu.h" +#include <cstring> +#include "atom.h" +#include "comm.h" +#include "force.h" +#include "gpu_extra.h" +#include "memory.h" +#include "neighbor.h" +#include "update.h" +#if (LAL_USE_OMP == 1) +#include <omp.h> +#endif + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVEGPU::FixNVEGPU(LAMMPS *lmp, int narg, char **arg) : + FixNVE(lmp, narg, arg) +{ + _dtfm = 0; + _nlocal_max = 0; +} + +/* ---------------------------------------------------------------------- */ + +FixNVEGPU::~FixNVEGPU() +{ + memory->destroy(_dtfm); +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEGPU::setup(int vflag) +{ + FixNVE::setup(vflag); + if (strstr(update->integrate_style,"respa")) + _respa_on = 1; + else + _respa_on = 0; + if (atom->ntypes > 1) reset_dt(); +} + +/* ---------------------------------------------------------------------- + allow for both per-type and per-atom mass +------------------------------------------------------------------------- */ + +void FixNVEGPU::initial_integrate(int vflag) +{ + if (_respa_on) { FixNVE::initial_integrate(vflag); return; } + + // update v and x of atoms in group + + double * _noalias const x = atom->x[0]; + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + const int nlocal3 = nlocal * 3; + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int idelta = nlocal3 / nthreads + 1; + const int ifrom3 = omp_get_thread_num() * idelta; + const int ito3 = MIN(ifrom3 + idelta, nlocal3); + #else + const int ifrom3 = 0; + const int ito3 = nlocal3; + #endif + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + const double dtfm = dtf / atom->mass[1]; + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += dtfm * f[i]; + x[i] += dtv * v[i]; + } + } else if (igroup == 0) { + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + } else { + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) { + if (_dtfm[i] != 0.0) { + v[i] += _dtfm[i] * f[i]; + x[i] += dtv * v[i]; + } + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +void FixNVEGPU::final_integrate() +{ + if (_respa_on) { FixNVE::final_integrate(); return; } + // update v of atoms in group + double * _noalias const v = atom->v[0]; + const double * _noalias const f = atom->f[0]; + const int nlocal = (igroup == atom->firstgroup) ?
atom->nfirst : + atom->nlocal; + + if (neighbor->ago == 0) { + if (igroup != 0 || atom->ntypes != 1 || atom->rmass) { + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + } + } + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + const int ifrom3 = ifrom * 3; + const int ito3 = ito * 3; + #else + const int tid = 0; + const int ifrom = 0; + const int ifrom3 = 0; + const int ito = nlocal; + const int ito3 = nlocal * 3; + #endif + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + const double dtfm = dtf / atom->mass[1]; + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += dtfm * f[i]; + } else if (igroup == 0) { + if (neighbor->ago == 0) reset_dt_omp(ifrom,ito,tid); + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += _dtfm[i] * f[i]; + } else { + if (neighbor->ago == 0) reset_dt_omp(ifrom,ito,tid); + #if (LAL_USE_OMP_SIMD == 1) + #pragma omp simd + #endif + for (int i = ifrom3; i < ito3; i++) + v[i] += _dtfm[i] * f[i]; + } + } +} + +void FixNVEGPU::reset_dt() { + if (_respa_on) { FixNVE::reset_dt(); return; } + if (igroup == 0 && atom->ntypes == 1 && !atom->rmass) { + dtv = update->dt; + dtf = 0.5 * update->dt * force->ftm2v; + } else { + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + if (nlocal > _nlocal_max) { + if (_nlocal_max) memory->destroy(_dtfm); + _nlocal_max = static_cast<int>(1.20 * nlocal); + memory->create(_dtfm, _nlocal_max * 3, "fix_nve_gpu:dtfm"); + } + + #if (LAL_USE_OMP == 1) + #pragma omp parallel + #endif + { + #if (LAL_USE_OMP == 1) + const int nthreads = comm->nthreads; + const int tid = omp_get_thread_num(); + const int idelta = nlocal / nthreads + 1; + const int ifrom = tid * idelta; + const int ito = MIN(ifrom + idelta, nlocal); + #else + const int tid = 0; + const int ifrom = 0; + const int ito = nlocal; + #endif + + reset_dt_omp(ifrom, ito, tid); + } + } +} + +void FixNVEGPU::reset_dt_omp(const int ifrom, const int ito, const int tid) { + const double dtfo = 0.5 * update->dt * force->ftm2v; + if (tid == 0) { + dtv = update->dt; + dtf = dtfo; + } + + const int * const mask = atom->mask; + if (igroup == 0) { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) { + const double dtfim = dtfo / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } + } + } else { + if (atom->rmass) { + const double * const rmass = atom->rmass; + int n = ifrom * 3; + for (int i = ifrom; i < ito; i++) + if (mask[i] & groupbit) { + const double dtfir = dtfo / rmass[i]; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + _dtfm[n++] = dtfir; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } else { + const double * const mass = atom->mass; + const int * const type = atom->type; + int n = ifrom * 3; + for (int
i = ifrom; i < ito; i++) + if (mask[i] & groupbit) { + const double dtfim = dtfo / mass[type[i]]; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + _dtfm[n++] = dtfim; + } else { + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + _dtfm[n++] = 0.0; + } + } + } +} + +double FixNVEGPU::memory_usage() +{ + const int nlocal = (igroup == atom->firstgroup) ? atom->nfirst : + atom->nlocal; + return FixNVE::memory_usage() + nlocal * 3 * sizeof(double); +} diff --git a/src/GPU/fix_nve_gpu.h b/src/GPU/fix_nve_gpu.h new file mode 100644 index 0000000000..1042d4eadd --- /dev/null +++ b/src/GPU/fix_nve_gpu.h @@ -0,0 +1,60 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nve/gpu,FixNVEGPU) + +#else + +#ifndef LMP_FIX_NVE_GPU_H +#define LMP_FIX_NVE_GPU_H + +#include "fix_nve.h" + +namespace LAMMPS_NS { + +class FixNVEGPU : public FixNVE { + public: + FixNVEGPU(class LAMMPS *, int, char **); + virtual ~FixNVEGPU(); + virtual void setup(int); + virtual void initial_integrate(int); + virtual void final_integrate(); + virtual void reset_dt(); + virtual double memory_usage(); + + protected: + void reset_dt_omp(const int, const int, const int); + double *_dtfm; + int _nlocal_max, _respa_on; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. + +*/ diff --git a/src/GPU/fix_nvt_gpu.cpp b/src/GPU/fix_nvt_gpu.cpp new file mode 100644 index 0000000000..7d7826b6bf --- /dev/null +++ b/src/GPU/fix_nvt_gpu.cpp @@ -0,0 +1,50 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include <cstring> +#include "fix_nvt_gpu.h" +#include "group.h" +#include "modify.h" +#include "error.h" + +using namespace LAMMPS_NS; +using namespace FixConst; + +/* ---------------------------------------------------------------------- */ + +FixNVTGPU::FixNVTGPU(LAMMPS *lmp, int narg, char **arg) : + FixNHGPU(lmp, narg, arg) +{ + if (!tstat_flag) + error->all(FLERR,"Temperature control must be used with fix nvt"); + if (pstat_flag) + error->all(FLERR,"Pressure control can not be used with fix nvt"); + + // create a new compute temp style + // id = fix-ID + temp + + int n = strlen(id) + 6; + id_temp = new char[n]; + strcpy(id_temp,id); + strcat(id_temp,"_temp"); + + char **newarg = new char*[3]; + newarg[0] = id_temp; + newarg[1] = group->names[igroup]; + newarg[2] = (char *) "temp"; + + modify->add_compute(3,newarg); + delete [] newarg; + tcomputeflag = 1; +} + diff --git a/src/GPU/fix_nvt_gpu.h b/src/GPU/fix_nvt_gpu.h new file mode 100644 index 0000000000..7ccba97040 --- /dev/null +++ b/src/GPU/fix_nvt_gpu.h @@ -0,0 +1,52 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: W. Michael Brown (Intel) +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(nvt/gpu,FixNVTGPU) + +#else + +#ifndef LMP_FIX_NVT_GPU_H +#define LMP_FIX_NVT_GPU_H + +#include "fix_nh_gpu.h" + +namespace LAMMPS_NS { + +class FixNVTGPU : public FixNHGPU { + public: + FixNVTGPU(class LAMMPS *, int, char **); + ~FixNVTGPU() {} +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Temperature control must be used with fix nvt + +Self-explanatory. + +E: Pressure control can not be used with fix nvt + +Self-explanatory.
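+ +For reference, a minimal hypothetical input-script usage of this style; the fix ID "1", the group "all", and the temperature values are illustrative only, while the argument layout follows the standard fix nvt syntax and assumes a LAMMPS build with the GPU package enabled: + + fix 1 all nvt/gpu temp 300.0 300.0 100.0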
+ +*/ diff --git a/src/GPU/gpu_extra.h b/src/GPU/gpu_extra.h index 115e1f0574..1a957c9aef 100644 --- a/src/GPU/gpu_extra.h +++ b/src/GPU/gpu_extra.h @@ -21,6 +21,29 @@ #include "modify.h" #include "error.h" +// ---------------------- OPENMP PREPROCESSOR STUFF ------------------ +#if defined(_OPENMP) + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 1 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #if (_OPENMP >= 201307) + #define LAL_USE_OMP_SIMD 1 + #else + #define LAL_USE_OMP_SIMD 0 + #endif + #endif +#else + #if !defined(LAL_USE_OMP) + #define LAL_USE_OMP 0 + #endif + + #if !defined(LAL_USE_OMP_SIMD) + #define LAL_USE_OMP_SIMD 0 + #endif +#endif + namespace GPU_EXTRA { inline void check_flag(int error_flag, LAMMPS_NS::Error *error, @@ -61,6 +84,12 @@ namespace GPU_EXTRA { else if (all_success == -12) error->all(FLERR, "Invalid OpenCL platform ID."); + else if (all_success == -13) + error->all(FLERR, + "Invalid device configuration."); + else if (all_success == -15) + error->all(FLERR, + "P3M built for FP64 and GPU device is FP32 only."); else error->all(FLERR,"Unknown error in GPU library"); } @@ -127,12 +156,22 @@ greater than 4 for NVIDIA GPUs. E: Invalid custom OpenCL parameter string. There are not enough or too many parameters in the custom string for package -GPU. +GPU or the parameters do not meet required restrictions. E: Unknown error in GPU library Self-explanatory. +E: Invalid device configuration. + +The specified GPU or accelerator does not support the specified device +configuration. Check the output of ocl_get_devices or nvd_get_devices to +verify the correct device IDs for the GPU package. + +E: P3M built for FP64 and GPU device is FP32 only + +Either turn off GPU acceleration for PPPM or build LAMMPS with -DFFT_SINGLE + W: Increasing communication cutoff for GPU style The pair style has increased the communication cutoff to be consistent with diff --git a/src/GPU/pair_beck_gpu.cpp b/src/GPU/pair_beck_gpu.cpp index 38cc593076..ff9537a33e 100644 --- a/src/GPU/pair_beck_gpu.cpp +++ b/src/GPU/pair_beck_gpu.cpp @@ -48,9 +48,9 @@ int beck_gpu_init(const int ntypes, double **cutsq, double **host_aa, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void beck_gpu_clear(); -int ** beck_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** beck_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -160,9 +160,10 @@ void PairBeckGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = beck_gpu_init(atom->ntypes+1, cutsq, aa, alpha, beta, AA, BB, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_born_coul_long_cs_gpu.cpp b/src/GPU/pair_born_coul_long_cs_gpu.cpp index b65b662496..db0faab0ab 100644 --- a/src/GPU/pair_born_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_long_cs_gpu.cpp @@ -57,15 +57,15 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int bornclcs_gpu_init(const int 
ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, - double **host_c, double **host_d, - double **sigma, double **offset, double *special_lj, - const int inum, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, - int &gpu_mode, FILE *screen, double **host_cut_ljsq, - double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, + double **host_c, double **host_d, + double **sigma, double **offset, double *special_lj, + const int inum, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void bornclcs_gpu_clear(); int** bornclcs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -196,10 +196,11 @@ void PairBornCoulLongCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = bornclcs_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_born_coul_long_gpu.cpp b/src/GPU/pair_born_coul_long_gpu.cpp index 0a359f66cc..cad174c0de 100644 --- a/src/GPU/pair_born_coul_long_gpu.cpp +++ b/src/GPU/pair_born_coul_long_gpu.cpp @@ -195,10 +195,11 @@ void PairBornCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncl_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp index 7aba6e059b..5c8cac0ec2 100644 --- a/src/GPU/pair_born_coul_wolf_cs_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_cs_gpu.cpp @@ -45,24 +45,26 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_born1, double **host_born2, - double **host_born3, double **host_a, double **host_c, - double **host_d, double **sigma, double **offset, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double alf, const double e_shift, const double f_shift); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **sigma, double **offset, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + 
const double alf, const double e_shift, + const double f_shift); void borncwcs_gpu_clear(); -int ** borncwcs_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** borncwcs_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); void borncwcs_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -179,10 +181,11 @@ void PairBornCoulWolfCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncwcs_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, alf, e_shift, f_shift); diff --git a/src/GPU/pair_born_coul_wolf_gpu.cpp b/src/GPU/pair_born_coul_wolf_gpu.cpp index ee6fcf3cea..73e58b0a1f 100644 --- a/src/GPU/pair_born_coul_wolf_gpu.cpp +++ b/src/GPU/pair_born_coul_wolf_gpu.cpp @@ -51,13 +51,15 @@ int borncw_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, double *host_special_coul, const double qqrd2e, - const double alf, const double e_shift, const double f_shift); + const double alf, const double e_shift, + const double f_shift); void borncw_gpu_clear(); int ** borncw_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); @@ -177,10 +179,11 @@ void PairBornCoulWolfGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = borncw_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, alf, e_shift, f_shift); diff --git a/src/GPU/pair_born_gpu.cpp b/src/GPU/pair_born_gpu.cpp index 84ed4cfc04..770dad8346 100644 --- a/src/GPU/pair_born_gpu.cpp +++ b/src/GPU/pair_born_gpu.cpp @@ -48,13 +48,13 @@ int born_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const int maxspecial, const double cell_size, int &gpu_mode, FILE 
*screen); void born_gpu_reinit(const int ntypes, double **host_rhoinv, - double **host_born1, double **host_born2, double **host_born3, - double **host_a, double **host_c, double **host_d, - double **offset); + double **host_born1, double **host_born2, + double **host_born3, double **host_a, double **host_c, + double **host_d, double **offset); void born_gpu_clear(); -int ** born_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** born_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -163,10 +163,11 @@ void PairBornGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = born_gpu_init(atom->ntypes+1, cutsq, rhoinv, born1, born2, born3, a, c, d, sigma, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_buck_coul_cut_gpu.cpp b/src/GPU/pair_buck_coul_cut_gpu.cpp index 036bc0d7a8..2c9e71bc83 100644 --- a/src/GPU/pair_buck_coul_cut_gpu.cpp +++ b/src/GPU/pair_buck_coul_cut_gpu.cpp @@ -167,9 +167,10 @@ void PairBuckCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buckc_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_buck_coul_long_gpu.cpp b/src/GPU/pair_buck_coul_long_gpu.cpp index 3916e5634e..3d48862c6a 100644 --- a/src/GPU/pair_buck_coul_long_gpu.cpp +++ b/src/GPU/pair_buck_coul_long_gpu.cpp @@ -191,9 +191,10 @@ void PairBuckCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buckcl_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_buck_gpu.cpp b/src/GPU/pair_buck_gpu.cpp index 54c579bf72..d17f9d2072 100644 --- a/src/GPU/pair_buck_gpu.cpp +++ b/src/GPU/pair_buck_gpu.cpp @@ -47,8 +47,8 @@ int buck_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void buck_gpu_reinit(const int ntypes, double **cutsq, double **host_rhoinv, - double **host_buck1, double **host_buck2, - double **host_a, double **host_c, double **offset); + double **host_buck1, double **host_buck2, + double **host_a, double **host_c, double **offset); void buck_gpu_clear(); int ** buck_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, @@ -161,9 +161,10 @@ void 
PairBuckGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = buck_gpu_init(atom->ntypes+1, cutsq, rhoinv, buck1, buck2, a, c, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_colloid_gpu.cpp b/src/GPU/pair_colloid_gpu.cpp index 2e35486993..8b7870575a 100644 --- a/src/GPU/pair_colloid_gpu.cpp +++ b/src/GPU/pair_colloid_gpu.cpp @@ -44,18 +44,18 @@ int colloid_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, double **offset, double *special_lj, double **host_a12, double **host_a1, double **host_a2, double **host_d1, - double **host_d2, double **host_sigma3, double **host_sigma6, - int **host_form, const int nlocal, + double **host_d2, double **host_sigma3, + double **host_sigma6, int **host_form, const int nlocal, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void colloid_gpu_clear(); -int ** colloid_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** colloid_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success); void colloid_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -171,10 +171,11 @@ void PairColloidGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = colloid_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, a12, a1, a2, d1, d2, sigma3, sigma6, _form, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); memory->destroy(_form); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_cut_gpu.cpp b/src/GPU/pair_coul_cut_gpu.cpp index 1e45aebf7b..9098f86737 100644 --- a/src/GPU/pair_coul_cut_gpu.cpp +++ b/src/GPU/pair_coul_cut_gpu.cpp @@ -47,21 +47,21 @@ int coul_gpu_init(const int ntypes, double **host_scale, double **cutsq, const double qqrd2e); void coul_gpu_reinit(const int ntypes, double **host_scale); void coul_gpu_clear(); -int ** coul_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); +int ** coul_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint 
**special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, double *host_q, double *boxlo, + double *prd); void coul_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd); + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, const int nlocal, + double *boxlo, double *prd); double coul_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -166,9 +166,10 @@ void PairCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = coul_gpu_init(atom->ntypes+1, scale, cutsq, force->special_coul, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_debye_gpu.cpp b/src/GPU/pair_coul_debye_gpu.cpp index f23b5acde3..1db2995810 100644 --- a/src/GPU/pair_coul_debye_gpu.cpp +++ b/src/GPU/pair_coul_debye_gpu.cpp @@ -48,20 +48,20 @@ int cdebye_gpu_init(const int ntypes, double **host_scale, double **cutsq, void cdebye_gpu_reinit(const int ntypes, double **host_scale); void cdebye_gpu_clear(); int ** cdebye_gpu_compute_n(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, - double *prd); + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); void cdebye_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, const int nlocal, - double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success, + double *host_q, const int nlocal, double *boxlo, + double *prd); double cdebye_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -167,9 +167,10 @@ void PairCoulDebyeGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cdebye_gpu_init(atom->ntypes+1, scale, cutsq, force->special_coul, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, force->qqrd2e, 
kappa); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_coul_dsf_gpu.cpp b/src/GPU/pair_coul_dsf_gpu.cpp index 0bcffb5d2c..830ad057e6 100644 --- a/src/GPU/pair_coul_dsf_gpu.cpp +++ b/src/GPU/pair_coul_dsf_gpu.cpp @@ -57,9 +57,9 @@ int cdsf_gpu_init(const int ntypes, const int nlocal, const int nall, const double e_shift, const double f_shift, const double alpha); void cdsf_gpu_clear(); -int ** cdsf_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** cdsf_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -184,8 +184,9 @@ void PairCoulDSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cdsf_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, e_shift, f_shift, alpha); diff --git a/src/GPU/pair_coul_long_cs_gpu.cpp b/src/GPU/pair_coul_long_cs_gpu.cpp index ef404d7a13..5b1fcd9c8f 100644 --- a/src/GPU/pair_coul_long_cs_gpu.cpp +++ b/src/GPU/pair_coul_long_cs_gpu.cpp @@ -54,27 +54,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int clcs_gpu_init(const int ntypes, double **scale, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, double host_cut_coulsq, double *host_special_coul, - const double qqrd2e, const double g_ewald); +int clcs_gpu_init(const int ntypes, double **scale, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); void clcs_gpu_reinit(const int ntypes, double **scale); void clcs_gpu_clear(); int ** clcs_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); void clcs_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool 
&success, double *host_q, + const int nlocal, double *boxlo, double *prd); double clcs_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -186,8 +186,9 @@ void PairCoulLongCSGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = clcs_gpu_init(atom->ntypes+1, scale, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_coul_long_gpu.cpp b/src/GPU/pair_coul_long_gpu.cpp index 1118a012d0..af6a66fa34 100644 --- a/src/GPU/pair_coul_long_gpu.cpp +++ b/src/GPU/pair_coul_long_gpu.cpp @@ -181,8 +181,9 @@ void PairCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = cl_gpu_init(atom->ntypes+1, scale, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_dpd_gpu.cpp b/src/GPU/pair_dpd_gpu.cpp index 59c0fa031f..d77d83e953 100644 --- a/src/GPU/pair_dpd_gpu.cpp +++ b/src/GPU/pair_dpd_gpu.cpp @@ -52,8 +52,8 @@ int ** dpd_gpu_compute_n(const int ago, const int inum_full, const int nall, double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, - double **host_v, const double dtinvsqrt, + int **ilist, int **jnum, const double cpu_time, + bool &success, double **host_v, const double dtinvsqrt, const int seed, const int timestep, double *boxlo, double *prd); void dpd_gpu_compute(const int ago, const int inum_full, const int nall, @@ -308,9 +308,10 @@ void PairDPDGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpd_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_dpd_tstat_gpu.cpp b/src/GPU/pair_dpd_tstat_gpu.cpp index 8bf98cc8ed..a5ae3e3001 100644 --- a/src/GPU/pair_dpd_tstat_gpu.cpp +++ b/src/GPU/pair_dpd_tstat_gpu.cpp @@ -47,12 +47,13 @@ int dpd_tstat_gpu_init(const int ntypes, double **cutsq, double **host_a0, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void dpd_tstat_gpu_clear(); -int ** dpd_tstat_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, bool &success, +int ** dpd_tstat_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double **host_v, const double dtinvsqrt, const 
int seed, const int timestep, double *boxlo, double *prd); @@ -64,8 +65,9 @@ void dpd_tstat_gpu_compute(const int ago, const int inum_full, const int nall, double **host_v, const double dtinvsqrt, const int seed, const int timestep, const int nlocal, double *boxlo, double *prd); -void dpd_tstat_gpu_update_coeff(int ntypes, double **host_a0, double **host_gamma, - double **host_sigma, double **host_cut); +void dpd_tstat_gpu_update_coeff(int ntypes, double **host_a0, + double **host_gamma, double **host_sigma, + double **host_cut); double dpd_tstat_gpu_bytes(); #define EPSILON 1.0e-10 @@ -325,10 +327,11 @@ void PairDPDTstatGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpd_tstat_gpu_init(atom->ntypes+1, cutsq, a0, gamma, sigma, - cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, - cell_size, gpu_mode, screen); + cut, force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { diff --git a/src/GPU/pair_eam_alloy_gpu.cpp b/src/GPU/pair_eam_alloy_gpu.cpp index c1370af307..4678a6f669 100644 --- a/src/GPU/pair_eam_alloy_gpu.cpp +++ b/src/GPU/pair_eam_alloy_gpu.cpp @@ -39,21 +39,22 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int eam_alloy_gpu_init(const int ntypes, double host_cutforcesq, - int **host_type2rhor, int **host_type2z2r, - int *host_type2frho, double ***host_rhor_spline, - double ***host_z2r_spline, double ***host_frho_spline, - double rdr, double rdrho, double rhomax, - int nrhor, int nrho, int nz2r, int nfrho, int nr, - const int nlocal, const int nall, const int max_nbors, - const int maxspecial, const double cell_size, int &gpu_mode, - FILE *screen, int &fp_size); + int **host_type2rhor, int **host_type2z2r, + int *host_type2frho, double ***host_rhor_spline, + double ***host_z2r_spline, double ***host_frho_spline, + double rdr, double rdrho, double rhomax, + int nrhor, int nrho, int nz2r, int nfrho, int nr, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, int &fp_size); void eam_alloy_gpu_clear(); -int** eam_alloy_gpu_compute_n(const int ago, const int inum_full, const int nall, - double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, +int** eam_alloy_gpu_compute_n(const int ago, const int inum_full, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, int &inum, void **fp_ptr); void eam_alloy_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, @@ -183,10 +184,11 @@ void PairEAMAlloyGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_alloy_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, 
atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -195,7 +197,6 @@ void PairEAMAlloyGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_eam_fs_gpu.cpp b/src/GPU/pair_eam_fs_gpu.cpp index ce3ea8bb0b..390bb93987 100644 --- a/src/GPU/pair_eam_fs_gpu.cpp +++ b/src/GPU/pair_eam_fs_gpu.cpp @@ -50,19 +50,19 @@ int eam_fs_gpu_init(const int ntypes, double host_cutforcesq, void eam_fs_gpu_clear(); int** eam_fs_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, - int &inum, void **fp_ptr); + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, int &inum, void **fp_ptr); void eam_fs_gpu_compute(const int ago, const int inum_full, const int nlocal, - const int nall,double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, void **fp_ptr); + const int nall,double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, void **fp_ptr); void eam_fs_gpu_compute_force(int *ilist, const bool eflag, const bool vflag, - const bool eatom, const bool vatom); + const bool eatom, const bool vatom); double eam_fs_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -183,10 +183,11 @@ void PairEAMFSGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_fs_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -195,7 +196,6 @@ void PairEAMFSGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_eam_gpu.cpp b/src/GPU/pair_eam_gpu.cpp index abd721a327..e458ea2020 100644 --- a/src/GPU/pair_eam_gpu.cpp +++ b/src/GPU/pair_eam_gpu.cpp @@ -50,11 +50,11 @@ int eam_gpu_init(const int ntypes, double host_cutforcesq, void eam_gpu_clear(); int** eam_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, int **ilist, - int **jnum, const double cpu_time, bool &success, - int &inum, void **fp_ptr); + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool 
vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success, int &inum, void **fp_ptr); void eam_gpu_compute(const int ago, const int inum_full, const int nlocal, const int nall,double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, @@ -185,10 +185,11 @@ void PairEAMGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; int fp_size; + int mnf = 5e-2 * neighbor->oneatom; int success = eam_gpu_init(atom->ntypes+1, cutforcesq, type2rhor, type2z2r, type2frho, rhor_spline, z2r_spline, frho_spline, rdr, rdrho, rhomax, nrhor, nrho, nz2r, nfrho, nr, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, fp_size); GPU_EXTRA::check_flag(success,error,world); @@ -197,7 +198,6 @@ void PairEAMGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - if (fp_size == sizeof(double)) fp_single = false; else diff --git a/src/GPU/pair_gauss_gpu.cpp b/src/GPU/pair_gauss_gpu.cpp index 89b79f11f2..fe9dd9ba96 100644 --- a/src/GPU/pair_gauss_gpu.cpp +++ b/src/GPU/pair_gauss_gpu.cpp @@ -41,15 +41,16 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int gauss_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **b, double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **b, double **offset, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen); void gauss_gpu_reinit(const int ntypes, double **cutsq, double **host_a, - double **b, double **offset); + double **b, double **offset); void gauss_gpu_clear(); -int ** gauss_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** gauss_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -158,9 +159,10 @@ void PairGaussGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = gauss_gpu_init(atom->ntypes+1, cutsq, a, b, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_gayberne_gpu.cpp b/src/GPU/pair_gayberne_gpu.cpp index 19a4c77032..81966824ba 100644 --- a/src/GPU/pair_gayberne_gpu.cpp +++ b/src/GPU/pair_gayberne_gpu.cpp @@ -49,12 +49,12 @@ int gb_gpu_init(const int ntypes, const double gamma, const double upsilon, double **host_lj3, double **host_lj4, double **offset, double *special_lj, const int nlocal, const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + const double cell_size, int &gpu_mode, FILE *screen); void gb_gpu_clear(); int ** gb_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint 
**special, - const bool eflag, const bool vflag, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat); @@ -207,10 +207,11 @@ void PairGayBerneGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = gb_gpu_init(atom->ntypes+1, gamma, upsilon, mu, shape2, well, cutsq, sigma, epsilon, lshape, form, lj1, lj2, lj3, lj4, offset, force->special_lj, - atom->nlocal, atom->nlocal+atom->nghost, 300, + atom->nlocal, atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj96_cut_gpu.cpp b/src/GPU/pair_lj96_cut_gpu.cpp index e15a78fb91..84d1a1a10d 100644 --- a/src/GPU/pair_lj96_cut_gpu.cpp +++ b/src/GPU/pair_lj96_cut_gpu.cpp @@ -160,9 +160,10 @@ void PairLJ96CutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp new file mode 100644 index 0000000000..4f8679a8a8 --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.cpp @@ -0,0 +1,309 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://lammps.sandia.gov/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
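A note on the recurring change visible in the dpd/tstat, eam, gauss, gayberne, and lj96 hunks above, and repeated through the rest of the patch: each pair style's init_style() used to pass a hardcoded estimate of 300 for the max_nbors argument of its *_gpu_init() call, and now passes `int mnf = 5e-2 * neighbor->oneatom;`, i.e. 5% of the user-tunable per-atom neighbor limit. A minimal standalone sketch of the heuristic, assuming `oneatom` mirrors `neighbor->oneatom`, the `neigh_modify one` setting (2000 by default in stock LAMMPS, so `mnf` defaults to 100):

```cpp
#include <cstdio>

// Sketch only: 'oneatom' stands in for neighbor->oneatom, the per-atom
// neighbor-list cap set by neigh_modify one (LAMMPS default: 2000).
int main() {
  const int oneatom = 2000;
  const int mnf = 5e-2 * oneatom;  // 5% heuristic; truncates to int -> 100
  std::printf("initial max neighbors per atom: %d (was hardcoded 300)\n", mnf);
  return 0;
}
```

The practical effect is that users who raise the `one` limit for dense systems now get a proportionally larger initial GPU-side allocation, tied to the same knob the CPU neighbor lists already honor.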
+------------------------------------------------------------------------- */ + +/* ---------------------------------------------------------------------- + Contributing author: Mike Brown (SNL) +------------------------------------------------------------------------- */ + +#include <cmath> +#include <cstdio> +#include <cstring> +#include <cstdlib> +#include "pair_lj_charmm_coul_charmm_gpu.h" +#include "atom.h" +#include "atom_vec.h" +#include "comm.h" +#include "force.h" +#include "neighbor.h" +#include "neigh_list.h" +#include "memory.h" +#include "error.h" +#include "neigh_request.h" +#include "universe.h" +#include "domain.h" +#include "gpu_extra.h" + +using namespace LAMMPS_NS; + +// External functions from cuda library for atom decomposition + +int crm_gpu_init(const int ntypes, double cut_bothsq, double **host_lj1, + double **host_lj2, double **host_lj3, double **host_lj4, + double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double host_cut_ljsq, double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double cut_lj_innersq, const double cut_coul_innersq, + const double denom_lj, const double denom_coul, + double **epsilon, double **sigma, + const bool mix_arithmetic); +void crm_gpu_clear(); +int ** crm_gpu_compute_n(const int ago, const int inum, + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success, + double *host_q, double *boxlo, double *prd); +void crm_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); +double crm_gpu_bytes(); + +/* ---------------------------------------------------------------------- */ + +PairLJCharmmCoulCharmmGPU::PairLJCharmmCoulCharmmGPU(LAMMPS *lmp) : + PairLJCharmmCoulCharmm(lmp), gpu_mode(GPU_FORCE) +{ + reinitflag = 0; + cpu_time = 0.0; + GPU_EXTRA::gpu_ready(lmp->modify, lmp->error); +} + +/* ---------------------------------------------------------------------- + free all arrays +------------------------------------------------------------------------- */ + +PairLJCharmmCoulCharmmGPU::~PairLJCharmmCoulCharmmGPU() +{ + crm_gpu_clear(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmGPU::compute(int eflag, int vflag) +{ + if (eflag || vflag) ev_setup(eflag,vflag); + else evflag = vflag_fdotr = 0; + + int nall = atom->nlocal + atom->nghost; + int inum, host_start; + + bool success = true; + int *ilist, *numneigh, **firstneigh; + if (gpu_mode != GPU_FORCE) { + inum = atom->nlocal; + firstneigh = crm_gpu_compute_n(neighbor->ago, inum, nall, atom->x, + atom->type, domain->sublo, domain->subhi, + atom->tag, atom->nspecial, atom->special, + eflag, vflag, eflag_atom, vflag_atom, + host_start, &ilist, &numneigh, cpu_time, + success, atom->q, domain->boxlo, + domain->prd); + } else { + inum = list->inum; + ilist = list->ilist; + numneigh = list->numneigh; + firstneigh = list->firstneigh; + crm_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, + ilist, numneigh, firstneigh, eflag, vflag,
eflag_atom, + vflag_atom, host_start, cpu_time, success, atom->q, + atom->nlocal, domain->boxlo, domain->prd); + } + if (!success) + error->one(FLERR,"Insufficient memory on accelerator"); + + if (host_start < inum) { + cpu_time = MPI_Wtime(); + cpu_compute(host_start, inum, eflag, vflag, ilist, numneigh, firstneigh); + cpu_time = MPI_Wtime() - cpu_time; + } +} + +/* ---------------------------------------------------------------------- + init specific to this pair style +------------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmGPU::init_style() +{ + if (!atom->q_flag) + error->all(FLERR, + "Pair style lj/charmm/coul/charmm/gpu requires atom attribute q"); + if (force->newton_pair) + error->all(FLERR, + "Cannot use newton pair with lj/charmm/coul/charmm/gpu pair style"); + + // Repeat cutsq calculation because done after call to init_style + + double cut; + for (int i = 1; i <= atom->ntypes; i++) { + for (int j = i; j <= atom->ntypes; j++) { + if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) + cut = init_one(i,j); + } + } + + cut_lj_innersq = cut_lj_inner * cut_lj_inner; + cut_coul_innersq = cut_coul_inner * cut_coul_inner; + cut_ljsq = cut_lj * cut_lj; + cut_coulsq = cut_coul * cut_coul; + cut_bothsq = MAX(cut_ljsq,cut_coulsq); + + denom_lj = (cut_ljsq-cut_lj_innersq) * (cut_ljsq-cut_lj_innersq) * + (cut_ljsq-cut_lj_innersq); + denom_lj = 1.0 / denom_lj; + + denom_coul = (cut_coulsq-cut_coul_innersq) * (cut_coulsq-cut_coul_innersq) * + (cut_coulsq-cut_coul_innersq); + denom_coul = 1.0 / denom_coul; + + double cell_size = sqrt(cut_bothsq) + neighbor->skin; + + int maxspecial=0; + if (atom->molecular) + maxspecial=atom->maxspecial; + + bool arithmetic = true; + for (int i = 1; i < atom->ntypes + 1; i++) + for (int j = i + 1; j < atom->ntypes + 1; j++) { + if (epsilon[i][j] != sqrt(epsilon[i][i] * epsilon[j][j])) + arithmetic = false; + if (sigma[i][j] != 0.5 * (sigma[i][i] + sigma[j][j])) + arithmetic = false; + } + + int mnf = 5e-2 * neighbor->oneatom; + int success = crm_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, + force->special_lj, atom->nlocal, + atom->nlocal+atom->nghost, mnf, maxspecial, + cell_size, gpu_mode, screen, cut_ljsq, + cut_coulsq, force->special_coul, force->qqrd2e, + cut_lj_innersq,cut_coul_innersq,denom_lj, + denom_coul,epsilon,sigma,arithmetic); + GPU_EXTRA::check_flag(success,error,world); + + if (gpu_mode == GPU_FORCE) { + int irequest = neighbor->request(this,instance_me); + neighbor->requests[irequest]->half = 0; + neighbor->requests[irequest]->full = 1; + } +} + +/* ---------------------------------------------------------------------- */ + +double PairLJCharmmCoulCharmmGPU::memory_usage() +{ + double bytes = Pair::memory_usage(); + return bytes + crm_gpu_bytes(); +} + +/* ---------------------------------------------------------------------- */ + +void PairLJCharmmCoulCharmmGPU::cpu_compute(int start, int inum, int eflag, + int vflag, int *ilist, + int *numneigh, int **firstneigh) +{ + int i,j,ii,jj,jnum,itype,jtype; + double qtmp,xtmp,ytmp,ztmp,delx,dely,delz,evdwl,ecoul,fpair; + double rsq,r2inv,r6inv,forcecoul,forcelj,factor_coul,factor_lj; + double philj,switch1,switch2; + int *jlist; + + evdwl = ecoul = 0.0; + + double **x = atom->x; + double **f = atom->f; + double *q = atom->q; + int *type = atom->type; + double *special_coul = force->special_coul; + double *special_lj = force->special_lj; + double qqrd2e = force->qqrd2e; + + // loop over neighbors of my atoms + + for (ii = start; ii < inum; ii++) { + i = ilist[ii]; + qtmp = q[i]; + xtmp = x[i][0]; + ytmp = x[i][1]; + ztmp = x[i][2]; + itype = type[i]; + jlist = firstneigh[i]; + jnum = numneigh[i]; + + for (jj = 0; jj < jnum; jj++) { + j = jlist[jj]; + factor_lj = special_lj[sbmask(j)]; + factor_coul = special_coul[sbmask(j)]; + j &= NEIGHMASK; + + delx = xtmp - x[j][0]; + dely = ytmp - x[j][1]; + delz = ztmp - x[j][2]; + rsq =
delx*delx + dely*dely + delz*delz; + + if (rsq < cut_bothsq) { + r2inv = 1.0/rsq; + + if (rsq < cut_coulsq) { + forcecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + if (rsq > cut_coul_innersq) { + switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * denom_coul; + forcecoul *= switch1; + } + } else forcecoul = 0.0; + + if (rsq < cut_ljsq) { + r6inv = r2inv*r2inv*r2inv; + jtype = type[j]; + forcelj = r6inv * (lj1[itype][jtype]*r6inv - lj2[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * denom_lj; + switch2 = 12.0*rsq * (cut_ljsq-rsq) * + (rsq-cut_lj_innersq) * denom_lj; + philj = r6inv * (lj3[itype][jtype]*r6inv - lj4[itype][jtype]); + forcelj = forcelj*switch1 + philj*switch2; + } + } else forcelj = 0.0; + + fpair = (factor_coul*forcecoul + factor_lj*forcelj) * r2inv; + + f[i][0] += delx*fpair; + f[i][1] += dely*fpair; + f[i][2] += delz*fpair; + + if (eflag) { + if (rsq < cut_coulsq) { + ecoul = qqrd2e * qtmp*q[j]*sqrt(r2inv); + if (rsq > cut_coul_innersq) { + switch1 = (cut_coulsq-rsq) * (cut_coulsq-rsq) * + (cut_coulsq + 2.0*rsq - 3.0*cut_coul_innersq) * + denom_coul; + ecoul *= switch1; + } + ecoul *= factor_coul; + } else ecoul = 0.0; + + if (rsq < cut_ljsq) { + evdwl = r6inv*(lj3[itype][jtype]*r6inv-lj4[itype][jtype]); + if (rsq > cut_lj_innersq) { + switch1 = (cut_ljsq-rsq) * (cut_ljsq-rsq) * + (cut_ljsq + 2.0*rsq - 3.0*cut_lj_innersq) * denom_lj; + evdwl *= switch1; + } + evdwl *= factor_lj; + } else evdwl = 0.0; + } + + if (evflag) ev_tally_full(i,evdwl,ecoul,fpair,delx,dely,delz); + } + } + } +} diff --git a/src/GPU/pair_lj_charmm_coul_charmm_gpu.h b/src/GPU/pair_lj_charmm_coul_charmm_gpu.h new file mode 100644 index 0000000000..d80730ca5c --- /dev/null +++ b/src/GPU/pair_lj_charmm_coul_charmm_gpu.h @@ -0,0 +1,62 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef PAIR_CLASS + +PairStyle(lj/charmm/coul/charmm/gpu,PairLJCharmmCoulCharmmGPU) + +#else + +#ifndef LMP_PAIR_LJ_CHARMM_COUL_CHARMM_GPU_H +#define LMP_PAIR_LJ_CHARMM_COUL_CHARMM_GPU_H + +#include "pair_lj_charmm_coul_charmm.h" + +namespace LAMMPS_NS { + +class PairLJCharmmCoulCharmmGPU : public PairLJCharmmCoulCharmm { + public: + PairLJCharmmCoulCharmmGPU(LAMMPS *lmp); + ~PairLJCharmmCoulCharmmGPU(); + void cpu_compute(int, int, int, int, int *, int *, int **); + void compute(int, int); + void init_style(); + double memory_usage(); + + enum { GPU_FORCE, GPU_NEIGH, GPU_HYB_NEIGH }; + + private: + int gpu_mode; + double cpu_time; +}; + +} +#endif +#endif + +/* ERROR/WARNING messages: + +E: Insufficient memory on accelerator + +There is insufficient memory on one of the devices specified for the gpu +package + +E: Pair style lj/charmm/coul/charmm/gpu requires atom attribute q + +The atom style defined does not have this attribute. + +E: Cannot use newton pair with lj/charmm/coul/charmm/gpu pair style + +Self-explanatory.
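The switch1/switch2 expressions in cpu_compute() above implement the CHARMM switching function; denom_lj and denom_coul are precomputed in init_style() as 1/(cut² − cut_inner²)³. A self-contained sketch of the energy switching factor exactly as the code writes it (the 8 Å / 10 Å cutoffs in main() are illustrative values, not from the patch):

```cpp
#include <cstdio>

// CHARMM-style switching factor used in cpu_compute() above:
//   S(rsq) = (cut^2 - rsq)^2 (cut^2 + 2 rsq - 3 cut_inner^2) / (cut^2 - cut_inner^2)^3
// S falls smoothly from 1 at the inner cutoff to 0 at the outer cutoff.
static double charmm_switch(double rsq, double cut_innersq, double cutsq) {
  if (rsq <= cut_innersq) return 1.0;
  if (rsq >= cutsq) return 0.0;
  const double denom =
      (cutsq - cut_innersq) * (cutsq - cut_innersq) * (cutsq - cut_innersq);
  return (cutsq - rsq) * (cutsq - rsq) *
         (cutsq + 2.0 * rsq - 3.0 * cut_innersq) / denom;
}

int main() {
  // Illustrative cutoffs: inner 8 Angstrom, outer 10 Angstrom.
  for (double r = 7.5; r <= 10.5; r += 0.5)
    std::printf("r = %4.1f  S = %.4f\n", r, charmm_switch(r * r, 64.0, 100.0));
  return 0;
}
```

switch2 in the force branch carries the derivative of switch1 with respect to r², so the force stays consistent with the switched energy and remains continuous at both cutoffs.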
+ +*/ diff --git a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp index b89e4d4574..9753404d5e 100644 --- a/src/GPU/pair_lj_charmm_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_charmm_coul_long_gpu.cpp @@ -203,9 +203,10 @@ void PairLJCharmmCoulLongGPU::init_style() arithmetic = false; } + int mnf = 5e-2 * neighbor->oneatom; int success = crml_gpu_init(atom->ntypes+1, cut_bothsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald, cut_lj_innersq,denom_lj,epsilon,sigma, diff --git a/src/GPU/pair_lj_class2_coul_long_gpu.cpp b/src/GPU/pair_lj_class2_coul_long_gpu.cpp index 50183196f8..3fc6195fa8 100644 --- a/src/GPU/pair_lj_class2_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_class2_coul_long_gpu.cpp @@ -188,9 +188,10 @@ void PairLJClass2CoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = c2cl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_class2_gpu.cpp b/src/GPU/pair_lj_class2_gpu.cpp index 55fdc2d43d..cf8158ce5f 100644 --- a/src/GPU/pair_lj_class2_gpu.cpp +++ b/src/GPU/pair_lj_class2_gpu.cpp @@ -157,9 +157,10 @@ void PairLJClass2GPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lj96_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cubic_gpu.cpp b/src/GPU/pair_lj_cubic_gpu.cpp index 35062a5d71..a0dd9498c6 100644 --- a/src/GPU/pair_lj_cubic_gpu.cpp +++ b/src/GPU/pair_lj_cubic_gpu.cpp @@ -52,18 +52,18 @@ int ljcb_gpu_init(const int ntypes, double **cutsq, double **cut_inner_sq, const double cell_size, int &gpu_mode, FILE *screen); void ljcb_gpu_clear(); -int ** ljcb_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** ljcb_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, const double cpu_time, + bool &success); void ljcb_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, 
int &host_start, + const double cpu_time, bool &success); double ljcb_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -165,10 +165,11 @@ void PairLJCubicGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcb_gpu_init(atom->ntypes+1, cutsq, cut_inner_sq, cut_inner, sigma, epsilon, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp index e4823a3ea4..7932a352b3 100644 --- a/src/GPU/pair_lj_cut_coul_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_cut_gpu.cpp @@ -48,16 +48,16 @@ int ljc_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); void ljc_gpu_clear(); -int ** ljc_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljc_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double *boxlo, double *prd); void ljc_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, + const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, @@ -168,9 +168,10 @@ void PairLJCutCoulCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljc_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp index 1f7ae9af01..eb8e2c9c7f 100644 --- a/src/GPU/pair_lj_cut_coul_debye_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_debye_gpu.cpp @@ -41,17 +41,17 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ljcd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double **host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double kappa); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, double **host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double kappa); void ljcd_gpu_clear(); 
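The ljcd_* prototypes around this hunk belong to lj/cut/coul/debye; the extra kappa argument threaded through ljcd_gpu_init() is the inverse Debye screening length. A sketch of the screened Coulomb energy that style evaluates, per the form documented for LAMMPS coul/debye (the charges, kappa, and the real-units conversion constant below are illustrative assumptions, not values from the patch):

```cpp
#include <cmath>
#include <cstdio>

// Debye-screened Coulomb energy: E(r) = C q_i q_j exp(-kappa r) / r,
// where C plays the role of force->qqrd2e and kappa = 1 / (Debye length).
int main() {
  const double qqrd2e = 332.06371;   // conversion constant in real units (assumption)
  const double qi = 1.0, qj = -1.0;  // hypothetical charges (e)
  const double kappa = 0.5;          // hypothetical inverse screening length (1/Angstrom)
  for (double r = 2.0; r <= 10.0; r += 2.0) {
    const double e_bare = qqrd2e * qi * qj / r;
    const double e_debye = e_bare * std::exp(-kappa * r);
    std::printf("r = %4.1f  bare = %9.4f  screened = %9.4f\n", r, e_bare, e_debye);
  }
  return 0;
}
```

The exponential factor is what lets the style use a short cutoff: beyond a few Debye lengths the screened interaction is negligible, so no long-range solver is needed.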
int ** ljcd_gpu_compute_n(const int ago, const int inum, const int nall, - double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -170,9 +170,10 @@ void PairLJCutCoulDebyeGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcd_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, kappa); diff --git a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp index 6c25412ae8..e071245a56 100644 --- a/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_dsf_gpu.cpp @@ -59,9 +59,9 @@ int ljd_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const double e_shift, const double f_shift, const double alpha); void ljd_gpu_clear(); -int ** ljd_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljd_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -185,9 +185,10 @@ void PairLJCutCoulDSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljd_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, e_shift, f_shift, alpha); diff --git a/src/GPU/pair_lj_cut_coul_long_gpu.cpp b/src/GPU/pair_lj_cut_coul_long_gpu.cpp index 50776de795..cff48afd1e 100644 --- a/src/GPU/pair_lj_cut_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_long_gpu.cpp @@ -58,8 +58,8 @@ int ljcl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double *host_special_coul, const double qqrd2e, const double g_ewald); void ljcl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **host_lj_cutsq); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **host_lj_cutsq); void ljcl_gpu_clear(); int ** ljcl_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, @@ -193,9 +193,10 @@ void PairLJCutCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git 
a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp index 33ba418533..d686ea4d88 100644 --- a/src/GPU/pair_lj_cut_coul_msm_gpu.cpp +++ b/src/GPU/pair_lj_cut_coul_msm_gpu.cpp @@ -48,15 +48,17 @@ int ljcm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const int order, const double qqrd2e); + double *host_special_coul, const int order, + const double qqrd2e); void ljcm_gpu_clear(); -int ** ljcm_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljcm_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_q, double *boxlo, double *prd); + bool &success, double *host_q, double *boxlo, + double *prd); void ljcm_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -177,12 +179,13 @@ void PairLJCutCoulMSMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljcm_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->kspace->get_gcons(), force->kspace->get_dgcons(), offset, force->special_lj, atom->nlocal, atom->nlocal+atom->nghost, - 300, maxspecial, cell_size, gpu_mode, screen, + mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->kspace->order, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp index ae93cd9010..16eef6e8e8 100644 --- a/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_cut_gpu.cpp @@ -173,9 +173,10 @@ void PairLJCutDipoleCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dpl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp index 8e7d5baddc..b7c29cedb8 100644 --- a/src/GPU/pair_lj_cut_dipole_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_dipole_long_gpu.cpp @@ -52,29 +52,30 @@ using namespace MathConst; // External functions from cuda library for atom decomposition int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, const double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, const double g_ewald); + double **host_lj2, double **host_lj3, double **host_lj4, + double 
**offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen, + double **host_cut_ljsq, const double host_cut_coulsq, + double *host_special_coul, const double qqrd2e, + const double g_ewald); void dplj_gpu_clear(); int ** dplj_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, - double *host_q, double **host_mu, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_q, double **host_mu, + double *boxlo, double *prd); void dplj_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, double **host_mu, - const int nlocal, double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, const bool eatom, + const bool vatom, int &host_start, const double cpu_time, + bool &success, double *host_q, double **host_mu, + const int nlocal, double *boxlo, double *prd); double dplj_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -196,9 +197,10 @@ void PairLJCutDipoleLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dplj_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_gpu.cpp b/src/GPU/pair_lj_cut_gpu.cpp index 2b2773b920..edd2a7feb0 100644 --- a/src/GPU/pair_lj_cut_gpu.cpp +++ b/src/GPU/pair_lj_cut_gpu.cpp @@ -47,13 +47,13 @@ int ljl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const double cell_size, int &gpu_mode, FILE *screen); void ljl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset); void ljl_gpu_clear(); -int ** ljl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljl_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -164,9 +164,10 @@ void PairLJCutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 
5e-2 * neighbor->oneatom; int success = ljl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp index 3e852513b2..9584c6f68a 100644 --- a/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp +++ b/src/GPU/pair_lj_cut_tip4p_long_gpu.cpp @@ -229,10 +229,11 @@ void PairLJCutTIP4PLongGPU::init_style() error->warning(FLERR,"Increasing communication cutoff for TIP4P GPU style"); } + int mnf = 5e-2 * neighbor->oneatom; int success = ljtip4p_long_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, typeH, typeO, alpha, qdist, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, cut_coulsqplus, force->special_coul, force->qqrd2e, diff --git a/src/GPU/pair_lj_expand_coul_long_gpu.cpp b/src/GPU/pair_lj_expand_coul_long_gpu.cpp index 533f9d9070..da0c720c74 100644 --- a/src/GPU/pair_lj_expand_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_expand_coul_long_gpu.cpp @@ -50,31 +50,31 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - double **host_cut_ljsq, double host_cut_coulsq, - double *host_special_coul, const double qqrd2e, - const double g_ewald); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double *special_lj, + const int nlocal, const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, double **host_cut_ljsq, + double host_cut_coulsq, double *host_special_coul, + const double qqrd2e, const double g_ewald); int ljecl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift, double **host_lj_cutsq); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift, double **host_lj_cutsq); void ljecl_gpu_clear(); int ** ljecl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, - int **nspecial, tagint **special, const bool eflag, - const bool vflag, const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success, double *host_q, - double *boxlo, double *prd); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, double *host_q, + double *boxlo, double *prd); void ljecl_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double 
*host_q, - const int nlocal, double *boxlo, double *prd); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + const int nlocal, double *boxlo, double *prd); double ljecl_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -193,9 +193,10 @@ void PairLJExpandCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljecl_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, shift, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_expand_gpu.cpp b/src/GPU/pair_lj_expand_gpu.cpp index d3745dce56..0e86e41255 100644 --- a/src/GPU/pair_lj_expand_gpu.cpp +++ b/src/GPU/pair_lj_expand_gpu.cpp @@ -47,8 +47,8 @@ int lje_gpu_init(const int ntypes, double **cutsq, double **host_lj1, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void lje_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double **shift); + double **host_lj2, double **host_lj3, double **host_lj4, + double **offset, double **shift); void lje_gpu_clear(); int ** lje_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, @@ -161,9 +161,10 @@ void PairLJExpandGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = lje_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, offset, shift, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_gromacs_gpu.cpp b/src/GPU/pair_lj_gromacs_gpu.cpp index 1bffbcd0b9..a605ebd6c4 100644 --- a/src/GPU/pair_lj_gromacs_gpu.cpp +++ b/src/GPU/pair_lj_gromacs_gpu.cpp @@ -43,16 +43,17 @@ using namespace LAMMPS_NS; int ljgrm_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_lj2, double **host_lj3, double **host_lj4, - double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, + double *special_lj, const int inum, const int nall, + const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen, - double **host_ljsw1, double **host_ljsw2, double **host_ljsw3, - double **host_ljsw4, double **host_ljsw5, - double **cut_inner, double **cut_innersq); + double **host_ljsw1, double **host_ljsw2, + double **host_ljsw3, double **host_ljsw4, + double **host_ljsw5, double **cut_inner, + double **cut_innersq); void ljgrm_gpu_clear(); -int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** ljgrm_gpu_compute_n(const int ago, const int inum_full, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, 
int &host_start, int **ilist, int **jnum, const double cpu_time, @@ -164,9 +165,10 @@ void PairLJGromacsGPU::init_style() if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ljgrm_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, ljsw1, ljsw2, ljsw3, ljsw4, ljsw5, cut_inner, cut_inner_sq); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp index a3ba87c82e..df2310e904 100644 --- a/src/GPU/pair_lj_sdk_coul_long_gpu.cpp +++ b/src/GPU/pair_lj_sdk_coul_long_gpu.cpp @@ -197,9 +197,10 @@ void PairLJSDKCoulLongGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = sdkl_gpu_init(atom->ntypes+1, cutsq, lj_type, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e, g_ewald); diff --git a/src/GPU/pair_lj_sdk_gpu.cpp b/src/GPU/pair_lj_sdk_gpu.cpp index baf341c25a..5a1960e4c8 100644 --- a/src/GPU/pair_lj_sdk_gpu.cpp +++ b/src/GPU/pair_lj_sdk_gpu.cpp @@ -166,9 +166,10 @@ void PairLJSDKGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = sdk_gpu_init(atom->ntypes+1,cutsq,lj_type,lj1,lj2,lj3,lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp index 6f0ebc58b7..470c2f049e 100644 --- a/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp +++ b/src/GPU/pair_lj_sf_dipole_sf_gpu.cpp @@ -48,21 +48,21 @@ int dplsf_gpu_init(const int ntypes, double **cutsq, double **host_lj1, double **host_cut_ljsq, double **host_cut_coulsq, double *host_special_coul, const double qqrd2e); void dplsf_gpu_clear(); -int ** dplsf_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** dplsf_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double *host_q, double **host_mu, double *boxlo, double *prd); -void dplsf_gpu_compute(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, const bool eatom, - const bool vatom, int &host_start, const double cpu_time, - bool &success, double *host_q, double **host_mu, const int nlocal, - double *boxlo, double *prd); +void dplsf_gpu_compute(const int ago, const int inum, const int nall, + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, double *host_q, + double **host_mu, const int nlocal, double 
*boxlo, + double *prd); double dplsf_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -172,9 +172,10 @@ void PairLJSFDipoleSFGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = dplsf_gpu_init(atom->ntypes+1, cutsq, lj1, lj2, lj3, lj4, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, cut_ljsq, cut_coulsq, force->special_coul, force->qqrd2e); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_mie_cut_gpu.cpp b/src/GPU/pair_mie_cut_gpu.cpp index e9e6eedde8..05e92909da 100644 --- a/src/GPU/pair_mie_cut_gpu.cpp +++ b/src/GPU/pair_mie_cut_gpu.cpp @@ -47,9 +47,9 @@ int mie_gpu_init(const int ntypes, double **cutsq, double **host_mie1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void mie_gpu_clear(); -int ** mie_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** mie_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -161,9 +161,10 @@ void PairMIECutGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = mie_gpu_init(atom->ntypes+1, cutsq, mie1, mie2, mie3, mie4, gamA, gamR, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_morse_gpu.cpp b/src/GPU/pair_morse_gpu.cpp index 75ca5627ba..d929c76930 100644 --- a/src/GPU/pair_morse_gpu.cpp +++ b/src/GPU/pair_morse_gpu.cpp @@ -46,9 +46,9 @@ int mor_gpu_init(const int ntypes, double **cutsq, double **host_morse1, const int nall, const int max_nbors, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void mor_gpu_clear(); -int ** mor_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** mor_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -157,9 +157,10 @@ void PairMorseGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = mor_gpu_init(atom->ntypes+1, cutsq, morse1, r0, alpha, d0, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_resquared_gpu.cpp b/src/GPU/pair_resquared_gpu.cpp index b6c212da6f..c816ad9166 100644 --- a/src/GPU/pair_resquared_gpu.cpp +++ b/src/GPU/pair_resquared_gpu.cpp @@ -44,16 +44,16 @@ using namespace LAMMPS_NS; int re_gpu_init(const int ntypes, double **shape, double **well, 
double **cutsq, double **sigma, double **epsilon, - int **form, double **host_lj1, - double **host_lj2, double **host_lj3, double **host_lj4, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + int **form, double **host_lj1, double **host_lj2, + double **host_lj3, double **host_lj4, double **offset, + double *special_lj, const int nlocal, const int nall, + const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); void re_gpu_clear(); int ** re_gpu_compute_n(const int ago, const int inum, const int nall, double **host_x, int *host_type, double *sublo, - double *subhi, tagint *tag, int **nspecial, tagint **special, - const bool eflag, const bool vflag, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success, double **host_quat); @@ -205,10 +205,11 @@ void PairRESquaredGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = re_gpu_init(atom->ntypes+1, shape1, well, cutsq, sigma, epsilon, form, lj1, lj2, lj3, lj4, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_soft_gpu.cpp b/src/GPU/pair_soft_gpu.cpp index c9eb55157a..5a3ad0c577 100644 --- a/src/GPU/pair_soft_gpu.cpp +++ b/src/GPU/pair_soft_gpu.cpp @@ -48,13 +48,13 @@ int soft_gpu_init(const int ntypes, double **cutsq, double **prefactor, void soft_gpu_reinit(const int ntypes, double **cutsq, double **host_prefactor, double **host_cut); void soft_gpu_clear(); -int ** soft_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** soft_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); void soft_gpu_compute(const int ago, const int inum, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -162,9 +162,10 @@ void PairSoftGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = soft_gpu_init(atom->ntypes+1, cutsq, prefactor, cut, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_sw_gpu.cpp b/src/GPU/pair_sw_gpu.cpp index 3d851121e0..7bfbe2810f 100644 --- a/src/GPU/pair_sw_gpu.cpp +++ b/src/GPU/pair_sw_gpu.cpp @@ -38,31 +38,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int sw_gpu_init(const int ntypes, const int inum, const int 
nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* sw_epsilon, const double* sw_sigma, - const double* sw_lambda, const double* sw_gamma, - const double* sw_costheta, const double* sw_biga, - const double* sw_bigb, const double* sw_powerp, - const double* sw_powerq, const double* sw_cut, - const double* sw_cutsq); +int sw_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, int &gpu_mode, + FILE *screen, double **ncutsq, double **ncut, double **sigma, + double **powerp, double **powerq, double **sigma_gamma, + double **c1, double **c2, double **c3,double **c4, + double **c5, double **c6, double ***lambda_epsilon, + double ***costheta, const int *map, int ***e2param); void sw_gpu_clear(); -int ** sw_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** sw_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void sw_gpu_compute(const int ago, const int nloc, const int nall, const int ln, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); +void sw_gpu_compute(const int ago, const int nloc, const int nall, + const int ln, double **host_x, int *host_type, int *ilist, + int *numj, int **firstneigh, const bool eflag, + const bool vflag, const bool eatom, const bool vatom, + int &host_start, const double cpu_time, bool &success); double sw_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); #define MAXLINE 1024 #define DELTA 4 @@ -159,55 +155,84 @@ void PairSWGPU::init_style() if (force->newton_pair != 0) error->all(FLERR,"Pair style sw/gpu requires newton pair off"); - double *epsilon, *sigma, *lambda, *gamma; - double *biga, *bigb, *powerp, *powerq; - double *_cut, *_cutsq, *costheta; - epsilon = sigma = lambda = gamma = nullptr; - biga = bigb = powerp = powerq = nullptr; - _cut = _cutsq = costheta = nullptr; + double **c1, **c2, **c3, **c4, **c5, **c6; + double **ncutsq, **ncut, **sigma, **powerp, **powerq, **sigma_gamma; + double ***lambda_epsilon, ***costheta; + c1 = c2 = c3 = c4 = c5 = c6 = nullptr; + ncutsq = ncut = sigma = powerp = powerq = sigma_gamma = nullptr; + lambda_epsilon = costheta = nullptr; - memory->create(epsilon,nparams,"pair:epsilon"); - memory->create(sigma,nparams,"pair:sigma"); - memory->create(lambda,nparams,"pair:lambda"); - memory->create(gamma,nparams,"pair:gamma"); - memory->create(biga,nparams,"pair:biga"); - memory->create(bigb,nparams,"pair:bigb"); - memory->create(powerp,nparams,"pair:powerp"); - memory->create(powerq,nparams,"pair:powerq"); - memory->create(_cut,nparams,"pair:_cut"); - memory->create(_cutsq,nparams,"pair:_cutsq"); - memory->create(costheta,nparams,"pair:costheta"); + const int tp1 = atom->ntypes + 1; - for (int i = 0; i < nparams; i++) { - epsilon[i] = params[i].epsilon; - sigma[i] = params[i].sigma; - lambda[i] = params[i].lambda; - gamma[i] = 
params[i].gamma; - biga[i] = params[i].biga; - bigb[i] = params[i].bigb; - powerp[i] = params[i].powerp; - powerq[i] = params[i].powerq; - _cut[i] = params[i].cut; - _cutsq[i] = params[i].cutsq; - costheta[i] = params[i].costheta; + memory->create(ncutsq, tp1, tp1, "pair:ncutsq"); + memory->create(ncut, tp1, tp1, "pair:ncut"); + memory->create(sigma, tp1, tp1, "pair:sigma"); + memory->create(powerp, tp1, tp1, "pair:powerp"); + memory->create(powerq, tp1, tp1, "pair:powerq"); + memory->create(sigma_gamma, tp1, tp1, "pair:sigma_gamma"); + memory->create(c1, tp1, tp1, "pair:c1"); + memory->create(c2, tp1, tp1, "pair:c2"); + memory->create(c3, tp1, tp1, "pair:c3"); + memory->create(c4, tp1, tp1, "pair:c4"); + memory->create(c5, tp1, tp1, "pair:c5"); + memory->create(c6, tp1, tp1, "pair:c6"); + memory->create(lambda_epsilon, tp1, tp1, tp1, "pair:lambda_epsilon"); + memory->create(costheta, tp1, tp1, tp1, "pair:costheta"); + + for (int ii = 1; ii < tp1; ii++) { + int i = map[ii]; + for (int jj = 1; jj < tp1; jj++) { + int j = map[jj]; + if (i < 0 || j < 0) + continue; + else { + int ijparam = elem2param[i][j][j]; + ncutsq[ii][jj] = params[ijparam].cutsq; + ncut[ii][jj] = params[ijparam].cut; + sigma[ii][jj] = params[ijparam].sigma; + powerp[ii][jj] = params[ijparam].powerp; + powerq[ii][jj] = params[ijparam].powerq; + sigma_gamma[ii][jj] = params[ijparam].sigma_gamma; + c1[ii][jj] = params[ijparam].c1; + c2[ii][jj] = params[ijparam].c2; + c3[ii][jj] = params[ijparam].c3; + c4[ii][jj] = params[ijparam].c4; + c5[ii][jj] = params[ijparam].c5; + c6[ii][jj] = params[ijparam].c6; + } + + for (int kk = 1; kk < tp1; kk++) { + int k = map[kk]; + if (k < 0) + continue; + else { + int ijkparam = elem2param[i][j][k]; + costheta[ii][jj][kk] = params[ijkparam].costheta; + lambda_epsilon[ii][jj][kk] = params[ijkparam].lambda_epsilon; + } + } + } } - int success = sw_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, 300, - cell_size, gpu_mode, screen, map, nelements, - elem2param, nparams, epsilon, - sigma, lambda, gamma, costheta, biga, bigb, - powerp, powerq, _cut, _cutsq); + int mnf = 5e-2 * neighbor->oneatom; + int success = sw_gpu_init(tp1, atom->nlocal, atom->nlocal+atom->nghost, mnf, + cell_size, gpu_mode, screen, ncutsq, ncut, sigma, + powerp, powerq, sigma_gamma, c1, c2, c3, c4, c5, + c6, lambda_epsilon, costheta, map, elem2param); - memory->destroy(epsilon); + memory->destroy(ncutsq); + memory->destroy(ncut); memory->destroy(sigma); - memory->destroy(lambda); - memory->destroy(gamma); - memory->destroy(biga); - memory->destroy(bigb); memory->destroy(powerp); memory->destroy(powerq); - memory->destroy(_cut); - memory->destroy(_cutsq); + memory->destroy(sigma_gamma); + memory->destroy(c1); + memory->destroy(c2); + memory->destroy(c3); + memory->destroy(c4); + memory->destroy(c5); + memory->destroy(c6); + memory->destroy(lambda_epsilon); memory->destroy(costheta); GPU_EXTRA::check_flag(success,error,world); @@ -218,7 +243,6 @@ void PairSWGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser=2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_table_gpu.cpp b/src/GPU/pair_table_gpu.cpp index e3cb740e0e..05b76d9adb 100644 --- a/src/GPU/pair_table_gpu.cpp +++ b/src/GPU/pair_table_gpu.cpp @@ -231,9 +231,10 @@ void PairTableGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int 
success = table_gpu_init(atom->ntypes+1, cutsq, table_coeffs, table_data, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, tabstyle, ntables, tablength); GPU_EXTRA::check_flag(success,error,world); @@ -243,7 +244,6 @@ void PairTableGPU::init_style() neighbor->requests[irequest]->half = 0; neighbor->requests[irequest]->full = 1; } - memory->destroy(table_coeffs); memory->destroy(table_data); } diff --git a/src/GPU/pair_tersoff_gpu.cpp b/src/GPU/pair_tersoff_gpu.cpp index 8758150956..e675ba6903 100644 --- a/src/GPU/pair_tersoff_gpu.cpp +++ b/src/GPU/pair_tersoff_gpu.cpp @@ -66,8 +66,6 @@ void tersoff_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); #define MAXLINE 1024 #define DELTA 4 @@ -216,8 +214,9 @@ void PairTersoffGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -252,7 +251,6 @@ void PairTersoffGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_tersoff_mod_gpu.cpp b/src/GPU/pair_tersoff_mod_gpu.cpp index 71734c1c09..98a7248c1f 100644 --- a/src/GPU/pair_tersoff_mod_gpu.cpp +++ b/src/GPU/pair_tersoff_mod_gpu.cpp @@ -43,9 +43,10 @@ int tersoff_mod_gpu_init(const int ntypes, const int inum, const int nall, int* host_map, const int nelements, int*** host_elem2param, const int nparams, const double* ts_lam1, const double* ts_lam2, const double* ts_lam3, const double* ts_powermint, const double* ts_biga, const double* ts_bigb, - const double* ts_bigr, const double* ts_bigd, const double* ts_c1, const double* ts_c2, - const double* ts_c3, const double* ts_c4, const double* ts_c5, const double* ts_h, - const double* ts_beta, const double* ts_powern, const double* ts_powern_del, + const double* ts_bigr, const double* ts_bigd, const double* ts_c1, + const double* ts_c2, const double* ts_c3, const double* ts_c4, + const double* ts_c5, const double* ts_h, const double* ts_beta, + const double* ts_powern, const double* ts_powern_del, const double* ts_ca1, const double* ts_cutsq); void tersoff_mod_gpu_clear(); int ** tersoff_mod_gpu_compute_n(const int ago, const int inum_full, @@ -61,8 +62,6 @@ void tersoff_mod_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_mod_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -208,8 +207,9 @@ void PairTersoffMODGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_mod_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + 
atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -244,7 +244,6 @@ void PairTersoffMODGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_tersoff_zbl_gpu.cpp b/src/GPU/pair_tersoff_zbl_gpu.cpp index e662159fa8..e17b48fec5 100644 --- a/src/GPU/pair_tersoff_zbl_gpu.cpp +++ b/src/GPU/pair_tersoff_zbl_gpu.cpp @@ -69,8 +69,6 @@ void tersoff_zbl_gpu_compute(const int ago, const int nlocal, const int nall, const bool vflag, const bool eatom, const bool vatom, int &host_start, const double cpu_time, bool &success); double tersoff_zbl_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -225,8 +223,9 @@ void PairTersoffZBLGPU::init_style() _cutsq[i] = params[i].cutsq; } + int mnf = 5e-2 * neighbor->oneatom; int success = tersoff_zbl_gpu_init(atom->ntypes+1, atom->nlocal, - atom->nlocal+atom->nghost, 300, + atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, lam1, lam2, lam3, powermint, biga, bigb, bigr, bigd, @@ -266,7 +265,6 @@ void PairTersoffZBLGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser = 2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_ufm_gpu.cpp b/src/GPU/pair_ufm_gpu.cpp index 87354acda9..f950bf11c3 100644 --- a/src/GPU/pair_ufm_gpu.cpp +++ b/src/GPU/pair_ufm_gpu.cpp @@ -43,28 +43,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ufml_gpu_init(const int ntypes, double **cutsq, double **host_uf1, - double **host_uf2, double **host_uf3, - double **offset, double *special_lj, const int nlocal, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen); + double **host_uf2, double **host_uf3, + double **offset, double *special_lj, const int nlocal, + const int nall, const int max_nbors, const int maxspecial, + const double cell_size, int &gpu_mode, FILE *screen); int ufml_gpu_reinit(const int ntypes, double **cutsq, double **host_uf1, - double **host_uf2, double **host_uf3, - double **offset); + double **host_uf2, double **host_uf3, double **offset); void ufml_gpu_clear(); -int ** ufml_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, - const double cpu_time, bool &success); +int ** ufml_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, + tagint **special, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + int **ilist, int **jnum, + const double cpu_time, bool &success); void ufml_gpu_compute(const int ago, const int inum, const int nall, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const 
bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); + double **host_x, int *host_type, int *ilist, int *numj, + int **firstneigh, const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double ufml_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -166,9 +165,10 @@ void PairUFMGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ufml_gpu_init(atom->ntypes+1, cutsq, uf1, uf2, uf3, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_vashishta_gpu.cpp b/src/GPU/pair_vashishta_gpu.cpp index df17b2091a..c5dd722974 100644 --- a/src/GPU/pair_vashishta_gpu.cpp +++ b/src/GPU/pair_vashishta_gpu.cpp @@ -38,34 +38,34 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition -int vashishta_gpu_init(const int ntypes, const int inum, const int nall, const int max_nbors, - const double cell_size, int &gpu_mode, FILE *screen, - int* host_map, const int nelements, int*** host_elem2param, const int nparams, - const double* cutsq, const double* r0, - const double* gamma, const double* eta, - const double* lam1inv, const double* lam4inv, - const double* zizj, const double* mbigd, - const double* dvrc, const double* big6w, - const double* heta, const double* bigh, - const double* bigw, const double* c0, - const double* costheta, const double* bigb, - const double* big2b, const double* bigc); +int vashishta_gpu_init(const int ntypes, const int inum, const int nall, + const int max_nbors, const double cell_size, + int &gpu_mode, FILE *screen, int* host_map, + const int nelements, int*** host_elem2param, + const int nparams, const double* cutsq, const double* r0, + const double* gamma, const double* eta, + const double* lam1inv, const double* lam4inv, + const double* zizj, const double* mbigd, + const double* dvrc, const double* big6w, + const double* heta, const double* bigh, + const double* bigw, const double* c0, + const double* costheta, const double* bigb, + const double* big2b, const double* bigc); void vashishta_gpu_clear(); -int ** vashishta_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** vashishta_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, const double cpu_time, bool &success); -void vashishta_gpu_compute(const int ago, const int nloc, const int nall, const int ln, - double **host_x, int *host_type, int *ilist, int *numj, - int **firstneigh, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success); +void vashishta_gpu_compute(const int ago, const int nloc, const int nall, + const int ln, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success); double 
vashishta_gpu_bytes(); -extern double lmp_gpu_forces(double **f, double **tor, double *eatom, - double **vatom, double *virial, double &ecoul); /* ---------------------------------------------------------------------- */ @@ -214,7 +214,8 @@ void PairVashishtaGPU::init_style() big2b[i] = params[i].big2b; bigc[i] = params[i].bigc; } - int success = vashishta_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, 500, + int mnf = 5e-2 * neighbor->oneatom; + int success = vashishta_gpu_init(atom->ntypes+1, atom->nlocal, atom->nlocal+atom->nghost, mnf, cell_size, gpu_mode, screen, map, nelements, elem2param, nparams, cutsq, r0, gamma, eta, lam1inv, lam4inv, zizj, mbigd, dvrc, big6w, heta, bigh, bigw, @@ -246,7 +247,6 @@ void PairVashishtaGPU::init_style() neighbor->requests[irequest]->full = 1; neighbor->requests[irequest]->ghost = 1; } - if (comm->cutghostuser < (2.0*cutmax + neighbor->skin)) { comm->cutghostuser=2.0*cutmax + neighbor->skin; if (comm->me == 0) diff --git a/src/GPU/pair_yukawa_colloid_gpu.cpp b/src/GPU/pair_yukawa_colloid_gpu.cpp index 8da3b48dd5..9322f95f44 100644 --- a/src/GPU/pair_yukawa_colloid_gpu.cpp +++ b/src/GPU/pair_yukawa_colloid_gpu.cpp @@ -41,24 +41,27 @@ using namespace LAMMPS_NS; // External functions from cuda library for atom decomposition int ykcolloid_gpu_init(const int ntypes, double **cutsq, double **host_a, - double **host_offset, double *special_lj, const int inum, - const int nall, const int max_nbors, const int maxspecial, - const double cell_size, int &gpu_mode, FILE *screen, - const double kappa); + double **host_offset, double *special_lj, const int inum, + const int nall, const int max_nbors, + const int maxspecial, const double cell_size, + int &gpu_mode, FILE *screen, const double kappa); void ykcolloid_gpu_clear(); int ** ykcolloid_gpu_compute_n(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - int **ilist, int **jnum, const double cpu_time, - bool &success, double *host_rad); + const int nall, double **host_x, int *host_type, + double *sublo, double *subhi, tagint *tag, + int **nspecial, tagint **special, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, + int &host_start, int **ilist, int **jnum, + const double cpu_time, bool &success, + double *host_rad); void ykcolloid_gpu_compute(const int ago, const int inum_full, - const int nall, double **host_x, int *host_type, - int *ilist, int *numj, int **firstneigh, - const bool eflag, const bool vflag, - const bool eatom, const bool vatom, int &host_start, - const double cpu_time, bool &success, double *host_rad); + const int nall, double **host_x, int *host_type, + int *ilist, int *numj, int **firstneigh, + const bool eflag, const bool vflag, + const bool eatom, const bool vatom, int &host_start, + const double cpu_time, bool &success, + double *host_rad); double ykcolloid_gpu_bytes(); /* ---------------------------------------------------------------------- */ @@ -167,9 +170,10 @@ void PairYukawaColloidGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = ykcolloid_gpu_init(atom->ntypes+1, cutsq, a, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen, kappa); 
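+ // note: mnf estimates the maximum number of neighbors per atom for + // the GPU neighbor list as 5 percent of neighbor->oneatom (the + // neigh_modify one setting); it replaces the fixed value of 300 + // passed here and in the other GPU pair styles above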
GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_yukawa_gpu.cpp b/src/GPU/pair_yukawa_gpu.cpp index 8c133b068e..81304159a0 100644 --- a/src/GPU/pair_yukawa_gpu.cpp +++ b/src/GPU/pair_yukawa_gpu.cpp @@ -49,10 +49,10 @@ void yukawa_gpu_clear(); int ** yukawa_gpu_compute_n(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, double *sublo, double *subhi, tagint *tag, int **nspecial, - tagint **special, const bool eflag, const bool vflag, - const bool eatom, const bool vatom, - int &host_start, int **ilist, int **jnum, - const double cpu_time, bool &success); + tagint **special, const bool eflag, + const bool vflag, const bool eatom, + const bool vatom, int &host_start, int **ilist, + int **jnum, const double cpu_time, bool &success); void yukawa_gpu_compute(const int ago, const int inum_full, const int nall, double **host_x, int *host_type, int *ilist, int *numj, int **firstneigh, const bool eflag, const bool vflag, @@ -159,9 +159,10 @@ void PairYukawaGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = yukawa_gpu_init(atom->ntypes+1, cutsq, kappa, a, offset, force->special_lj, atom->nlocal, - atom->nlocal+atom->nghost, 300, maxspecial, + atom->nlocal+atom->nghost, mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); diff --git a/src/GPU/pair_zbl_gpu.cpp b/src/GPU/pair_zbl_gpu.cpp index eda0c26614..93e0588285 100644 --- a/src/GPU/pair_zbl_gpu.cpp +++ b/src/GPU/pair_zbl_gpu.cpp @@ -50,9 +50,9 @@ int zbl_gpu_init(const int ntypes, double **cutsq, double **host_sw1, const int maxspecial, const double cell_size, int &gpu_mode, FILE *screen); void zbl_gpu_clear(); -int ** zbl_gpu_compute_n(const int ago, const int inum, - const int nall, double **host_x, int *host_type, - double *sublo, double *subhi, tagint *tag, int **nspecial, +int ** zbl_gpu_compute_n(const int ago, const int inum, const int nall, + double **host_x, int *host_type, double *sublo, + double *subhi, tagint *tag, int **nspecial, tagint **special, const bool eflag, const bool vflag, const bool eatom, const bool vatom, int &host_start, int **ilist, int **jnum, @@ -165,11 +165,12 @@ void PairZBLGPU::init_style() int maxspecial=0; if (atom->molecular) maxspecial=atom->maxspecial; + int mnf = 5e-2 * neighbor->oneatom; int success = zbl_gpu_init(atom->ntypes+1, cutsq, sw1, sw2, sw3, sw4, sw5, d1a, d2a, d3a, d4a, zze, cut_globalsq, cut_innersq, cut_inner, atom->nlocal, atom->nlocal+atom->nghost, - 300, maxspecial, cell_size, gpu_mode, screen); + mnf, maxspecial, cell_size, gpu_mode, screen); GPU_EXTRA::check_flag(success,error,world); if (gpu_mode == GPU_FORCE) { diff --git a/src/GPU/pppm_gpu.cpp b/src/GPU/pppm_gpu.cpp index cc7ef8841e..61d0144b73 100644 --- a/src/GPU/pppm_gpu.cpp +++ b/src/GPU/pppm_gpu.cpp @@ -80,9 +80,9 @@ FFT_SCALAR* PPPM_GPU_API(init)(const int nlocal, const int nall, FILE *screen, const bool respa, int &success); void PPPM_GPU_API(clear)(const double poisson_time); int PPPM_GPU_API(spread)(const int ago, const int nlocal, const int nall, - double **host_x, int *host_type, bool &success, - double *host_q, double *boxlo, const double delxinv, - const double delyinv, const double delzinv); + double **host_x, int *host_type, bool &success, + double *host_q, double *boxlo, const double delxinv, + const double delyinv, const double delzinv); void PPPM_GPU_API(interp)(const FFT_SCALAR qqrd2e_scale); double PPPM_GPU_API(bytes)(); void 
PPPM_GPU_API(forces)(double **f); @@ -208,9 +208,9 @@ void PPPMGPU::compute(int eflag, int vflag) if (triclinic == 0) { bool success = true; int flag=PPPM_GPU_API(spread)(nago, atom->nlocal, atom->nlocal + - atom->nghost, atom->x, atom->type, success, - atom->q, domain->boxlo, delxinv, delyinv, - delzinv); + atom->nghost, atom->x, atom->type, success, + atom->q, domain->boxlo, delxinv, delyinv, + delzinv); if (!success) error->one(FLERR,"Insufficient memory on accelerator"); if (flag != 0) @@ -402,7 +402,7 @@ void PPPMGPU::poisson_ik() work1[n++] = ZEROF; } - fft1->compute(work1,work1,1); + fft1->compute(work1,work1,FFT3d::FORWARD); // if requested, compute energy and virial contribution @@ -441,7 +441,7 @@ void PPPMGPU::poisson_ik() if (evflag_atom) poisson_peratom(); - // compute gradients of V(r) in each of 3 dims by transformimg -ik*V(k) + // compute gradients of V(r) in each of 3 dims by transforming ik*V(k) // FFT leaves data in 3d brick decomposition // copy it into inner portion of vdx,vdy,vdz arrays @@ -451,12 +451,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkx[i]*work1[n+1]; - work2[n+1] = -fkx[i]*work1[n]; + work2[n] = -fkx[i]*work1[n+1]; + work2[n+1] = fkx[i]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; int x_hi = nxhi_in * 4 + 3; @@ -473,12 +473,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fky[j]*work1[n+1]; - work2[n+1] = -fky[j]*work1[n]; + work2[n] = -fky[j]*work1[n+1]; + work2[n+1] = fky[j]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; for (k = nzlo_in; k <= nzhi_in; k++) @@ -494,12 +494,12 @@ void PPPMGPU::poisson_ik() for (k = nzlo_fft; k <= nzhi_fft; k++) for (j = nylo_fft; j <= nyhi_fft; j++) for (i = nxlo_fft; i <= nxhi_fft; i++) { - work2[n] = fkz[k]*work1[n+1]; - work2[n+1] = -fkz[k]*work1[n]; + work2[n] = -fkz[k]*work1[n+1]; + work2[n+1] = fkz[k]*work1[n]; n += 2; } - fft2->compute(work2,work2,-1); + fft2->compute(work2,work2,FFT3d::BACKWARD); n = 0; for (k = nzlo_in; k <= nzhi_in; k++) diff --git a/src/MAKE/OPTIONS/Makefile.g++_openmpi b/src/MAKE/OPTIONS/Makefile.g++_openmpi index 548994f832..75c12f9b38 100644 --- a/src/MAKE/OPTIONS/Makefile.g++_openmpi +++ b/src/MAKE/OPTIONS/Makefile.g++_openmpi @@ -7,12 +7,12 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler export OMPI_CXX = g++ -CC = mpicxx +CC = mpicxx -std=c++11 CCFLAGS = -g -O3 SHFLAGS = -fPIC DEPFLAGS = -M -LINK = mpicxx +LINK = mpicxx -std=c++11 LINKFLAGS = -g -O LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.g++_serial b/src/MAKE/OPTIONS/Makefile.g++_serial index 65de6a2c2c..4f6f0afe22 100644 --- a/src/MAKE/OPTIONS/Makefile.g++_serial +++ b/src/MAKE/OPTIONS/Makefile.g++_serial @@ -6,12 +6,12 @@ SHELL = /bin/sh # compiler/linker settings # specify flags and libraries needed for your compiler -CC = g++ +CC = g++ -std=c++11 CCFLAGS = -g -O3 SHFLAGS = -fPIC DEPFLAGS = -M -LINK = g++ +LINK = g++ -std=c++11 LINKFLAGS = -g -O LIB = SIZE = size diff --git a/src/MAKE/OPTIONS/Makefile.oneapi b/src/MAKE/OPTIONS/Makefile.oneapi new file mode 100644 index 0000000000..2524773a76 --- /dev/null +++ b/src/MAKE/OPTIONS/Makefile.oneapi @@ -0,0 +1,122 @@ +# oneapi = For Intel oneAPI builds with GPU package + +SHELL = /bin/sh + +# 
--------------------------------------------------------------------- +# compiler/linker settings +# specify flags and libraries needed for your compiler + +CC = mpiicpc -std=c++11 +OPTFLAGS = -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits +CCFLAGS = -qopenmp -qopenmp-simd -qno-offload -ansi-alias -restrict \ + -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS) \ + -I$(MKLROOT)/include +SHFLAGS = -fPIC +DEPFLAGS = -M + +LINK = mpiicpc -std=c++11 +LINKFLAGS = -qopenmp -qopenmp-simd $(OPTFLAGS) -L$(MKLROOT)/lib/intel64/ +LIB = -ltbbmalloc -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core +SIZE = size + +ARCHIVE = ar +ARFLAGS = -rc +SHLIBFLAGS = -shared + +# --------------------------------------------------------------------- +# LAMMPS-specific settings, all OPTIONAL +# specify settings for LAMMPS features you will use +# if you change any -D setting, do full re-compile after "make clean" + +# LAMMPS ifdef settings +# see possible settings in Section 3.5 of the manual + +LMP_INC = -DLAMMPS_GZIP + +# MPI library +# see discussion in Section 3.4 of the manual +# MPI wrapper compiler/linker can provide this info +# can point to dummy MPI library in src/STUBS as in Makefile.serial +# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts +# INC = path for mpi.h, MPI compiler settings +# PATH = path for MPI library +# LIB = name of MPI library + +MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 +MPI_PATH = +MPI_LIB = + +# FFT library +# see discussion in Section 3.5.2 of manual +# can be left blank to use provided KISS FFT library +# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings +# PATH = path for FFT library +# LIB = name of FFT library + +FFT_INC = -DFFT_MKL -DFFT_SINGLE +FFT_PATH = +FFT_LIB = + +# JPEG and/or PNG library +# see discussion in Section 3.5.4 of manual +# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC +# INC = path(s) for jpeglib.h and/or png.h +# PATH = path(s) for JPEG library and/or PNG library +# LIB = name(s) of JPEG library and/or PNG library + +JPG_INC = +JPG_PATH = +JPG_LIB = + +# --------------------------------------------------------------------- +# build rules and dependencies +# do not edit this section + +include Makefile.package.settings +include Makefile.package + +EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC) +EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH) +EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB) +EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS) +EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS) + +# Path to src files + +vpath %.cpp .. +vpath %.h .. 
+ + # Link target + + $(EXE): main.o $(LMPLIB) $(EXTRA_LINK_DEPENDS) + $(LINK) $(LINKFLAGS) main.o $(EXTRA_PATH) $(LMPLINK) $(EXTRA_LIB) $(LIB) -o $@ + $(SIZE) $@ + + # Library targets + + $(ARLIB): $(OBJ) $(EXTRA_LINK_DEPENDS) + @rm -f ../$(ARLIB) + $(ARCHIVE) $(ARFLAGS) ../$(ARLIB) $(OBJ) + @rm -f $(ARLIB) + @ln -s ../$(ARLIB) $(ARLIB) + + $(SHLIB): $(OBJ) $(EXTRA_LINK_DEPENDS) + $(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o ../$(SHLIB) \ + $(OBJ) $(EXTRA_LIB) $(LIB) + @rm -f $(SHLIB) + @ln -s ../$(SHLIB) $(SHLIB) + + # Compilation rules + + %.o:%.cpp + $(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $< + + # Individual dependencies + + depend : fastdep.exe $(SRC) + @./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1 + + fastdep.exe: ../DEPEND/fastdep.c + cc -O -o $@ $< + + sinclude .depend diff --git a/src/STUBS/Makefile b/src/STUBS/Makefile index 3c3c3b46d9..c9b6fdb65a 100644 --- a/src/STUBS/Makefile +++ b/src/STUBS/Makefile @@ -11,13 +11,13 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_stubs.a -OBJ = $(SRC:.c=.o) +OBJ = $(SRC:.cpp=.o) # System-specific settings @@ -36,7 +36,7 @@ clean: # Compilation rules -.c.o: +.cpp.o: $(CC) $(CCFLAGS) -c $< # Individual dependencies diff --git a/src/STUBS/Makefile.mingw32-cross b/src/STUBS/Makefile.mingw32-cross index 4144954ec7..2934bbd468 100644 --- a/src/STUBS/Makefile.mingw32-cross +++ b/src/STUBS/Makefile.mingw32-cross @@ -5,17 +5,17 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_mingw32.a -OBJ = $(SRC:%.c=%_mingw32.o) +OBJ = $(SRC:%.cpp=%_mingw32.o) # System-specific settings -CC = i686-w64-mingw32-gcc +CC = i686-w64-mingw32-g++ CCFLAGS = -O2 -Wall -march=i686 -mtune=generic -mfpmath=387 -mpc64 -I. ARCHIVE = i686-w64-mingw32-ar ARCHFLAG = rs diff --git a/src/STUBS/Makefile.mingw64-cross b/src/STUBS/Makefile.mingw64-cross index 70b971f262..e62d5dcbe1 100644 --- a/src/STUBS/Makefile.mingw64-cross +++ b/src/STUBS/Makefile.mingw64-cross @@ -5,17 +5,17 @@ SHELL = /bin/sh # Files -SRC = mpi.c +SRC = mpi.cpp INC = mpi.h # Definitions EXE = libmpi_mingw64.a -OBJ = $(SRC:%.c=%_mingw64.o) +OBJ = $(SRC:%.cpp=%_mingw64.o) # System-specific settings -CC = x86_64-w64-mingw32-gcc +CC = x86_64-w64-mingw32-g++ CCFLAGS = -O2 -Wall -march=core2 -mtune=core2 -msse2 -mpc64 -I. ARCHIVE = x86_64-w64-mingw32-ar ARCHFLAG = rs diff --git a/src/STUBS/mpi.c b/src/STUBS/mpi.cpp similarity index 100% rename from src/STUBS/mpi.c rename to src/STUBS/mpi.cpp diff --git a/src/STUBS/mpi.h b/src/STUBS/mpi.h index 063dc542be..28e897960d 100644 --- a/src/STUBS/mpi.h +++ b/src/STUBS/mpi.h @@ -16,12 +16,17 @@ #include -/* use C bindings for MPI interface */ +/* We compile STUBS with C++ so the symbols embedded in + * the serial shared library will not collide with any + * corresponding symbols from a real MPI library (which + * uses C bindings). As a consequence the header *must* + * enforce compiling with C++ only. 
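+ * For example, with C bindings both the stubs and a real MPI + * library export the identical symbol name MPI_Init; compiled as + * C++ the stub symbols are name-mangled and therefore distinct.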
*/ -#ifdef __cplusplus -extern "C" { +#ifndef __cplusplus +#error "MPI STUBS must be compiled with a C++ compiler" #endif + /* Dummy defs for MPI stubs */ #define MPI_COMM_WORLD 0 @@ -176,8 +181,4 @@ int MPI_Alltoallv(void *sendbuf, int *sendcounts, int *sdispls, MPI_Datatype recvtype, MPI_Comm comm); /* ---------------------------------------------------------------------- */ -#ifdef __cplusplus -} -#endif - #endif diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp index 31bd63160f..6c7e108ca6 100644 --- a/src/USER-INTEL/fix_intel.cpp +++ b/src/USER-INTEL/fix_intel.cpp @@ -318,8 +318,7 @@ void FixIntel::init() _zero_master = 0; if (_pair_hybrid_flag && _hybrid_nonpair) - if (_pair_hybrid_flag > 1 || force->newton_pair == 0) - _pair_hybrid_zero = 1; + _pair_hybrid_zero = 1; _hybrid_nonpair = 0; _pair_intel_count = 0; diff --git a/src/USER-REACTION/fix_bond_react.cpp b/src/USER-REACTION/fix_bond_react.cpp index 3098a1bd67..93c9fe525b 100644 --- a/src/USER-REACTION/fix_bond_react.cpp +++ b/src/USER-REACTION/fix_bond_react.cpp @@ -537,7 +537,6 @@ FixBondReact::FixBondReact(LAMMPS *lmp, int narg, char **arg) : nmax = 0; partner = finalpartner = nullptr; distsq = nullptr; - probability = nullptr; maxattempt = 0; attempt = nullptr; nattempt = nullptr; @@ -585,7 +584,6 @@ FixBondReact::~FixBondReact() memory->destroy(finalpartner); memory->destroy(nattempt); memory->destroy(distsq); - memory->destroy(probability); memory->destroy(attempt); memory->destroy(edge); memory->destroy(equivalences); @@ -870,6 +868,9 @@ void FixBondReact::post_integrate() ghostly_rxn_count[i] = 0; nlocalskips[i] = 0; nghostlyskips[i] = 0; + // update reaction probability + if (var_flag[PROB][i]) + fraction[i] = input->variable->compute_equal(var_id[PROB][i]); } if (nevery_check) { @@ -890,16 +891,14 @@ void FixBondReact::post_integrate() memory->destroy(finalpartner); memory->destroy(distsq); memory->destroy(nattempt); - memory->destroy(probability); nmax = atom->nmax; memory->create(partner,nmax,"bond/react:partner"); memory->create(finalpartner,nmax,"bond/react:finalpartner"); memory->create(distsq,nmax,2,"bond/react:distsq"); memory->create(nattempt,nreacts,"bond/react:nattempt"); - memory->create(probability,nmax,"bond/react:probability"); } - // reset create counts + // reset 'attempt' counts for (int i = 0; i < nreacts; i++) { nattempt[i] = 0; } @@ -962,25 +961,14 @@ void FixBondReact::post_integrate() comm->reverse_comm_fix(this); } - // update reaction probability - if (var_flag[PROB][rxnID]) - fraction[rxnID] = input->variable->compute_equal(var_id[PROB][rxnID]); - // each atom now knows its winning partner - // for prob check, generate random value for each atom with a bond partner - // forward comm of partner and random value, so ghosts have it - - if (fraction[rxnID] < 1.0) { - for (int i = 0; i < nlocal; i++) - if (partner[i]) probability[i] = random[rxnID]->uniform(); - } + // forward comm of partner, so ghosts have it commflag = 2; comm->forward_comm_fix(this,2); // consider for reaction: // only if both atoms list each other as winning bond partner - // and probability constraint is satisfied // if other atom is owned by another proc, it should do same thing int temp_nattempt = 0; @@ -994,16 +982,6 @@ void FixBondReact::post_integrate() continue; } - // apply probability constraint using RN for atom with smallest ID - - if (fraction[rxnID] < 1.0) { - if (tag[i] < tag[j]) { - if (probability[i] >= fraction[rxnID]) continue; - } else { - if (probability[j] >= fraction[rxnID]) continue; 
- } - } - // store final bond partners and count the rxn possibility once finalpartner[i] = tag[j]; @@ -1031,23 +1009,28 @@ void FixBondReact::post_integrate() if (finalpartner[i] == 0) continue; j = atom->map(finalpartner[i]); - // if (j < 0 || tag[i] < tag[j]) { - if (tag[i] < tag[j]) { //atom->map(std::min(tag[i],tag[j])) <= nlocal && - if (nattempt[rxnID] == maxattempt) { + if (tag[i] < tag[j]) { + if (nattempt[rxnID] > maxattempt-2) { maxattempt += DELTA; - // third column of 'attempt': bond/react integer ID + // third dim of 'attempt': bond/react integer ID memory->grow(attempt,maxattempt,2,nreacts,"bond/react:attempt"); } // to ensure types remain in same order - // unnecessary now taken from reaction map file if (iatomtype[rxnID] == type[i]) { attempt[nattempt[rxnID]][0][rxnID] = tag[i]; attempt[nattempt[rxnID]][1][rxnID] = finalpartner[i]; + nattempt[rxnID]++; + // add another attempt if initiator atoms are same type + if (iatomtype[rxnID] == jatomtype[rxnID]) { + attempt[nattempt[rxnID]][0][rxnID] = finalpartner[i]; + attempt[nattempt[rxnID]][1][rxnID] = tag[i]; + nattempt[rxnID]++; + } } else { attempt[nattempt[rxnID]][0][rxnID] = finalpartner[i]; attempt[nattempt[rxnID]][1][rxnID] = tag[i]; + nattempt[rxnID]++; } - nattempt[rxnID]++; } } } @@ -1340,10 +1323,14 @@ void FixBondReact::superimpose_algorithm() (nxspecial[local_atom1][0] == 0 || xspecial[local_atom1][0] == atom->tag[local_atom2]) && check_constraints()) { - status = ACCEPT; - glove_ghostcheck(); - } else - status = REJECT; + if (fraction[rxnID] < 1.0 && + random[rxnID]->uniform() >= fraction[rxnID]) { + status = REJECT; + } else { + status = ACCEPT; + glove_ghostcheck(); + } + } else status = REJECT; } avail_guesses = 0; @@ -1380,9 +1367,12 @@ void FixBondReact::superimpose_algorithm() } } - if (status == ACCEPT && check_constraints()) { // reaction site found successfully! - glove_ghostcheck(); - } + // reaction site found successfully! 
+ if (status == ACCEPT) + if (fraction[rxnID] < 1.0 && + random[rxnID]->uniform() >= fraction[rxnID]) status = REJECT; + else glove_ghostcheck(); + hang_catch++; // let's go ahead and catch the simplest of hangs //if (hang_catch > onemol->natoms*4) @@ -1622,8 +1612,8 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check() && check_constraints()) status = ACCEPT; + else status = GUESSFAIL; return; } // status should still == PROCEED @@ -1674,8 +1664,8 @@ void FixBondReact::check_a_neighbor() glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check() && check_constraints()) status = ACCEPT; + else status = GUESSFAIL; return; // will never complete here when there are edge atoms // ...actually that could be wrong if people get creative...shouldn't affect anything @@ -1786,8 +1776,8 @@ void FixBondReact::inner_crosscheck_loop() } glove_counter++; if (glove_counter == onemol->natoms) { - status = ACCEPT; - ring_check(); + if (ring_check() && check_constraints()) status = ACCEPT; + else status = GUESSFAIL; return; } status = CONTINUE; @@ -1798,21 +1788,17 @@ void FixBondReact::inner_crosscheck_loop() Necessary for certain ringed structures ------------------------------------------------------------------------- */ -void FixBondReact::ring_check() +int FixBondReact::ring_check() { // ring_check can be made more efficient by re-introducing 'frozen' atoms // 'frozen' atoms have been assigned and also are no longer pioneers // double check the number of neighbors match for all non-edge atoms // otherwise, atoms at 'end' of symmetric ring can behave like edge atoms - for (int i = 0; i < onemol->natoms; i++) { - if (edge[i][rxnID] == 0) { - if (onemol_nxspecial[i][0] != nxspecial[atom->map(glove[i][1])][0]) { - status = GUESSFAIL; - return; - } - } - } + for (int i = 0; i < onemol->natoms; i++) + if (edge[i][rxnID] == 0 && + onemol_nxspecial[i][0] != nxspecial[atom->map(glove[i][1])][0]) + return 0; for (int i = 0; i < onemol->natoms; i++) { for (int j = 0; j < onemol_nxspecial[i][0]; j++) { @@ -1824,12 +1810,10 @@ void FixBondReact::ring_check() break; } } - if (ring_fail == 1) { - status = GUESSFAIL; - return; - } + if (ring_fail == 1) return 0; } } + return 1; } /* ---------------------------------------------------------------------- @@ -2705,7 +2689,7 @@ update molecule IDs, charges, types, special lists and all topology void FixBondReact::update_everything() { - int nlocal; // must be defined after create_atoms + int nlocal = atom->nlocal; // must be redefined after create atoms int *type = atom->type; int **nspecial = atom->nspecial; tagint **special = atom->special; @@ -2717,6 +2701,9 @@ void FixBondReact::update_everything() // used when deleting atoms int ndel,ndelone; int *mark; + int nmark = nlocal; + memory->create(mark,nmark,"bond/react:mark"); + for (int i = 0; i < nmark; i++) mark[i] = 0; tagint *tag = atom->tag; AtomVec *avec = atom->avec; @@ -2778,8 +2765,11 @@ void FixBondReact::update_everything() // mark to-delete atoms nlocal = atom->nlocal; - mark = new int[nlocal]; - for (int i = 0; i < nlocal; i++) mark[i] = 0; + if (nlocal > nmark) { + memory->grow(mark,nlocal,"bond/react:mark"); + for (int i = nmark; i < nlocal; i++) mark[i] = 0; + nmark = nlocal; + } for (int i = 0; i < update_num_mega; i++) { rxnID = update_mega_glove[0][i]; onemol = atom->molecules[unreacted_mol[rxnID]]; @@ -3228,7 +3218,7 @@ void 
FixBondReact::update_everything() } } } - delete [] mark; + memory->destroy(mark); MPI_Allreduce(&ndelone,&ndel,1,MPI_INT,MPI_SUM,world); @@ -3941,20 +3931,10 @@ int FixBondReact::pack_forward_comm(int n, int *list, double *buf, m = 0; - if (commflag == 1) { - for (i = 0; i < n; i++) { - j = list[i]; - printf("hello you shouldn't be here\n"); - //buf[m++] = ubuf(bondcount[j]).d; - } - return m; - } - if (commflag == 2) { for (i = 0; i < n; i++) { j = list[i]; buf[m++] = ubuf(partner[j]).d; - buf[m++] = probability[j]; } return m; } @@ -3980,15 +3960,9 @@ void FixBondReact::unpack_forward_comm(int n, int first, double *buf) m = 0; last = first + n; - if (commflag == 1) { + if (commflag == 2) { for (i = first; i < last; i++) - printf("hello you shouldn't be here\n"); - // bondcount[i] = (int) ubuf(buf[m++]).i; - } else if (commflag == 2) { - for (i = first; i < last; i++) { partner[i] = (tagint) ubuf(buf[m++]).i; - probability[i] = buf[m++]; - } } else { m = 0; last = first + n; @@ -4029,20 +4003,18 @@ void FixBondReact::unpack_reverse_comm(int n, int *list, double *buf) m = 0; - if (commflag != 1) { - for (i = 0; i < n; i++) { - j = list[i]; - if (closeneigh[rxnID] != 0) { - if (buf[m+1] < distsq[j][1]) { - partner[j] = (tagint) ubuf(buf[m++]).i; - distsq[j][1] = buf[m++]; - } else m += 2; - } else { - if (buf[m+1] > distsq[j][0]) { - partner[j] = (tagint) ubuf(buf[m++]).i; - distsq[j][0] = buf[m++]; - } else m += 2; - } + for (i = 0; i < n; i++) { + j = list[i]; + if (closeneigh[rxnID] != 0) { + if (buf[m+1] < distsq[j][1]) { + partner[j] = (tagint) ubuf(buf[m++]).i; + distsq[j][1] = buf[m++]; + } else m += 2; + } else { + if (buf[m+1] > distsq[j][0]) { + partner[j] = (tagint) ubuf(buf[m++]).i; + distsq[j][0] = buf[m++]; + } else m += 2; } } } diff --git a/src/USER-REACTION/fix_bond_react.h b/src/USER-REACTION/fix_bond_react.h index 87a5945d45..67788df217 100644 --- a/src/USER-REACTION/fix_bond_react.h +++ b/src/USER-REACTION/fix_bond_react.h @@ -86,7 +86,7 @@ class FixBondReact : public Fix { int nmax; // max num local atoms int max_natoms; // max natoms in a molecule template tagint *partner,*finalpartner; - double **distsq,*probability; + double **distsq; int *nattempt; int maxattempt; int allnattempt; @@ -171,7 +171,7 @@ class FixBondReact : public Fix { void check_a_neighbor(); void crosscheck_the_neighbor(); void inner_crosscheck_loop(); - void ring_check(); + int ring_check(); int check_constraints(); void get_IDcoords(int, int, double *); double get_temperature(tagint **, int, int); diff --git a/src/atom.cpp b/src/atom.cpp index 3308d07267..fe260309e2 100644 --- a/src/atom.cpp +++ b/src/atom.cpp @@ -40,6 +40,10 @@ #include "neigh_request.h" #endif +#ifdef LMP_GPU +#include "fix_gpu.h" +#endif + using namespace LAMMPS_NS; using namespace MathConst; @@ -1748,7 +1752,7 @@ void Atom::set_mass(const char *file, int line, int /*narg*/, char **arg) if (lo < 1 || hi > ntypes) error->all(file,line,"Invalid type for mass set"); for (int itype = lo; itype <= hi; itype++) { - mass[itype] = atof(arg[1]); + mass[itype] = utils::numeric(FLERR,arg[1],false,lmp); mass_setflag[itype] = 1; if (mass[itype] <= 0.0) error->all(file,line,"Invalid mass value"); @@ -2149,7 +2153,7 @@ void Atom::setup_sort_bins() bininvy = nbiny / (bboxhi[1]-bboxlo[1]); bininvz = nbinz / (bboxhi[2]-bboxlo[2]); - #ifdef LMP_USER_INTEL +#ifdef LMP_USER_INTEL int intel_neigh = 0; if (neighbor->nrequest) { if (neighbor->requests[0]->intel) intel_neigh = 1; @@ -2194,7 +2198,36 @@ void Atom::setup_sort_bins() bboxhi[1] = 
bboxlo[1] + static_cast<double>(nbiny) / bininvy; bboxhi[2] = bboxlo[2] + static_cast<double>(nbinz) / bininvz; } - #endif +#endif + +#ifdef LMP_GPU + if (userbinsize == 0.0) { + int ifix = modify->find_fix("package_gpu"); + if (ifix >= 0) { + const double subx = domain->subhi[0] - domain->sublo[0]; + const double suby = domain->subhi[1] - domain->sublo[1]; + const double subz = domain->subhi[2] - domain->sublo[2]; + + FixGPU *fix = static_cast<FixGPU *>(modify->fix[ifix]); + binsize = fix->binsize(subx, suby, subz, atom->nlocal, + neighbor->cutneighmax); + bininv = 1.0 / binsize; + + nbinx = static_cast<int> (ceil(subx * bininv)); + nbiny = static_cast<int> (ceil(suby * bininv)); + nbinz = static_cast<int> (ceil(subz * bininv)); + if (domain->dimension == 2) nbinz = 1; + + if (nbinx == 0) nbinx = 1; + if (nbiny == 0) nbiny = 1; + if (nbinz == 0) nbinz = 1; + + bininvx = bininv; + bininvy = bininv; + bininvz = bininv; + } + } +#endif if (1.0*nbinx*nbiny*nbinz > INT_MAX) error->one(FLERR,"Too many atom sorting bins"); diff --git a/src/citeme.cpp b/src/citeme.cpp index fdd1ee867d..41ac87f5bb 100644 --- a/src/citeme.cpp +++ b/src/citeme.cpp @@ -118,7 +118,7 @@ void CiteMe::flush() if (!citefile.empty()) logbuffer += fmt::format(cite_file,"file",citefile); if (screen_flag == VERBOSE) - scrbuffer += fmt::format(cite_file,"screen","output"); + logbuffer += fmt::format(cite_file,"screen","output"); logbuffer += cite_separator; if (logfile) fputs(logbuffer.c_str(),logfile); logbuffer.clear(); diff --git a/src/compute_reduce.cpp b/src/compute_reduce.cpp index 82d3dff458..bc9aeefe7b 100644 --- a/src/compute_reduce.cpp +++ b/src/compute_reduce.cpp @@ -148,8 +148,8 @@ ComputeReduce::ComputeReduce(LAMMPS *lmp, int narg, char **arg) : if (iarg+3 > narg) error->all(FLERR,"Illegal compute reduce command"); if (mode != MINN && mode != MAXX) error->all(FLERR,"Compute reduce replace requires min or max mode"); - int col1 = atoi(arg[iarg+1]) - 1; - int col2 = atoi(arg[iarg+2]) - 1; + int col1 = utils::inumeric(FLERR,arg[iarg+1],false,lmp) - 1; + int col2 = utils::inumeric(FLERR,arg[iarg+2],false,lmp) - 1; if (col1 < 0 || col1 >= nvalues || col2 < 0 || col2 >= nvalues) error->all(FLERR,"Illegal compute reduce command"); if (col1 == col2) error->all(FLERR,"Illegal compute reduce command"); diff --git a/src/dump_cfg.cpp b/src/dump_cfg.cpp index ed8df72096..b4e6af90cf 100644 --- a/src/dump_cfg.cpp +++ b/src/dump_cfg.cpp @@ -75,7 +75,8 @@ DumpCFG::DumpCFG(LAMMPS *lmp, int narg, char **arg) : if (argi.get_dim() == 1) { std::string newarg(std::to_string(earg[iarg][0])); - newarg += '_' + argi.get_name() + '_' + std::to_string(argi.get_index1()); + newarg += std::string("_") + argi.get_name(); + newarg += std::string("_") + std::to_string(argi.get_index1()); auxname[i] = new char[newarg.size()+1]; strcpy(auxname[i],newarg.c_str()); } else { diff --git a/src/fix_addforce.cpp b/src/fix_addforce.cpp index a06544e268..07031a40a4 100644 --- a/src/fix_addforce.cpp +++ b/src/fix_addforce.cpp @@ -83,7 +83,7 @@ FixAddForce::FixAddForce(LAMMPS *lmp, int narg, char **arg) : while (iarg < narg) { if (strcmp(arg[iarg],"every") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal fix addforce command"); - nevery = atoi(arg[iarg+1]); + nevery = utils::inumeric(FLERR,arg[iarg+1],false,lmp); if (nevery <= 0) error->all(FLERR,"Illegal fix addforce command"); iarg += 2; } else if (strcmp(arg[iarg],"region") == 0) { diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp index c1c52a3f8c..f18888bbfc 100644 --- a/src/fix_property_atom.cpp +++ 
b/src/fix_property_atom.cpp @@ -254,13 +254,19 @@ void FixPropertyAtom::read_data_section(char *keyword, int n, char *buf, if ((m = atom->map(itag)) >= 0) { for (j = 0; j < nvalue; j++) { - if (style[j] == MOLECULE) atom->molecule[m] = ATOTAGINT(values[j+1]); - else if (style[j] == CHARGE) atom->q[m] = atof(values[j+1]); - else if (style[j] == RMASS) atom->rmass[m] = atof(values[j+1]); - else if (style[j] == INTEGER) - atom->ivector[index[j]][m] = atoi(values[j+1]); - else if (style[j] == DOUBLE) - atom->dvector[index[j]][m] = atof(values[j+1]); + if (style[j] == MOLECULE) { + atom->molecule[m] = utils::tnumeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == CHARGE) { + atom->q[m] = utils::numeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == RMASS) { + atom->rmass[m] = utils::numeric(FLERR,values[j+1],false,lmp); + } else if (style[j] == INTEGER) { + atom->ivector[index[j]][m] = utils::inumeric(FLERR,values[j+1], + false,lmp); + } else if (style[j] == DOUBLE) { + atom->dvector[index[j]][m] = utils::numeric(FLERR,values[j+1], + true,lmp); + } } } diff --git a/src/image.cpp b/src/image.cpp index 4b181ee8b0..0acef0bceb 100644 --- a/src/image.cpp +++ b/src/image.cpp @@ -113,6 +113,11 @@ Image::Image(LAMMPS *lmp, int nmap_caller) : Pointers(lmp) backLightColor[2] = 0.9; random = nullptr; + + // MPI_Gatherv vectors + + recvcounts = nullptr; + displs = nullptr; } /* ---------------------------------------------------------------------- */ @@ -134,6 +139,9 @@ Image::~Image() memory->destroy(rgbcopy); if (random) delete random; + + memory->destroy(recvcounts); + memory->destroy(displs); } /* ---------------------------------------------------------------------- @@ -334,16 +342,37 @@ void Image::merge() // extra SSAO enhancement // bcast full image to all procs // each works on subset of pixels - // gather result back to proc 0 + // MPI_Gather() result back to proc 0 + // use Gatherv() if subset of pixels is not the same size on every proc if (ssao) { MPI_Bcast(imageBuffer,npixels*3,MPI_BYTE,0,world); MPI_Bcast(surfaceBuffer,npixels*2,MPI_DOUBLE,0,world); MPI_Bcast(depthBuffer,npixels,MPI_DOUBLE,0,world); compute_SSAO(); - int pixelPart = height/nprocs * width*3; - MPI_Gather(imageBuffer+me*pixelPart,pixelPart,MPI_BYTE, - rgbcopy,pixelPart,MPI_BYTE,0,world); + + int pixelstart = 3 * static_cast<int> (1.0*me/nprocs * npixels); + int pixelstop = 3 * static_cast<int> (1.0*(me+1)/nprocs * npixels); + int mypixels = pixelstop - pixelstart; + + if (npixels % nprocs == 0) { + MPI_Gather(imageBuffer+pixelstart,mypixels,MPI_BYTE, + rgbcopy,mypixels,MPI_BYTE,0,world); + + } else { + if (recvcounts == nullptr) { + memory->create(recvcounts,nprocs,"image:recvcounts"); + memory->create(displs,nprocs,"image:displs"); + MPI_Allgather(&mypixels,1,MPI_INT,recvcounts,1,MPI_INT,world); + displs[0] = 0; + for (int i = 1; i < nprocs; i++) + displs[i] = displs[i-1] + recvcounts[i-1]; + } + + MPI_Gatherv(imageBuffer+pixelstart,mypixels,MPI_BYTE, + rgbcopy,recvcounts,displs,MPI_BYTE,0,world); + } + writeBuffer = rgbcopy; } else { writeBuffer = imageBuffer; @@ -880,110 +909,117 @@ void Image::compute_SSAO() -tanPerPixel / zoom; int pixelRadius = (int) trunc (SSAORadius / pixelWidth + 0.5); - int x,y,s; - int hPart = height / nprocs; - int index = me * hPart * width; - for (y = me * hPart; y < (me + 1) * hPart; y ++) { - for (x = 0; x < width; x ++, index ++) { - double cdepth = depthBuffer[index]; - if (cdepth < 0) { continue; } + // each proc is assigned a subset of contiguous pixels from the full image + // pixels 
are contiguous in x (columns within a row), then by row + // index = pixels from 0 to npixels-1 + // x = column # from 0 to width-1 + // y = row # from 0 to height-1 - double sx = surfaceBuffer[index * 2 + 0]; - double sy = surfaceBuffer[index * 2 + 1]; - double sin_t = -sqrt(sx*sx + sy*sy); + int pixelstart = static_cast<int> (1.0*me/nprocs * npixels); + int pixelstop = static_cast<int> (1.0*(me+1)/nprocs * npixels); - double mytheta = random->uniform() * SSAOJitter; - double ao = 0.0; + for (int index = pixelstart; index < pixelstop; index++) { + int x = index % width; + int y = index / width; - for (s = 0; s < SSAOSamples; s ++) { - double hx = cos(mytheta); - double hy = sin(mytheta); - mytheta += delTheta; + double cdepth = depthBuffer[index]; + if (cdepth < 0) { continue; } - // multiply by z cross surface tangent - // so that dot (aka cos) works here + double sx = surfaceBuffer[index * 2 + 0]; + double sy = surfaceBuffer[index * 2 + 1]; + double sin_t = -sqrt(sx*sx + sy*sy); - double scaled_sin_t = sin_t * (hx*sy + hy*sx); + double mytheta = random->uniform() * SSAOJitter; + double ao = 0.0; - // Bresenham's line algorithm to march over depthBuffer + for (int s = 0; s < SSAOSamples; s ++) { + double hx = cos(mytheta); + double hy = sin(mytheta); + mytheta += delTheta; - int dx = static_cast<int> (hx * pixelRadius); - int dy = static_cast<int> (hy * pixelRadius); - int ex = x + dx; - if (ex < 0) { ex = 0; } if (ex >= width) { ex = width - 1; } - int ey = y + dy; - if (ey < 0) { ey = 0; } if (ey >= height) { ey = height - 1; } - double delta; - int small, large; - double lenIncr; - if (fabs(hx) > fabs(hy)) { - small = (hx > 0) ? 1 : -1; - large = (hy > 0) ? width : -width; - delta = fabs(hy / hx); - } else { - small = (hy > 0) ? width : -width; - large = (hx > 0) ? 1 : -1; - delta = fabs(hx / hy); + // multiply by z cross surface tangent + // so that dot (aka cos) works here + + double scaled_sin_t = sin_t * (hx*sy + hy*sx); + + // Bresenham's line algorithm to march over depthBuffer + + int dx = static_cast<int> (hx * pixelRadius); + int dy = static_cast<int> (hy * pixelRadius); + int ex = x + dx; + if (ex < 0) { ex = 0; } if (ex >= width) { ex = width - 1; } + int ey = y + dy; + if (ey < 0) { ey = 0; } if (ey >= height) { ey = height - 1; } + double delta; + int small, large; + double lenIncr; + if (fabs(hx) > fabs(hy)) { + small = (hx > 0) ? 1 : -1; + large = (hy > 0) ? width : -width; + delta = fabs(hy / hx); + } else { + small = (hy > 0) ? width : -width; + large = (hx > 0) ? 
1 : -1; + delta = fabs(hx / hy); + } + lenIncr = sqrt (1 + delta * delta) * pixelWidth; + + // initialize with one step + // because the center point doesn't need testing + + int end = ex + ey * width; + int ind = index + small; + double len = lenIncr; + double err = delta; + if (err >= 1.0) { + ind += large; + err -= 1.0; + } + + double minPeak = -1; + double peakLen = 0.0; + int stepsTaken = 1; + while ((small > 0 && ind <= end) || (small < 0 && ind >= end)) { + if (ind < 0 || ind >= (width*height)) { + break; } - lenIncr = sqrt (1 + delta * delta) * pixelWidth; - // initialize with one step - // because the center point doesn't need testing + // cdepth - depthBuffer B/C we want it in the negative z direction - int end = ex + ey * width; - int ind = index + small; - double len = lenIncr; - double err = delta; + if (minPeak < 0 || (depthBuffer[ind] >= 0 && + depthBuffer[ind] < minPeak)) { + minPeak = depthBuffer[ind]; + peakLen = len; + } + ind += small; + len += lenIncr; + err += delta; if (err >= 1.0) { ind += large; err -= 1.0; } - - double minPeak = -1; - double peakLen = 0.0; - int stepsTaken = 1; - while ((small > 0 && ind <= end) || (small < 0 && ind >= end)) { - if (ind < 0 || ind >= (width*height)) { - break; - } - - // cdepth - depthBuffer B/C we want it in the negative z direction - - if (minPeak < 0 || (depthBuffer[ind] >= 0 && - depthBuffer[ind] < minPeak)) { - minPeak = depthBuffer[ind]; - peakLen = len; - } - ind += small; - len += lenIncr; - err += delta; - if (err >= 1.0) { - ind += large; - err -= 1.0; - } - stepsTaken ++; - } - - if (peakLen > 0) { - double h = atan ((cdepth - minPeak) / peakLen); - ao += saturate(sin (h) - scaled_sin_t); - } else { - ao += saturate(-scaled_sin_t); - } + stepsTaken ++; } - ao /= (double)SSAOSamples; - double c[3]; - c[0] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 0]); - c[1] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 1]); - c[2] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 2]); - c[0] *= (1.0 - ao); - c[1] *= (1.0 - ao); - c[2] *= (1.0 - ao); - imageBuffer[index * 3 + 0] = (int) c[0]; - imageBuffer[index * 3 + 1] = (int) c[1]; - imageBuffer[index * 3 + 2] = (int) c[2]; + if (peakLen > 0) { + double h = atan ((cdepth - minPeak) / peakLen); + ao += saturate(sin (h) - scaled_sin_t); + } else { + ao += saturate(-scaled_sin_t); + } } + ao /= (double)SSAOSamples; + + double c[3]; + c[0] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 0]); + c[1] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 1]); + c[2] = (double) (*(unsigned char *) &imageBuffer[index * 3 + 2]); + c[0] *= (1.0 - ao); + c[1] *= (1.0 - ao); + c[2] *= (1.0 - ao); + imageBuffer[index * 3 + 0] = (int) c[0]; + imageBuffer[index * 3 + 1] = (int) c[1]; + imageBuffer[index * 3 + 2] = (int) c[2]; } } diff --git a/src/image.h b/src/image.h index 7df81425d9..1de455d4bd 100644 --- a/src/image.h +++ b/src/image.h @@ -73,6 +73,10 @@ class Image : protected Pointers { double *depthcopy,*surfacecopy; unsigned char *imageBuffer,*rgbcopy,*writeBuffer; + // MPI_Gatherv + + int *recvcounts,*displs; + // constant view params double FOV; diff --git a/src/info.cpp b/src/info.cpp index bf6f14a48a..f1dc96645b 100644 --- a/src/info.cpp +++ b/src/info.cpp @@ -1449,8 +1449,13 @@ void Info::get_memory_info(double *meminfo) meminfo[2] = (double)pmc.PeakWorkingSetSize/1048576.0; #else #if defined(__linux__) +#if defined(__GLIBC__) && __GLIBC_PREREQ(2, 33) + struct mallinfo2 mi; + mi = mallinfo2(); +#else struct mallinfo mi; mi = mallinfo(); +#endif 
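+ // uordblks = heap space in use, hblkhd = space in mmap'd regions; + // dividing by 1048576 converts bytes to MBytes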
meminfo[1] = (double)mi.uordblks/1048576.0+(double)mi.hblkhd/1048576.0; #endif struct rusage ru; diff --git a/src/kspace.cpp b/src/kspace.cpp index 5556a5e8d0..f44cc42aaf 100644 --- a/src/kspace.cpp +++ b/src/kspace.cpp @@ -564,9 +564,9 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"kmax/ewald") == 0) { if (iarg+4 > narg) error->all(FLERR,"Illegal kspace_modify command"); - kx_ewald = atoi(arg[iarg+1]); - ky_ewald = atoi(arg[iarg+2]); - kz_ewald = atoi(arg[iarg+3]); + kx_ewald = utils::inumeric(FLERR,arg[iarg+1],false,lmp); + ky_ewald = utils::inumeric(FLERR,arg[iarg+2],false,lmp); + kz_ewald = utils::inumeric(FLERR,arg[iarg+3],false,lmp); if (kx_ewald < 0 || ky_ewald < 0 || kz_ewald < 0) error->all(FLERR,"Bad kspace_modify kmax/ewald parameter"); if (kx_ewald > 0 && ky_ewald > 0 && kz_ewald > 0) @@ -583,15 +583,15 @@ void KSpace::modify_params(int narg, char **arg) iarg += 2; } else if (strcmp(arg[iarg],"force/disp/real") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_real_6 = atof(arg[iarg+1]); + accuracy_real_6 = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"force/disp/kspace") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - accuracy_kspace_6 = atof(arg[iarg+1]); + accuracy_kspace_6 = utils::numeric(FLERR,arg[iarg+1],false,lmp); iarg += 2; } else if (strcmp(arg[iarg],"eigtol") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal kspace_modify command"); - splittol = atof(arg[iarg+1]); + splittol = utils::numeric(FLERR,arg[iarg+1],false,lmp); if (splittol >= 1.0) error->all(FLERR,"Kspace_modify eigtol must be smaller than one"); iarg += 2; diff --git a/src/lammps.cpp b/src/lammps.cpp index 6734fbd209..277ec4414f 100644 --- a/src/lammps.cpp +++ b/src/lammps.cpp @@ -842,12 +842,12 @@ void LAMMPS::post_create() if (strcmp(suffix,"omp") == 0 && !modify->check_package("OMP")) error->all(FLERR,"Using suffix omp without USER-OMP package installed"); - if (strcmp(suffix,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix,"gpu") == 0) input->one("package gpu 0"); if (strcmp(suffix,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix,"omp") == 0) input->one("package omp 0"); if (suffix2) { - if (strcmp(suffix2,"gpu") == 0) input->one("package gpu 1"); + if (strcmp(suffix2,"gpu") == 0) input->one("package gpu 0"); if (strcmp(suffix2,"intel") == 0) input->one("package intel 1"); if (strcmp(suffix2,"omp") == 0) input->one("package omp 0"); } diff --git a/src/library.cpp b/src/library.cpp index 71bf205d90..2a7bbf07b3 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -4128,16 +4128,18 @@ void lammps_get_os_info(char *buffer, int buf_size) /* ---------------------------------------------------------------------- */ /** This function is used to query whether LAMMPS was compiled with - * a real MPI library or in serial. + * a real MPI library or in serial. For the real MPI library it + * reports the size of the MPI communicator in bytes (4 or 8), + * which allows checking for compatibility with a hosting code. 
* - * \return 0 when compiled with MPI STUBS, otherwise 1 */ + * \return 0 when compiled with MPI STUBS, otherwise the MPI_Comm size in bytes */ int lammps_config_has_mpi_support() { #ifdef MPI_STUBS return 0; #else - return 1; + return sizeof(MPI_Comm); #endif } diff --git a/src/reset_atom_ids.h b/src/reset_atom_ids.h index 7c5c53e2ba..02a7f77e8d 100644 --- a/src/reset_atom_ids.h +++ b/src/reset_atom_ids.h @@ -37,7 +37,7 @@ class ResetIDs : protected Pointers { int ilocal; }; - #if defined(LMP_QSORT) +#if defined(LMP_QSORT) // static variable across all ResetID objects, for qsort callback static AtomRvous *sortrvous; #endif diff --git a/unittest/c-library/test_library_config.cpp b/unittest/c-library/test_library_config.cpp index f196f800da..e5eb044d31 100644 --- a/unittest/c-library/test_library_config.cpp +++ b/unittest/c-library/test_library_config.cpp @@ -74,7 +74,7 @@ TEST(LAMMPSConfig, package_name) EXPECT_EQ(lammps_config_package_name(numpkgs + 10, buf, 128), 0); EXPECT_THAT(buf, StrEq("")); } else { - EXPECT_EQ(lammps_config_package_name(0, buf, 128), 1); + EXPECT_EQ(lammps_config_package_name(0, buf, 128), 0); EXPECT_THAT(buf, StrEq("")); } }; @@ -200,7 +200,10 @@ TEST(LAMMPSConfig, exceptions) TEST(LAMMPSConfig, mpi_support) { - EXPECT_EQ(lammps_config_has_mpi_support(), LAMMPS_HAS_MPI); + if (LAMMPS_HAS_MPI) + EXPECT_GT(lammps_config_has_mpi_support(), 0); + else + EXPECT_EQ(lammps_config_has_mpi_support(), 0); }; TEST(LAMMPSConfig, png_support) diff --git a/unittest/python/python-open.py b/unittest/python/python-open.py index 67500ea6fa..5140ce9185 100644 --- a/unittest/python/python-open.py +++ b/unittest/python/python-open.py @@ -37,7 +37,7 @@ class PythonOpen(unittest.TestCase): lmp=lammps(name=self.machine) self.assertIsNot(lmp.lmp,None) self.assertEqual(lmp.opened,1) - self.assertEqual(has_mpi4py,lmp.has_mpi4py) + self.assertEqual(has_mpi and has_mpi4py,lmp.has_mpi4py) self.assertEqual(has_mpi,lmp.has_mpi_support) lmp.close() self.assertIsNone(lmp.lmp,None)
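A minimal sketch of how a hosting code could use the new return value of lammps_config_has_mpi_support() to verify MPI compatibility before coupling to LAMMPS (the host program below is illustrative only; it assumes the LAMMPS C library header "library.h" and an MPI installation are available at compile time):

#include <stdio.h>   // printf
#include <mpi.h>     // MPI_Comm, MPI_Init, MPI_Finalize
#include "library.h" // LAMMPS C library interface

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  // 0 = LAMMPS was built with the serial MPI STUBS library; otherwise
  // the value is sizeof(MPI_Comm) as seen by LAMMPS: typically 4 for
  // MPICH (int handle) and 8 for Open MPI (pointer handle) on 64-bit
  int lmpcommsize = lammps_config_has_mpi_support();
  if (lmpcommsize == 0)
    printf("LAMMPS was compiled with the MPI STUBS library\n");
  else if (lmpcommsize != (int) sizeof(MPI_Comm))
    printf("MPI mismatch: host MPI_Comm is %d bytes, LAMMPS reports %d\n",
           (int) sizeof(MPI_Comm), lmpcommsize);
  else
    printf("host MPI and LAMMPS MPI agree on the MPI_Comm size\n");
  MPI_Finalize();
  return 0;
}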