patch 22Jun18

Merge pull request #960 from lammps/latte-tweak
made latte.in compatible with v1.1.1 and updated log files
2018-06-22 14:08:09 -06:00 · 2018-06-22 10:18:19 -06:00 · 2018-06-22 08:52:20 -06:00 · 2018-06-21 13:57:48 -06:00 · 2018-06-21 14:45:00 -04:00 · 2018-06-21 11:10:29 -06:00
1697 changed files with 90665 additions and 12491 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -43,3 +43,14 @@ src/USER-MISC/*_grem.*              @dstelter92

 # tools
 tools/msi2lmp/*       @akohlmey
+
+# cmake
+cmake/*               @junghans @rbberger
+
+# python
+python/*              @rbberger
+
+# docs
+doc/utils/*/*         @rbberger
+doc/Makefile          @rbberger
+doc/README            @rbberger
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -2,9 +2,9 @@
 # CMake build system
 # This file is part of LAMMPS
 # Created by Christoph Junghans and Richard Berger
-cmake_minimum_required(VERSION 3.1)
+cmake_minimum_required(VERSION 2.8.12)

-project(lammps)
+project(lammps CXX)
 set(SOVERSION 0)
 set(LAMMPS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../src)
 set(LAMMPS_LIB_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../lib)
@ -23,14 +23,22 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)
  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
 endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CXX_FLAGS)

-# remove any style headers in the src dir
-file(GLOB SRC_STYLE_FILES ${LAMMPS_SOURCE_DIR}/style_*.h)
-if(SRC_STYLE_FILES)
-  file(REMOVE ${SRC_STYLE_FILES})
+file(GLOB SRC_FILES ${LAMMPS_SOURCE_DIR}/*.cpp)
+list(SORT SRC_FILES)
+# Abort if package sources were copied into src/ by the make-based buildsystem.
+# The time consuming scan only runs when the sorted src/*.cpp list differs from the cached one.
+if(NOT SRC_FILES STREQUAL SRC_FILES_CACHED)
+  file(GLOB SRC_PKG_FILES ${LAMMPS_SOURCE_DIR}/*/*.cpp)
+  message(STATUS "Running check for installed package (this might take a while)")
+  foreach(_SRC ${SRC_PKG_FILES})  # iterate over the globbed files, not the variable's name
+    get_filename_component(FILENAME "${_SRC}" NAME)
+    if(EXISTS ${LAMMPS_SOURCE_DIR}/${FILENAME})
+      message(FATAL_ERROR "Found packages installed by the make-based buildsystem, please run 'make -C ${LAMMPS_SOURCE_DIR} no-all purge'")
+    endif()
+  endforeach()
+  set(SRC_FILES_CACHED "${SRC_FILES}" CACHE INTERNAL "List of file in LAMMPS_SOURCE_DIR" FORCE)
 endif()

-enable_language(CXX)
-
 ######################################################################
 # compiler tests
 # these need ot be done early (before further tests).
@ -41,6 +49,11 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
 endif()

+option(ENABLE_COVERAGE "Enable code coverage" OFF)
+if(ENABLE_COVERAGE)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --coverage")
+endif()
+
 ########################################################################
 # User input options                                                   #
 ########################################################################
@ -48,21 +61,27 @@ option(BUILD_SHARED_LIBS "Build shared libs" OFF)
 if(BUILD_SHARED_LIBS) # for all pkg libs, mpi_stubs and linalg
  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 endif()
+option(DEVELOPER_MODE "Enable developer mode" OFF)
+mark_as_advanced(DEVELOPER_MODE)
+option(CMAKE_VERBOSE_MAKEFILE "Generate verbose Makefiles" OFF)
 include(GNUInstallDirs)

 set(LAMMPS_LINK_LIBS)
 set(LAMMPS_DEPS)
 set(LAMMPS_API_DEFINES)
-option(ENABLE_MPI "Build MPI version" OFF)
-if(ENABLE_MPI)
+
+find_package(MPI QUIET)
+option(BUILD_MPI "Build MPI version" ${MPI_FOUND})
+if(BUILD_MPI)
  find_package(MPI REQUIRED)
-  include_directories(${MPI_C_INCLUDE_PATH})
+  include_directories(${MPI_CXX_INCLUDE_PATH})
  list(APPEND LAMMPS_LINK_LIBS ${MPI_CXX_LIBRARIES})
  option(LAMMPS_LONGLONG_TO_LONG "Workaround if your system or MPI version does not recognize 'long long' data types" OFF)
  if(LAMMPS_LONGLONG_TO_LONG)
    add_definitions(-DLAMMPS_LONGLONG_TO_LONG)
  endif()
 else()
+  enable_language(C)
  file(GLOB MPI_SOURCES ${LAMMPS_SOURCE_DIR}/STUBS/mpi.c)
  add_library(mpi_stubs STATIC ${MPI_SOURCES})
  include_directories(${LAMMPS_SOURCE_DIR}/STUBS)
@ -108,14 +127,14 @@ set(OTHER_PACKAGES KIM PYTHON MSCG MPIIO VORONOI POEMS LATTE
  USER-SMTBQ USER-SPH USER-TALLY USER-UEF USER-VTK USER-QUIP USER-QMMM)
 set(ACCEL_PACKAGES USER-OMP KOKKOS OPT USER-INTEL GPU)
 foreach(PKG ${DEFAULT_PACKAGES})
-  option(ENABLE_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
+  option(PKG_${PKG} "Build ${PKG} Package" ${ENABLE_ALL})
 endforeach()
 foreach(PKG ${ACCEL_PACKAGES} ${OTHER_PACKAGES})
-  option(ENABLE_${PKG} "Build ${PKG} Package" OFF)
+  option(PKG_${PKG} "Build ${PKG} Package" OFF)
 endforeach()

 macro(pkg_depends PKG1 PKG2)
-  if(ENABLE_${PKG1} AND NOT ENABLE_${PKG2})
+  if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
    message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
  endif()
 endmacro()
@ -123,39 +142,51 @@ endmacro()
 pkg_depends(MPIIO MPI)
 pkg_depends(QEQ MANYBODY)
 pkg_depends(USER-ATC MANYBODY)
-pkg_depends(USER-H5MD MPI)
 pkg_depends(USER-LB MPI)
 pkg_depends(USER-MISC MANYBODY)
 pkg_depends(USER-PHONON KSPACE)
+pkg_depends(CORESHELL KSPACE)

 ######################################################
 # packages with special compiler needs or external libs
 ######################################################
-if(ENABLE_REAX OR ENABLE_MEAM OR ENABLE_USER-QUIP OR ENABLE_USER-QMMM OR ENABLE_LATTE)
+if(PKG_REAX OR PKG_MEAM OR PKG_USER-QUIP OR PKG_USER-QMMM OR PKG_LATTE)
  enable_language(Fortran)
-  include(CheckFortranCompilerFlag)
-  check_Fortran_compiler_flag("-fno-second-underscore" FC_HAS_NO_SECOND_UNDERSCORE)
 endif()

-if(ENABLE_KOKKOS OR ENABLE_MSCG)
-  # starting with CMake 3.1 this is all you have to do to enforce C++11
-  set(CMAKE_CXX_STANDARD 11) # C++11...
-  set(CMAKE_CXX_STANDARD_REQUIRED ON) #...is required...
-  set(CMAKE_CXX_EXTENSIONS OFF) #...without compiler extensions like gnu++11
+if(PKG_MEAM OR PKG_USER-H5MD OR PKG_USER-QMMM)
+  enable_language(C)
 endif()

-if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
+find_package(OpenMP QUIET)
+option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND})
+if(BUILD_OMP OR PKG_USER-OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
  find_package(OpenMP REQUIRED)
  set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 endif()

-if(ENABLE_KSPACE)
-  set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
-  set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)
+if(PKG_KSPACE)
+  option(FFT_SINGLE "Use single precision FFT instead of double" OFF)
+  set(FFTW "FFTW3")
+  if(FFT_SINGLE)
+    set(FFTW "FFTW3F")
+    add_definitions(-DFFT_SINGLE)
+  endif()
+  find_package(${FFTW} QUIET)
+  if(${FFTW}_FOUND)
+    set(FFT "${FFTW}" CACHE STRING "FFT library for KSPACE package")
+  else()
+    set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
+  endif()
+  set_property(CACHE FFT PROPERTY STRINGS KISSFFT ${FFTW} MKL)
  if(NOT FFT STREQUAL "KISSFFT")
    find_package(${FFT} REQUIRED)
-    add_definitions(-DFFT_${FFT})
+    if(FFT STREQUAL "FFTW3F")  # single precision FFTW3 uses the generic FFTW define (FFT_SINGLE is set above)
+      add_definitions(-DFFT_FFTW)
+    else()  # FFTW3, MKL, ...: define FFT_<name> for the library actually selected
+      add_definitions(-DFFT_${FFT})
+    endif()
    include_directories(${${FFT}_INCLUDE_DIRS})
    list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
  endif()
@ -166,22 +197,17 @@ if(ENABLE_KSPACE)
  endif()
 endif()

-if(ENABLE_MSCG OR ENABLE_USER-ATC OR ENABLE_USER-AWPMD OR ENABLE_USER-QUIP OR ENABLE_LATTE)
+if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE)
  find_package(LAPACK)
  if(NOT LAPACK_FOUND)
    enable_language(Fortran)
-    file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.f)
+    file(GLOB LAPACK_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/linalg/*.[fF])
    add_library(linalg STATIC ${LAPACK_SOURCES})
-    include(CheckFortranCompilerFlag)
-    check_Fortran_compiler_flag("-fno-second-underscore" FC_HAS_NO_SECOND_UNDERSCORE)
-    if(FC_HAS_NO_SECOND_UNDERSCORE)
-      target_compile_options(linalg PRIVATE -fno-second-underscore)
-    endif()
    set(LAPACK_LIBRARIES linalg)
  endif()
 endif()

-if(ENABLE_PYTHON)
+if(PKG_PYTHON)
  find_package(PythonInterp REQUIRED)
  find_package(PythonLibs REQUIRED)
  add_definitions(-DLMP_PYTHON)
@ -197,16 +223,25 @@ if(ENABLE_PYTHON)
  endif()
 endif()

-find_package(JPEG)
-if(JPEG_FOUND)
+find_package(JPEG QUIET)
+option(WITH_JPEG "Enable JPEG support" ${JPEG_FOUND})
+if(WITH_JPEG)
+  find_package(JPEG REQUIRED)
  add_definitions(-DLAMMPS_JPEG)
  include_directories(${JPEG_INCLUDE_DIR})
  list(APPEND LAMMPS_LINK_LIBS ${JPEG_LIBRARIES})
 endif()

-find_package(PNG)
-find_package(ZLIB)
+find_package(PNG QUIET)
+find_package(ZLIB QUIET)
 if(PNG_FOUND AND ZLIB_FOUND)
+  option(WITH_PNG "Enable PNG support" ON)
+else()
+  option(WITH_PNG "Enable PNG support" OFF)
+endif()
+if(WITH_PNG)
+  find_package(PNG REQUIRED)
+  find_package(ZLIB REQUIRED)
  include_directories(${PNG_INCLUDE_DIRS} ${ZLIB_INCLUDE_DIRS})
  list(APPEND LAMMPS_LINK_LIBS ${PNG_LIBRARIES} ${ZLIB_LIBRARIES})
  add_definitions(-DLAMMPS_PNG)
@ -214,25 +249,50 @@ endif()

 find_program(GZIP_EXECUTABLE gzip)
 find_package_handle_standard_args(GZIP REQUIRED_VARS GZIP_EXECUTABLE)
-if(GZIP_FOUND)
+option(WITH_GZIP "Enable GZIP support" ${GZIP_FOUND})
+if(WITH_GZIP)
+  if(NOT GZIP_FOUND)
+    message(FATAL_ERROR "gzip executable not found")
+  endif()
  add_definitions(-DLAMMPS_GZIP)
 endif()

 find_program(FFMPEG_EXECUTABLE ffmpeg)
 find_package_handle_standard_args(FFMPEG REQUIRED_VARS FFMPEG_EXECUTABLE)
-if(FFMPEG_FOUND)
+option(WITH_FFMPEG "Enable FFMPEG support" ${FFMPEG_FOUND})
+if(WITH_FFMPEG)
+  if(NOT FFMPEG_FOUND)
+    message(FATAL_ERROR "ffmpeg executable not found")
+  endif()
  add_definitions(-DLAMMPS_FFMPEG)
 endif()

-if(ENABLE_VORONOI)
-  find_package(VORO REQUIRED) #some distros
+if(PKG_VORONOI)
+  option(DOWNLOAD_VORO "Download voro++ (instead of using the system's one)" OFF)
+  if(DOWNLOAD_VORO)
+    include(ExternalProject)
+    ExternalProject_Add(voro_build
+      URL http://math.lbl.gov/voro++/download/dir/voro++-0.4.6.tar.gz
+      URL_MD5 2338b824c3b7b25590e18e8df5d68af9
+      CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 INSTALL_COMMAND "" 
+      )
+    ExternalProject_get_property(voro_build SOURCE_DIR)
+    set(VORO_LIBRARIES ${SOURCE_DIR}/src/libvoro++.a)
+    set(VORO_INCLUDE_DIRS ${SOURCE_DIR}/src)
+    list(APPEND LAMMPS_DEPS voro_build)
+  else()
+    find_package(VORO)
+    if(NOT VORO_FOUND)
+      message(FATAL_ERROR "VORO not found, help CMake to find it by setting VORO_LIBRARY and VORO_INCLUDE_DIR, or set DOWNLOAD_VORO=ON to download it")
+    endif()
+  endif()
  include_directories(${VORO_INCLUDE_DIRS})
  list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
 endif()

-if(ENABLE_LATTE)
-  find_package(LATTE QUIET)
-  if(NOT LATTE_FOUND)
+if(PKG_LATTE)
+  option(DOWNLOAD_LATTE "Download latte (instead of using the system's one)" OFF)
+  if(DOWNLOAD_LATTE)
    message(STATUS "LATTE not found - we will build our own")
    include(ExternalProject)
    ExternalProject_Add(latte_build
@ -244,55 +304,76 @@ if(ENABLE_LATTE)
    ExternalProject_get_property(latte_build INSTALL_DIR)
    set(LATTE_LIBRARIES ${INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/liblatte.a)
    list(APPEND LAMMPS_DEPS latte_build)
+  else()
+    find_package(LATTE)
+    if(NOT LATTE_FOUND)
+      message(FATAL_ERROR "LATTE not found, help CMake to find it by setting LATTE_LIBRARY, or set DOWNLOAD_LATTE=ON to download it")
+    endif()
  endif()
-  list(APPEND LAMMPS_LINK_LIBS ${LATTE_LIBRARIES} ${LAPACK_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+  list(APPEND LAMMPS_LINK_LIBS ${LATTE_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()

-if(ENABLE_USER-MOLFILE)
+if(PKG_USER-MOLFILE)
  add_library(molfile INTERFACE)
  target_include_directories(molfile INTERFACE ${LAMMPS_LIB_SOURCE_DIR}/molfile)
  target_link_libraries(molfile INTERFACE ${CMAKE_DL_LIBS})
  list(APPEND LAMMPS_LINK_LIBS molfile)
 endif()

-if(ENABLE_USER-NETCDF)
+if(PKG_USER-NETCDF)
  find_package(NetCDF REQUIRED)
  include_directories(NETCDF_INCLUDE_DIR)
  list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARY})
  add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
 endif()

-if(ENABLE_USER-SMD)
-  find_package(Eigen3 REQUIRED)
+if(PKG_USER-SMD)
+  option(DOWNLOAD_Eigen3 "Download Eigen3 (instead of using the system's one)" OFF)
+  if(DOWNLOAD_Eigen3)
+    include(ExternalProject)
+    ExternalProject_Add(Eigen3_build
+      URL http://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz 
+      URL_MD5 1a47e78efe365a97de0c022d127607c3
+      CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND ""
+    )
+    ExternalProject_get_property(Eigen3_build SOURCE_DIR)
+    set(EIGEN3_INCLUDE_DIR ${SOURCE_DIR})
+    list(APPEND LAMMPS_DEPS Eigen3_build)
+  else()
+    find_package(Eigen3)
+    if(NOT Eigen3_FOUND)
+      message(FATAL_ERROR "Eigen3 not found, help CMake to find it by setting EIGEN3_INCLUDE_DIR, or set DOWNLOAD_Eigen3=ON to download it")
+    endif()
+  endif()
  include_directories(${EIGEN3_INCLUDE_DIR})
 endif()

-if(ENABLE_USER-QUIP)
+if(PKG_USER-QUIP)
  find_package(QUIP REQUIRED)
-  list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${LAPACK_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+  list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${LAPACK_LIBRARIES})
 endif()

-if(ENABLE_USER-QMMM)
+if(PKG_USER-QMMM)
+  message(WARNING "Building QMMM with CMake is still experimental")
  find_package(QE REQUIRED)
  include_directories(${QE_INCLUDE_DIRS})
-  list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES} ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+  list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES})
 endif()

-if(ENABLE_USER-VTK)
+if(PKG_USER-VTK)
  find_package(VTK REQUIRED NO_MODULE)
  include(${VTK_USE_FILE})
  add_definitions(-DLAMMPS_VTK)
  list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
 endif()

-if(ENABLE_KIM)
-  find_package(KIM QUIET)
-  if(NOT KIM_FOUND)
-    message(STATUS "KIM not found - we will build our own")
+if(PKG_KIM)
+  option(DOWNLOAD_KIM "Download kim-api (instead of using the system's one)" OFF)
+  if(DOWNLOAD_KIM)
    include(ExternalProject)
    ExternalProject_Add(kim_build
-      URL https://github.com/openkim/kim-api/archive/v1.9.4.tar.gz
-      URL_MD5 f4d35a1705eed46d64c7c0ab448ff3e0
+      URL https://github.com/openkim/kim-api/archive/v1.9.5.tar.gz
+      URL_MD5 9f66efc128da33039e30659f36fc6d00
      BUILD_IN_SOURCE 1
      CONFIGURE_COMMAND <SOURCE_DIR>/configure --prefix=<INSTALL_DIR>
      )
@ -300,41 +381,62 @@ if(ENABLE_KIM)
    set(KIM_INCLUDE_DIRS ${INSTALL_DIR}/include/kim-api-v1)
    set(KIM_LIBRARIES ${INSTALL_DIR}/lib/libkim-api-v1.so)
    list(APPEND LAMMPS_DEPS kim_build)
+  else()
+    find_package(KIM)
+    if(NOT KIM_FOUND)
+      message(FATAL_ERROR "KIM not found, help CMake to find it by setting KIM_LIBRARY and KIM_INCLUDE_DIR, or set DOWNLOAD_KIM=ON to download it")
+    endif()
  endif()
  list(APPEND LAMMPS_LINK_LIBS ${KIM_LIBRARIES})
  include_directories(${KIM_INCLUDE_DIRS})
 endif()

-if(ENABLE_MSCG)
+if(PKG_MSCG)
  find_package(GSL REQUIRED)
-  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/mscg)
-  set(MSCG_TARBALL ${LAMMPS_LIB_MSCG_BIN_DIR}/MS-CG-master.zip)
-  set(LAMMPS_LIB_MSCG_BIN_DIR ${LAMMPS_LIB_MSCG_BIN_DIR}/MSCG-release-master/src)
-  if(NOT EXISTS ${LAMMPS_LIB_MSCG_BIN_DIR})
-    if(NOT EXISTS ${MSCG_TARBALL})
-      message(STATUS "Downloading ${MSCG_TARBALL}")
-      file(DOWNLOAD
-        https://github.com/uchicago-voth/MSCG-release/archive/master.zip
-        ${MSCG_TARBALL} SHOW_PROGRESS) #EXPECTED_MD5 cannot be due due to master
+  option(DOWNLOAD_MSCG "Download MSCG (instead of using the system's one)" OFF)
+  if(DOWNLOAD_MSCG)
+    include(ExternalProject)
+    if(NOT LAPACK_FOUND)
+      set(EXTRA_MSCG_OPTS "-DLAPACK_LIBRARIES=${CMAKE_CURRENT_BINARY_DIR}/liblinalg.a")
+    endif()
+    ExternalProject_Add(mscg_build
+      URL https://github.com/uchicago-voth/MSCG-release/archive/1.7.3.1.tar.gz
+      URL_MD5 8c45e269ee13f60b303edd7823866a91
+      SOURCE_SUBDIR src/CMake
+      CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> -DCMAKE_POSITION_INDEPENDENT_CODE=${CMAKE_POSITION_INDEPENDENT_CODE} ${EXTRA_MSCG_OPTS}
+      BUILD_COMMAND make mscg INSTALL_COMMAND ""
+      )
+    ExternalProject_get_property(mscg_build BINARY_DIR)
+    set(MSCG_LIBRARIES ${BINARY_DIR}/libmscg.a)
+    ExternalProject_get_property(mscg_build SOURCE_DIR)
+    set(MSCG_INCLUDE_DIRS ${SOURCE_DIR}/src)
+    list(APPEND LAMMPS_DEPS mscg_build)
+    if(NOT LAPACK_FOUND)
+      file(MAKE_DIRECTORY ${MSCG_INCLUDE_DIRS})
+      add_dependencies(mscg_build linalg)
+    endif()
+  else()
+    find_package(MSCG)
+    if(NOT MSCG_FOUND)
+      message(FATAL_ERROR "MSCG not found, help CMake to find it by setting MSCG_LIBRARY and MSCG_INCLUDE_DIRS, or set DOWNLOAD_MSCG=ON to download it")
    endif()
-    message(STATUS "Unpacking ${MSCG_TARBALL}")
-    execute_process(COMMAND ${CMAKE_COMMAND} -E tar xvf ${MSCG_TARBALL}
-      WORKING_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/mscg)
  endif()
-  file(GLOB MSCG_SOURCES ${LAMMPS_LIB_MSCG_BIN_DIR}/*.cpp)
-  add_library(mscg STATIC ${MSCG_SOURCES})
-  list(APPEND LAMMPS_LINK_LIBS mscg)
-  target_compile_options(mscg PRIVATE -DDIMENSION=3 -D_exclude_gromacs=1)
-  target_include_directories(mscg PUBLIC ${LAMMPS_LIB_MSCG_BIN_DIR})
-  target_link_libraries(mscg ${GSL_LIBRARIES} ${LAPACK_LIBRARIES})
+  list(APPEND LAMMPS_LINK_LIBS ${MSCG_LIBRARIES} ${GSL_LIBRARIES} ${LAPACK_LIBRARIES})
+  include_directories(${MSCG_INCLUDE_DIRS})
+endif()
+
+if(PKG_COMPRESS)
+  find_package(ZLIB REQUIRED)
+  include_directories(${ZLIB_INCLUDE_DIRS})
+  list(APPEND LAMMPS_LINK_LIBS ${ZLIB_LIBRARIES})
 endif()

 ########################################################################
 # Basic system tests (standard libraries, headers, functions, types)   #
 ########################################################################
-include(CheckIncludeFile)
+include(CheckIncludeFileCXX)
 foreach(HEADER math.h)
-  check_include_file(${HEADER} FOUND_${HEADER})
+  check_include_file_cxx(${HEADER} FOUND_${HEADER})
  if(NOT FOUND_${HEADER})
    message(FATAL_ERROR "Could not find needed header - ${HEADER}")
  endif(NOT FOUND_${HEADER})
@ -378,7 +480,7 @@ foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES})
      DetectAndRemovePackageHeader(${LAMMPS_SOURCE_DIR}/${FNAME})
  endforeach()

-  if(ENABLE_${PKG})
+  if(PKG_${PKG})
    # detects styles in package and adds them to global list
    RegisterStyles(${${PKG}_SOURCES_DIR})

@ -392,7 +494,7 @@ endforeach()
 ############################################
 foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD
  USER-QMMM)
-  if(ENABLE_${SIMPLE_LIB})
+  if(PKG_${SIMPLE_LIB})
    string(REGEX REPLACE "^USER-" "" PKG_LIB "${SIMPLE_LIB}")
    string(TOLOWER "${PKG_LIB}" PKG_LIB)
    file(GLOB_RECURSE ${PKG_LIB}_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/${PKG_LIB}/*.F
@ -413,40 +515,26 @@ foreach(SIMPLE_LIB REAX MEAM POEMS USER-ATC USER-AWPMD USER-COLVARS USER-H5MD
  endif()
 endforeach()

-if(ENABLE_USER-AWPMD)
+if(PKG_USER-AWPMD)
  target_link_libraries(awpmd ${LAPACK_LIBRARIES})
 endif()

-if(ENABLE_USER-ATC)
+if(PKG_USER-ATC)
  target_link_libraries(atc ${LAPACK_LIBRARIES})
 endif()

-if(ENABLE_USER-H5MD)
+if(PKG_USER-H5MD)
  find_package(HDF5 REQUIRED)
  target_link_libraries(h5md ${HDF5_LIBRARIES})
  target_include_directories(h5md PRIVATE ${HDF5_INCLUDE_DIRS})
 endif()

-if(ENABLE_MEAM AND FC_HAS_NO_SECOND_UNDERSCORE)
-  foreach(FSRC ${meam_SOURCES})
-    string(REGEX REPLACE "^.*\\." "" FEXT "${FSRC}")
-    list(FIND CMAKE_Fortran_SOURCE_FILE_EXTENSIONS "${FEXT}" FINDEX)
-    if(FINDEX GREATER -1)
-      set_property(SOURCE ${FSRC} APPEND PROPERTY COMPILE_FLAGS "-fno-second-underscore")
-    endif()
-  endforeach()
-endif()
-
-if(ENABLE_REAX AND FC_HAS_NO_SECOND_UNDERSCORE)
-  target_compile_options(reax PRIVATE -fno-second-underscore)
-endif()
-

 ######################################################################
 # packages which selectively include variants based on enabled styles
 # e.g. accelerator packages
 ######################################################################
-if(ENABLE_USER-OMP)
+if(PKG_USER-OMP)
    set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP)
    set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp
                         ${USER-OMP_SOURCES_DIR}/thr_omp.cpp
@ -463,7 +551,7 @@ if(ENABLE_USER-OMP)
    include_directories(${USER-OMP_SOURCES_DIR})
 endif()

-if(ENABLE_KOKKOS)
+if(PKG_KOKKOS)
  set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
  set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
  add_definitions(-DLMP_KOKKOS)
@ -499,7 +587,7 @@ if(ENABLE_KOKKOS)
  RegisterNBinStyle(${KOKKOS_PKG_SOURCES_DIR}/nbin_kokkos.h)
  RegisterNPairStyle(${KOKKOS_PKG_SOURCES_DIR}/npair_kokkos.h)

-  if(ENABLE_USER-DPD)
+  if(PKG_USER-DPD)
    get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES)
    list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/npair_ssa_kokkos.cpp)
    RegisterNPairStyle(${KOKKOS_PKG_SOURCES_DIR}/npair_ssa_kokkos.h)
@ -512,7 +600,7 @@ if(ENABLE_KOKKOS)
  include_directories(${KOKKOS_PKG_SOURCES_DIR})
 endif()

-if(ENABLE_OPT)
+if(PKG_OPT)
    set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT)
    set(OPT_SOURCES)
    set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}")
@ -526,7 +614,30 @@ if(ENABLE_OPT)
    include_directories(${OPT_SOURCES_DIR})
 endif()

-if(ENABLE_USER-INTEL)
+if(PKG_USER-INTEL)
+    if(NOT DEVELOPER_MODE)
+      if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+        message(FATAL_ERROR "USER-INTEL is only useful together with intel compiler")
+      endif()
+      if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 16)
+        message(FATAL_ERROR "USER-INTEL is needed at least 2016 intel compiler, found ${CMAKE_CXX_COMPILER_VERSION}")
+      endif()
+    endif()
+    option(INJECT_INTEL_FLAG "Inject OMG fast flags for USER-INTEL" ON)
+    if(INJECT_INTEL_FLAG AND CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
+      if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xCOMMON-AVX512")
+      else()
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost")
+      endif()
+      include(CheckCXXCompilerFlag)
+      foreach(_FLAG -qopenmp -qno-offload -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -O2 "-fp-model fast=2" -no-prec-div -qoverride-limits -qopt-zmm-usage=high)
+        # flags contain '-', '=' and spaces; derive a clean name for the cached check result
+        string(REGEX REPLACE "[^A-Za-z0-9_]" "_" _FLAG_ID "${_FLAG}")
+        check_cxx_compiler_flag("${_FLAG}" COMPILER_SUPPORTS${_FLAG_ID})
+        if(COMPILER_SUPPORTS${_FLAG_ID})
+          set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${_FLAG}")
+        endif()
+      endforeach()
+    endif()
    set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
    set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/intel_preprocess.h
                           ${USER-INTEL_SOURCES_DIR}/intel_buffers.h
@ -550,7 +661,10 @@ if(ENABLE_USER-INTEL)
    include_directories(${USER-INTEL_SOURCES_DIR})
 endif()

-if(ENABLE_GPU)
+if(PKG_GPU)
+    if (CMAKE_VERSION VERSION_LESS "3.1") 
+      message(FATAL_ERROR "For the GPU package you need at least cmake-3.1")
+    endif()
    set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
    set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
                    ${GPU_SOURCES_DIR}/fix_gpu.h
@ -647,7 +761,7 @@ if(ENABLE_GPU)
      add_library(gpu STATIC ${GPU_LIB_SOURCES})
      target_link_libraries(gpu ${OpenCL_LIBRARIES})
      target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu ${OpenCL_INCLUDE_DIRS})
-      target_compile_definitions(gpu PRIVATE -D_${GPU_PREC} -DMPI_GERYON -DUCL_NO_EXIT)
+      target_compile_definitions(gpu PRIVATE -D_${GPU_PREC} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
      target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)

      list(APPEND LAMMPS_LINK_LIBS gpu)
@ -687,6 +801,12 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR})
 # Actually add executable and lib to build
 ############################################
 add_library(lammps ${LIB_SOURCES})
+get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
+list (FIND LANGUAGES "Fortran" _index)
+if (${_index} GREATER -1)
+  list(APPEND LAMMPS_LINK_LIBS ${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES})
+endif()
+list(REMOVE_DUPLICATES LAMMPS_LINK_LIBS)
 target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
 if(LAMMPS_DEPS)
  add_dependencies(lammps ${LAMMPS_DEPS})
@ -708,35 +828,93 @@ if(ENABLE_TESTING)
  add_test(ShowHelp lmp${LAMMPS_MACHINE} -help)
 endif()

-##################################
+###############################################################################
+# Testing
+#
+# Requires latest gcovr (for GCC 8.1 support):
+# pip install git+https://github.com/gcovr/gcovr.git
+###############################################################################
+if(ENABLE_COVERAGE)
+    find_program(GCOVR_BINARY gcovr)
+    find_package_handle_standard_args(GCOVR DEFAULT_MSG GCOVR_BINARY)
+
+    if(GCOVR_FOUND)
+        get_filename_component(ABSOLUTE_LAMMPS_SOURCE_DIR ${LAMMPS_SOURCE_DIR} ABSOLUTE)
+
+        add_custom_target(
+            gen_coverage_xml
+            COMMAND ${GCOVR_BINARY} -s -x -r ${ABSOLUTE_LAMMPS_SOURCE_DIR} --object-directory=${CMAKE_BINARY_DIR} -o coverage.xml
+            WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+            COMMENT "Generating XML Coverage Report..."
+        )
+
+        add_custom_target(
+            gen_coverage_html
+            COMMAND ${GCOVR_BINARY} -s  --html --html-details -r ${ABSOLUTE_LAMMPS_SOURCE_DIR} --object-directory=${CMAKE_BINARY_DIR} -o coverage.html
+            WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
+            COMMENT "Generating HTML Coverage Report..."
+        )
+    endif()
+endif()
+
+###############################################################################
 # Print package summary
-##################################
+###############################################################################
 foreach(PKG ${DEFAULT_PACKAGES} ${OTHER_PACKAGES} ${ACCEL_PACKAGES})
-  if(ENABLE_${PKG})
+  if(PKG_${PKG})
    message(STATUS "Building package: ${PKG}")
  endif()
 endforeach()

 string(TOUPPER "${CMAKE_BUILD_TYPE}" BTYPE)
+get_directory_property(CPPFLAGS DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
+include(FeatureSummary)
+feature_summary(INCLUDE_QUIET_PACKAGES WHAT ALL)
 message(STATUS "<<< Build configuration >>>
   Build type       ${CMAKE_BUILD_TYPE}
   Install path     ${CMAKE_INSTALL_PREFIX}
   Compilers and Flags:
   C++ Compiler     ${CMAKE_CXX_COMPILER}
       Type         ${CMAKE_CXX_COMPILER_ID}
-   C++ Flags        ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${BTYPE}}")
+       Version      ${CMAKE_CXX_COMPILER_VERSION}
+   C++ Flags        ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${BTYPE}}
+   Defines          ${CPPFLAGS}")
 get_property(LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
-if(LANGUAGES MATCHES ".*Fortran.*")
+list (FIND LANGUAGES "Fortran" _index)
+if (${_index} GREATER -1)
  message(STATUS "Fortran Compiler ${CMAKE_Fortran_COMPILER} 
           Type     ${CMAKE_Fortran_COMPILER_ID}
+           Version  ${CMAKE_Fortran_COMPILER_VERSION}
   Fortran Flags    ${CMAKE_Fortran_FLAGS} ${CMAKE_Fortran_FLAGS_${BTYPE}}")
 endif()
-message(STATUS "Linker flags:
+list (FIND LANGUAGES "C" _index)
+if (${_index} GREATER -1)
+  message(STATUS "C Compiler ${CMAKE_C_COMPILER} 
+     Type     ${CMAKE_C_COMPILER_ID}
+     Version  ${CMAKE_C_COMPILER_VERSION}
+     C Flags  ${CMAKE_C_FLAGS} ${CMAKE_C_FLAGS_${BTYPE}}")
+endif()
+if(CMAKE_EXE_LINKER_FLAGS)
+  message(STATUS "Linker flags:
   Executable      ${CMAKE_EXE_LINKER_FLAGS}")
+  endif()
 if(BUILD_SHARED_LIBS)
-  message(STATUS "Shared libries  ${CMAKE_SHARED_LINKER_FLAGS}")
+  message(STATUS "Shared libraries  ${CMAKE_SHARED_LINKER_FLAGS}")
 else()
-  message(STATUS "Static libries  ${CMAKE_STATIC_LINKER_FLAGS}")
+  message(STATUS "Static libraries  ${CMAKE_STATIC_LINKER_FLAGS}")
 endif()
 message(STATUS "Link libraries: ${LAMMPS_LINK_LIBS}")
-
+if(BUILD_MPI)
+  message(STATUS "Using mpi with headers in ${MPI_CXX_INCLUDE_PATH} and ${MPI_CXX_LIBRARIES}")
+endif()
+if(PKG_GPU)  # print the GPU settings only when the GPU package is enabled
+  message(STATUS "GPU Api: ${GPU_API}")
+  if(GPU_API STREQUAL "CUDA")
+    message(STATUS "GPU Arch: ${GPU_ARCH}")
+  elseif(GPU_API STREQUAL "OpenCL")
+    message(STATUS "OCL Tune: ${OCL_TUNE}")
+  endif()
+endif()
+if(PKG_KSPACE)
+  message(STATUS "Using ${FFT} as FFT")
+endif()
--- a/cmake/Modules/FindFFTW2.cmake
+++ b/cmake/Modules/FindFFTW2.cmake
@ -1,22 +0,0 @@
-# - Find fftw2
-# Find the native FFTW2 headers and libraries.
-#
-#  FFTW2_INCLUDE_DIRS - where to find fftw2.h, etc.
-#  FFTW2_LIBRARIES    - List of libraries when using fftw2.
-#  FFTW2_FOUND        - True if fftw2 found.
-#
-
-find_path(FFTW2_INCLUDE_DIR fftw.h)
-
-find_library(FFTW2_LIBRARY NAMES fftw)
-
-set(FFTW2_LIBRARIES ${FFTW2_LIBRARY})
-set(FFTW2_INCLUDE_DIRS ${FFTW2_INCLUDE_DIR})
-
-include(FindPackageHandleStandardArgs)
-# handle the QUIETLY and REQUIRED arguments and set FFTW2_FOUND to TRUE
-# if all listed variables are TRUE
-
-find_package_handle_standard_args(FFTW2 DEFAULT_MSG FFTW2_LIBRARY FFTW2_INCLUDE_DIR)
-
-mark_as_advanced(FFTW2_INCLUDE_DIR FFTW2_LIBRARY )
--- a/cmake/Modules/FindFFTW3F.cmake
+++ b/cmake/Modules/FindFFTW3F.cmake
@ -0,0 +1,25 @@
+# - Find fftw3f
+# Find the native FFTW3F headers and libraries.
+#
+#  FFTW3F_INCLUDE_DIRS - where to find fftw3f.h, etc.
+#  FFTW3F_LIBRARIES    - List of libraries when using fftw3f.
+#  FFTW3F_FOUND        - True if fftw3f found.
+#
+
+find_package(PkgConfig)
+
+pkg_check_modules(PC_FFTW3F fftw3f)
+find_path(FFTW3F_INCLUDE_DIR fftw3.h HINTS ${PC_FFTW3F_INCLUDE_DIRS})
+
+find_library(FFTW3F_LIBRARY NAMES fftw3f HINTS ${PC_FFTW3F_LIBRARY_DIRS})
+
+set(FFTW3F_LIBRARIES ${FFTW3F_LIBRARY})
+set(FFTW3F_INCLUDE_DIRS ${FFTW3F_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set FFTW3F_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(FFTW3F DEFAULT_MSG FFTW3F_LIBRARY FFTW3F_INCLUDE_DIR)
+
+mark_as_advanced(FFTW3F_INCLUDE_DIR FFTW3F_LIBRARY )
--- a/cmake/Modules/FindMSCG.cmake
+++ b/cmake/Modules/FindMSCG.cmake
@ -0,0 +1,22 @@
+# - Find mscg
+# Find the native MSCG headers and libraries.
+#
+#  MSCG_INCLUDE_DIRS - where to find mscg.h, etc.
+#  MSCG_LIBRARIES    - List of libraries when using mscg.
+#  MSCG_FOUND        - True if mscg found.
+#
+
+find_path(MSCG_INCLUDE_DIR mscg.h PATH_SUFFIXES mscg)
+
+find_library(MSCG_LIBRARY NAMES mscg)
+
+set(MSCG_LIBRARIES ${MSCG_LIBRARY})
+set(MSCG_INCLUDE_DIRS ${MSCG_INCLUDE_DIR})
+
+include(FindPackageHandleStandardArgs)
+# handle the QUIETLY and REQUIRED arguments and set MSCG_FOUND to TRUE
+# if all listed variables are TRUE
+
+find_package_handle_standard_args(MSCG DEFAULT_MSG MSCG_LIBRARY MSCG_INCLUDE_DIR)
+
+mark_as_advanced(MSCG_INCLUDE_DIR MSCG_LIBRARY )
--- a/cmake/README.md
+++ b/cmake/README.md
--- a/doc/Makefile
+++ b/doc/Makefile
@ -9,6 +9,7 @@ TXT2RST       = $(VENV)/bin/txt2rst
 ANCHORCHECK   = $(VENV)/bin/doc_anchor_check

 PYTHON        = $(shell which python3)
+VIRTUALENV     = virtualenv
 HAS_PYTHON3    = NO
 HAS_VIRTUALENV = NO

@ -16,7 +17,13 @@ ifeq ($(shell which python3 >/dev/null 2>&1; echo $$?), 0)
 HAS_PYTHON3 = YES
 endif

+ifeq ($(shell which virtualenv-3 >/dev/null 2>&1; echo $$?), 0)
+VIRTUALENV     = virtualenv-3
+HAS_VIRTUALENV = YES
+endif
+
 ifeq ($(shell which virtualenv >/dev/null 2>&1; echo $$?), 0)
+VIRTUALENV     = virtualenv
 HAS_VIRTUALENV = YES
 endif

@ -158,7 +165,7 @@ $(VENV):
 	@if [ "$(HAS_PYTHON3)" == "NO" ] ; then echo "Python3 was not found! Please check README.md for further instructions" 1>&2; exit 1; fi
 	@if [ "$(HAS_VIRTUALENV)" == "NO" ] ; then echo "virtualenv was not found! Please check README.md for further instructions" 1>&2; exit 1; fi
 	@( \
-		virtualenv -p $(PYTHON) $(VENV); \
+		$(VIRTUALENV) -p $(PYTHON) $(VENV); \
 		. $(VENV)/bin/activate; \
 		pip install Sphinx; \
 		pip install sphinxcontrib-images; \
--- a/doc/src/Eqs/dihedral_table_cut.jpg
+++ b/doc/src/Eqs/dihedral_table_cut.jpg
--- a/doc/src/Eqs/dihedral_table_cut.tex
+++ b/doc/src/Eqs/dihedral_table_cut.tex
@ -0,0 +1,11 @@
+\documentclass[12pt]{article}
+\pagestyle{empty}
+\begin{document}
+
+\begin{eqnarray*}
+        f(\theta) & = & K \qquad\qquad\qquad\qquad\qquad\qquad \theta < \theta_1 \\
+        f(\theta) & = & K \left(1-\frac{(\theta - \theta_1)^2}{(\theta_2 - \theta_1)^2}\right) \qquad \theta_1 < \theta < \theta_2
+\end{eqnarray*}
+
+\end{document}
+
--- a/doc/src/Manual.txt
+++ b/doc/src/Manual.txt
@ -1,7 +1,7 @@
 <!-- HTML_ONLY -->
 <HEAD>
 <TITLE>LAMMPS Users Manual</TITLE>
-<META NAME="docnumber" CONTENT="11 May 2018 version">
+<META NAME="docnumber" CONTENT="22 Jun 2018 version">
 <META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
 <META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
 </HEAD>
@ -19,7 +19,7 @@
 :line

 LAMMPS Documentation :c,h1
-11 May 2018 version :c,h2
+22 Jun 2018 version :c,h2

 Version info: :h3

--- a/doc/src/Section_commands.txt
+++ b/doc/src/Section_commands.txt
@ -129,6 +129,17 @@ region 1 block $((xlo+xhi)/2+sqrt(v_area)) 2 INF INF EDGE EDGE :pre

 so that you do not have to define (or discard) a temporary variable X.

+Additionally, the "immediate" variable expression may be followed by a
+colon, followed by a C-style format string, e.g. ":%f" or ":%.10g".
+The format string must be appropriate for a double-precision
+floating-point value.  The format string is used to output the result
+of the variable expression evaluation.  If a format string is not
+specified a high-precision "%.20g" is used as the default.
+
+This can be useful for formatting print output to a desired precision:
+
+print "Final energy per atom: $(pe/atoms:%10.3f) eV/atom" :pre
+
 Note that neither the curly-bracket or immediate form of variables can
 contain nested $ characters for other variables to substitute for.
 Thus you cannot do this:
@ -1212,7 +1223,8 @@ package"_Section_start.html#start_3.
 "nharmonic (o)"_dihedral_nharmonic.html,
 "quadratic (o)"_dihedral_quadratic.html,
 "spherical (o)"_dihedral_spherical.html,
-"table (o)"_dihedral_table.html :tb(c=4,ea=c)
+"table (o)"_dihedral_table.html,
+"table/cut"_dihedral_table_cut.html :tb(c=4,ea=c)

 :line

--- a/doc/src/Section_errors.txt
+++ b/doc/src/Section_errors.txt
@ -803,6 +803,13 @@ lo value must be less than the hi value for all 3 dimensions. :dd
 The box command cannot be used after a read_data, read_restart, or
 create_box command. :dd

+{BUG: restartinfo=1 but no restart support in pair style} :dt
+
+The pair style has a bug, where it does not support reading
+and writing information to a restart file, but does not set
+the member variable restartinfo to 0 as required in that case. :dd
+
+
 {CPU neighbor lists must be used for ellipsoid/sphere mix.} :dt

 When using Gay-Berne or RE-squared pair styles with both ellipsoidal and
--- a/doc/src/create_bonds.txt
+++ b/doc/src/create_bonds.txt
@ -37,8 +37,8 @@ keyword = {special} :l

 create_bonds many all all 1 1.0 1.2
 create_bonds many surf solvent 3 2.0 2.4
-create_bond single/bond 1 1 2
-create_bond single/angle 5 52 98 107 special no :pre
+create_bonds single/bond 1 1 2
+create_bonds single/angle 5 52 98 107 special no :pre

 [Description:]

--- a/doc/src/dihedral_table_cut.txt
+++ b/doc/src/dihedral_table_cut.txt
@ -0,0 +1,205 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+dihedral_style table/cut command :h3
+
+[Syntax:]
+
+dihedral_style table/cut style Ntable :pre
+
+style = {linear} or {spline} = method of interpolation
+Ntable = size of the internal lookup table :ul
+
+[Examples:]
+
+dihedral_style table/cut spline 400
+dihedral_style table/cut linear 1000
+dihedral_coeff 1 aat 1.0 177 180 file.table DIH_TABLE1
+dihedral_coeff 2 aat 0.5 170 180 file.table DIH_TABLE2 :pre
+
+[Description:]
+
+The {table/cut} dihedral style creates interpolation tables of length
+{Ntable} from dihedral potential and derivative values listed in a
+file(s) as a function of the dihedral angle "phi".  In addition, an
+analytic cutoff that is quadratic in the bond-angle (theta) is applied
+in order to regularize the dihedral interaction.  The dihedral table
+files are read by the "dihedral_coeff"_dihedral_coeff.html command.
+
+The interpolation tables are created by fitting cubic splines to the
+file values and interpolating energy and derivative values at each of
+{Ntable} dihedral angles. During a simulation, these tables are used
+to interpolate energy and force values on individual atoms as
+needed. The interpolation is done in one of 2 styles: {linear} or
+{spline}.
+
+For the {linear} style, the dihedral angle (phi) is used to find 2
+surrounding table values from which an energy or its derivative is
+computed by linear interpolation.
+
+For the {spline} style, cubic spline coefficients are computed and
+stored at each of the {Ntable} evenly-spaced values in the
+interpolated table.  For a given dihedral angle (phi), the appropriate
+coefficients are chosen from this list, and a cubic polynomial is used
+to compute the energy and the derivative at this angle.
+
+The following coefficients must be defined for each dihedral type via
+the "dihedral_coeff"_dihedral_coeff.html command as in the example
+above.
+
+style (aat)
+cutoff prefactor
+cutoff angle1
+cutoff angle2
+filename
+keyword :ul
+
+The cutoff dihedral style uses a tabulated dihedral interaction with a 
+cutoff function:
+
+:c,image(Eqs/dihedral_table_cut.jpg)
+
+The cutoff prefactor specifies a prefactor to the cutoff function.  While this
+value would ordinarily equal 1, there may be situations where the value should change.
+
+The cutoff angle1 specifies the angle (in degrees) below which the dihedral
+interaction is unmodified, i.e. the cutoff function is 1.
+
+The cutoff function is applied between angle1 and angle2, which is the angle at
+which the cutoff function drops to zero.  The value of zero effectively "turns
+off" the dihedral interaction.
+
+The filename specifies a file containing tabulated energy and
+derivative values. The keyword specifies a section of the file.  The
+format of this file is described below.
+
+:line
+
+The format of a tabulated file is as follows (without the
+parenthesized comments).  It can begin with one or more comment
+or blank lines.
+
+# Table of the potential and its negative derivative  :pre
+
+DIH_TABLE1                   (keyword is the first text on line)
+N 30 DEGREES                 (N, NOF, DEGREES, RADIANS, CHECKU/F)
+                             (blank line)
+1 -168.0 -1.40351172223 0.0423346818422
+2 -156.0 -1.70447981034 0.00811786522531
+3 -144.0 -1.62956100432 -0.0184129719987
+...
+30 180.0 -0.707106781187 0.0719306095245 :pre
+
+# Example 2: table of the potential. Forces omitted :pre
+
+DIH_TABLE2
+N 30 NOF CHECKU testU.dat CHECKF testF.dat :pre
+
+1 -168.0 -1.40351172223
+2 -156.0 -1.70447981034
+3 -144.0 -1.62956100432
+...
+30 180.0 -0.707106781187 :pre
+
+A section begins with a non-blank line whose 1st character is not a
+"#"; blank lines or lines starting with "#" can be used as comments
+between sections. The first line begins with a keyword which
+identifies the section. The line can contain additional text, but the
+initial text must match the argument specified in the
+"dihedral_coeff"_dihedral_coeff.html command. The next line lists (in
+any order) one or more parameters for the table. Each parameter is a
+keyword followed by one or more numeric values.
+
+Following a blank line, the next N lines list the tabulated values. On
+each line, the 1st value is the index from 1 to N, the 2nd value is
+the angle value, the 3rd value is the energy (in energy units), and
+the 4th is -dE/d(phi) (also in energy units). The 3rd term is the
+energy of the 4-atom configuration for the specified angle.  The 4th
+term (when present) is the negative derivative of the energy with
+respect to the angle (in degrees, or radians depending on whether the
+user selected DEGREES or RADIANS).  Thus the units of the last term
+are still energy, not force. The dihedral angle values must increase
+from one line to the next.
+
+Dihedral table splines are cyclic.  There is no discontinuity at 180
+degrees (or at any other angle).  Although in the examples above, the
+angles range from -180 to 180 degrees, in general, the first angle in
+the list can have any value (positive, zero, or negative).  However
+the {range} of angles represented in the table must be {strictly} less
+than 360 degrees (2pi radians) to avoid angle overlap.  (You may not
+supply entries in the table for both 180 and -180, for example.)  If
+the user's table covers only a narrow range of dihedral angles,
+strange numerical behavior can occur in the large remaining gap.
+
+[Parameters:]
+
+The parameter "N" is required and its value is the number of table
+entries that follow. Note that this may be different than the N
+specified in the "dihedral_style table"_dihedral_style.html command.
+Let {Ntable} be the number of table entries requested in the dihedral_style
+command, and let {Nfile} be the parameter following "N" in the
+tabulated file ("30" in the sparse example above).  What LAMMPS does
+is a preliminary interpolation by creating splines using the {Nfile}
+tabulated values as nodal points.  It uses these to interpolate as
+needed to generate energy and derivative values at {Ntable} different
+points (which are evenly spaced over a 360 degree range, even if the
+angles in the file are not).  The resulting tables of length {Ntable}
+are then used as described above, when computing energy and force for
+individual dihedral angles and their atoms.  This means that if you
+want the interpolation tables of length {Ntable} to match exactly what
+is in the tabulated file (with effectively no preliminary
+interpolation), you should set {Ntable} = {Nfile}.  To insure the
+nodal points in the user's file are aligned with the interpolated
+table entries, the angles in the table should be integer multiples of
+360/{Ntable} degrees, or 2*PI/{Ntable} radians (depending on your
+choice of angle units).
+
+The optional "NOF" keyword allows the user to omit the forces
+(negative energy derivatives) from the table file (normally located in
+the 4th column).  In their place, forces will be calculated
+automatically by differentiating the potential energy function
+indicated by the 3rd column of the table (using either linear or
+spline interpolation).
+
+The optional "DEGREES" keyword allows the user to specify angles in
+degrees instead of radians (default).
+
+The optional "RADIANS" keyword allows the user to specify angles in
+radians instead of degrees.  (Note: This changes the way the forces
+are scaled in the 4th column of the data file.)
+
+The optional "CHECKU" keyword is followed by a filename.  This allows
+the user to save all of the {Ntable} different entries in the
+interpolated energy table to a file to make sure that the interpolated
+function agrees with the user's expectations.  (Note: You can
+temporarily increase the {Ntable} parameter to a high value for this
+purpose.  "{Ntable}" is explained above.)
+
+The optional "CHECKF" keyword is analogous to the "CHECKU" keyword.
+It is followed by a filename, and it allows the user to check the
+interpolated force table.  This option is available even if the user
+selected the "NOF" option.
+
+Note that one file can contain many sections, each with a tabulated
+potential. LAMMPS reads the file section by section until it finds one
+that matches the specified keyword.
+
+[Restrictions:]
+
+This dihedral style can only be used if LAMMPS was built with the
+USER-MISC package.  See the "Making LAMMPS"_Section_start.html#start_3
+section for more info on packages.
+
+[Related commands:]
+
+"dihedral_coeff"_dihedral_coeff.html, "dihedral_style table"_dihedral_table.html
+
+[Default:] none
+
+:link(dihedralcut-Salerno)
+[(Salerno)] Salerno, Bernstein, J Chem Theory Comput, --, ---- (2018).
--- a/doc/src/dihedrals.txt
+++ b/doc/src/dihedrals.txt
@ -19,6 +19,7 @@ Dihedral Styles :h1
   dihedral_quadratic
   dihedral_spherical
   dihedral_table
+   dihedral_table_cut
   dihedral_zero
   dihedral_charmm
   dihedral_class2
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@ -15,7 +15,7 @@ dump_modify dump-ID keyword values ... :pre
 dump-ID = ID of dump to modify :ulb,l
 one or more keyword/value pairs may be appended :l
 these keywords apply to various dump styles :l
-keyword = {append} or {at} or {buffer} or {delay} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
+keyword = {append} or {at} or {buffer} or {delay} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {maxfiles} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
  {append} arg = {yes} or {no}
  {at} arg = N
    N = index of frame written upon first dump
@ -37,6 +37,8 @@ keyword = {append} or {at} or {buffer} or {delay} or {element} or {every} or {fi
  {image} arg = {yes} or {no}
  {label} arg = string
    string = character string (e.g. BONDS) to use in header of dump local file
+  {maxfiles} arg = Fmax
+    Fmax = keep only the most recent {Fmax} snapshots (one snapshot per file)
  {nfile} arg = Nf
    Nf = write this many files, one from each of Nf processors
  {pad} arg = Nchar = # of characters to convert timestep to
@ -364,6 +366,20 @@ e.g. BONDS or ANGLES.

 :line

+The {maxfiles} keyword can only be used when a '*' wildcard is
+included in the dump file name, i.e. when writing a new file(s) for
+each snapshot.  The specified {Fmax} is how many snapshots will be
+kept.  Once this number is reached, the file(s) containing the oldest
+snapshot is deleted before a new dump file is written.  If the
+specified {Fmax} <= 0, then all files are retained.
+
+This can be useful for debugging, especially if you don't know on what
+timestep something bad will happen, e.g. when LAMMPS will exit with an
+error.  You can dump every timestep, and limit the number of dump
+files produced, even if you run for 1000s of steps.
+
+:line
+
 The {nfile} or {fileper} keywords can be used in conjunction with the
 "%" wildcard character in the specified dump file name, for all dump
 styles except the {dcd}, {image}, {movie}, {xtc}, and {xyz} styles
@ -901,6 +917,7 @@ flush = yes
 format = %d and %g for each integer or floating point value
 image = no
 label = ENTRIES
+maxfiles = -1
 nfile = 1
 pad = 0
 pbc = no
--- a/doc/src/fix_adapt.txt
+++ b/doc/src/fix_adapt.txt
@ -205,6 +205,14 @@ a bond coefficient over time, very similar to how the {pair} keyword
 operates. The only difference is that now a bond coefficient for a
 given bond type is adapted.

+A wild-card asterisk can be used in place of or in conjunction with
+the bond type argument to set the coefficients for multiple bond types.
+This takes the form "*" or "*n" or "n*" or "m*n".  If N = the number of
+bond types, then an asterisk with no numeric values means all types
+from 1 to N.  A leading asterisk means all types from 1 to n (inclusive).
+A trailing asterisk means all types from n to N (inclusive).  A middle
+asterisk means all types from m to n (inclusive).
+
 Currently {bond} does not support bond_style hybrid nor bond_style
 hybrid/overlay as bond styles. The only bonds that currently are
 working with fix_adapt are
--- a/doc/src/fix_bond_react.txt
+++ b/doc/src/fix_bond_react.txt
@ -20,14 +20,15 @@ ID, group-ID are documented in "fix"_fix.html command. Group-ID is ignored. :ulb
 bond/react = style name of this fix command :l
 zero or more common keyword/value pairs may be appended directly after 'bond/react' :l
 these apply to all reaction specifications (below) :l
-common_keyword = {stabilization}
-  {stabilization} values = group-ID xmax
-    group-ID = user-assigned ID of an internally-created dynamic group that excludes reacting atoms, and can be used by a subsequent time integration fix such as nvt, npt, or nve (cannot be 'all')
-  {xmax} value = distance
-    distance = xmax value that is used by an internally created "nve/limit"_fix_nve_limit.html integrator
-react = mandatory argument indicating new reaction specification
-  react-ID = user-assigned name for the reaction
-  react-group-ID = only atoms in this group are available for the reaction
+common_keyword = {stabilization} :l
+  {stabilization} values = {no} or {yes} {group-ID} {xmax}
+    {no} = no reaction site stabilization
+    {yes} = perform reaction site stabilization
+      {group-ID} = user-assigned ID for all non-reacting atoms (group created internally)
+      {xmax} = xmax value that is used by an internally created "nve/limit"_fix_nve_limit.html integrator :pre
+react = mandatory argument indicating new reaction specification :l
+  react-ID = user-assigned name for the reaction :l
+  react-group-ID = only atoms in this group are available for the reaction :l
  Nevery = attempt reaction every this many steps :l
  Rmin = bonding pair atoms must be separated by more than Rmin to initiate reaction (distance units) :l
  Rmax = bonding pair atoms must be separated by less than Rmax to initiate reaction (distance units) :l
@ -47,7 +48,7 @@ react = mandatory argument indicating new reaction specification

 molecule mol1 pre_reacted_topology.txt
 molecule mol2 post_reacted_topology.txt
-fix 5 all bond/react stabilization no react myrxn1 all 1 0 3.25 mol1 mol2 map_file.txt
+fix 5 all bond/react stabilization no react myrxn1 all 1 0 3.25 mol1 mol2 map_file.txt :pre

 molecule mol1 pre_reacted_rxn1.txt
 molecule mol2 post_reacted_rxn1.txt
@ -56,12 +57,12 @@ molecule mol4 post_reacted_rxn2.txt
 fix 5 all bond/react stabilization yes nvt_grp .03 &
  react myrxn1 all 1 0 3.25 mol1 mol2 map_file_rxn1.txt prob 0.50 12345 &
  react myrxn2 all 1 0 2.75 mol3 mol4 map_file_rxn2.txt prob 0.25 12345
-fix 6 nvt_grp nvt temp 300 300 100 # system-wide thermostat must be defined after bond/react :pre
+fix 6 nvt_grp nvt temp 300 300 100 # set thermostat after bond/react :pre

 [Description:]

 Initiate complex covalent bonding (topology) changes. These topology
-changes will be referred to as "reactions" throughout this
+changes will be referred to as 'reactions' throughout this
 documentation. Topology changes are defined in pre- and post-reaction
 molecule templates and can include creation and deletion of bonds,
 angles, dihedrals, impropers, bond-types, angle-types, dihedral-types,
@ -81,10 +82,10 @@ occurred 3) build a molecule template of the reaction site after the
 reaction has occurred 4) create a map that relates the
 template-atom-IDs of each atom between pre- and post-reaction molecule
 templates 5) fill a simulation box with molecules and run a simulation
-with fix/bond react.
+with fix bond/react.

 Only one 'fix bond/react' command can be used at a time. Multiple
-reactions can be simultaneously applied by specifying multiple 'react'
+reactions can be simultaneously applied by specifying multiple {react}
 arguments to a single 'fix bond/react' command. This syntax is
 necessary because the 'common keywords' are applied to all reactions.

@ -99,10 +100,11 @@ typically be set to the maximum distance that non-reacting atoms move
 during the simulation.

 The group-ID set using the {stabilization} keyword should be a
-previously unused group-ID. The fix bond/react command creates a
-"dynamic group"_group.html of this name that excludes reacting atoms.
-This dynamic group-ID should then be used by a subsequent system-wide
-time integrator, as shown in the second example above. It is currently
+previously unused group-ID. It cannot be specified as 'all'. The fix
+bond/react command creates a "dynamic group"_group.html of this name
+that includes all non-reacting atoms. This dynamic group-ID should
+then be used by a subsequent system-wide time integrator such as nvt,
+npt, or nve, as shown in the second example above. It is currently
 necessary to place the time integration command after the fix
 bond/react command due to the internal dynamic grouping performed by
 fix bond/react.
@ -111,9 +113,9 @@ NOTE: The internally created group currently applies to all atoms in
 the system, i.e. you should generally not have a separate thermostat
 which acts on the 'all' group.

-The following comments pertain to each 'react' argument:
+The following comments pertain to each {react} argument:

-A check for possible new reaction sites is performed every Nevery
+A check for possible new reaction sites is performed every {Nevery}
 timesteps.

 Two conditions must be met for a reaction to occur. First a bonding
@ -124,20 +126,20 @@ modified to match the post-reaction template.

 A bonding atom pair will be identified if several conditions are met.
 First, a pair of atoms within the specified react-group-ID of type
-typei and typej must separated by a distance between Rmin and Rmax. It
-is possible that multiple bonding atom pairs are identified: if the
-bonding atoms in the pre-reacted template are not 1-2, 1-3, or 1-4
-neighbors, the closest bonding atom partner is set as its bonding
-partner; otherwise, the farthest potential partner is chosen. Then, if
-both an atomi and atomj have each other as their nearest bonding
-partners, these two atoms are identified as the bonding atom pair of
-the reaction site. Once this unique bonding atom pair is identified
-for each reaction, there could two or more reactions that involve a
-given atom on the same timestep. If this is the case, only one such
-reaction is permitted to occur. This reaction is chosen randomly from
-all potential reactions. This capability allows e.g. for different
-reaction pathways to proceed from identical reaction sites with
-user-specified probabilities.
+typei and typej must be separated by a distance between {Rmin} and
+{Rmax}. It is possible that multiple bonding atom pairs are
+identified: if the bonding atoms in the pre-reacted template are not
+1-2, 1-3, or 1-4 neighbors, the closest bonding atom partner is set as
+its bonding partner; otherwise, the farthest potential partner is
+chosen. Then, if both an atomi and atomj have each other as their
+nearest bonding partners, these two atoms are identified as the
+bonding atom pair of the reaction site. Once this unique bonding atom
+pair is identified for each reaction, there could be two or more
+reactions that involve a given atom on the same timestep. If this is
+the case, only one such reaction is permitted to occur. This reaction
+is chosen randomly from all potential reactions. This capability
+allows e.g. for different reaction pathways to proceed from identical
+reaction sites with user-specified probabilities.

 The pre-reacted molecule template is specified by a molecule command.
 This molecule template file contains a sample reaction site and its
@ -175,77 +177,43 @@ A discussion of correctly handling this is also provided on the

 The map file is a text document with the following format:

-Format of the map file
+A map file has a header and a body. The header of the map file
+contains one mandatory keyword and one optional keyword. The mandatory
+keyword is 'equivalences' and the optional keyword is 'edgeIDs':

-A map file has a header and a body. The header appears first. The
-first line of the header is always skipped; it typically contains a
-description of the file.  Lines can have a trailing comment starting
-with '#' that is ignored. If the line is blank (only whitespace after
-comment is deleted), it is skipped. If the line contains a header
-keyword, the corresponding value(s) is read from the line. If it
-doesn't contain a header keyword, the line begins the body of the
-file.
+N {equivalences} = # of atoms N in the reaction molecule templates
+N {edgeIDs} = # of edge atoms N in the pre-reacted molecule template :pre

-The header contains one mandatory keyword and one optional keyword.
-The mandatory keyword is 'equivalences' and the optional keyword is
-'edgeIDs.' These specify the number of atoms in the pre- and
-post-reacted templates and the number of edge atoms in pre-reacted
-template, respectively.
-
-The body contains two mandatory sections and one optional section. The
-first section begins with the keyword 'BondingIDs' and lists the atom
-IDs of the bonding atom pair in the pre-reacted molecule template. The
-second mandatory section begins with the keyword 'Equivalences' and
-lists a one-to-one correspondence between atom IDs of the pre- and
-post-reacted templates. The optional section begins with the keyword
-'EdgeIDs' and list the atom IDs of edge atoms in the pre-reacted
+The body of the map file contains two mandatory sections and one
+optional section. The first mandatory section begins with the keyword
+'BondingIDs' and lists the atom IDs of the bonding atom pair in the
+pre-reacted molecule template. The second mandatory section begins
+with the keyword 'Equivalences' and lists a one-to-one correspondence
+between atom IDs of the pre- and post-reacted templates. The first
+column is an atom ID of the pre-reacted molecule template, and the
+second column is the corresponding atom ID of the post-reacted
+molecule template. The optional section begins with the keyword
+'EdgeIDs' and lists the atom IDs of edge atoms in the pre-reacted
 molecule template.

-Format of the header of the map file
-
-These are the recognized header keywords. Header lines can come in any
-order. The value(s) are read from the beginning of the line. Thus the
-keyword 'equivalences' should be in a line like "25 equivalences."
-
-equivalences = # of atoms in the pre- and post-reacted molecule
-templates edgeIDs = # of edge atoms in the pre-reacted molecule template :pre
-
-The edgeIDs keyword is optional.
-
-Format of the body of the map file
-
-These are the section keywords for the body of the file.
-
-BondingIDs, EdgeIDs = list of atom IDs of bonding and edge atoms in
-the pre-reacted molecule template
-
-Equivalences = a two column list where the first column is an atom ID
-of the pre-reacted molecule template, and the second column is the
-corresponding atom ID of the post-reacted molecule template
-
-The bondingIDs section will always contain two atom IDs, corresponding
-to the bonding atom pairs of the pre-reacted map file. The
-Equivalences section will contain as many rows as there are atoms in
-the pre- and post-reacted molecule templates. The edgeIDs section is
-optional, but would contain an atom ID for each edge atom in the
-pre-reacted molecule template.
-
 A sample map file is given below:

 :line

-# This is a map file :pre
+# this is a map file :pre

 2 edgeIDs
 7 equivalences :pre

 BondingIDs :pre

-3 5 :pre
+3
+5 :pre

 EdgeIDs :pre

-1 7 :pre
+1
+7 :pre

 Equivalences :pre

@ -264,13 +232,13 @@ within LAMMPS that store bond topology are updated to reflect the
 post-reacted molecule template. All force fields with fixed bonds,
 angles, dihedrals or impropers are supported.

-A few capabilities to note: 1) You may specify as many 'react'
+A few capabilities to note: 1) You may specify as many {react}
 arguments as desired. For example, you could break down a complicated
 reaction mechanism into several reaction steps, each defined by its
-own 'react' argument. 2) While typically a bond is formed or removed
+own {react} argument. 2) While typically a bond is formed or removed
 between the bonding atom pairs specified in the pre-reacted molecule
 template, this is not required. 3) By reversing the order of the pre-
-and post- reacted molecule templates in another 'react' argument, you
+and post- reacted molecule templates in another {react} argument, you
 can allow for the possibility of one or more reverse reactions.

 The optional keywords deal with the probability of a given reaction
@ -304,7 +272,7 @@ you can use the internally-created dynamic group named
 would thermostat the group of all atoms currently involved in a
 reaction:

-fix 1 bond_react_MASTER_group temp/rescale 1 300 300 10 1
+fix 1 bond_react_MASTER_group temp/rescale 1 300 300 10 1 :pre

 NOTE: This command must be added after the fix bond/react command, and
 will apply to all reactions.
@ -324,10 +292,11 @@ local command.
 [Restart, fix_modify, output, run start/stop, minimize info:]

 No information about this fix is written to "binary restart
-files"_restart.html.  None of the "fix_modify"_fix_modify.html options
-are relevant to this fix.
+files"_restart.html, aside from internally-created per-atom
+properties. None of the "fix_modify"_fix_modify.html options are
+relevant to this fix.

-This fix computes one statistic for each 'react' argument that it
+This fix computes one statistic for each {react} argument that it
 stores in a global vector, of length 'number of react arguments', that
 can be accessed by various "output
 commands"_Section_howto.html#howto_15. The vector values calculated by
@ -359,5 +328,5 @@ The option defaults are stabilization = no, stabilize_steps = 60

 :line

-:link(Gissinger) [(Gissinger)] Gissinger, Jensen and Wise, Polymer,
-128, 211 (2017).
+:link(Gissinger)
+[(Gissinger)] Gissinger, Jensen and Wise, Polymer, 128, 211 (2017).
--- a/doc/src/fix_dt_reset.txt
+++ b/doc/src/fix_dt_reset.txt
@ -19,7 +19,9 @@ Tmin = minimum dt allowed which can be NULL (time units)
 Tmax = maximum dt allowed which can be NULL (time units)
 Xmax = maximum distance for an atom to move in one timestep (distance units)
 zero or more keyword/value pairs may be appended
-keyword = {units} :ul
+keyword = {emax} or {units} :ul
+  {emax} value = Emax
+    Emax = maximum kinetic energy change for an atom in one timestep (energy units)
  {units} value = {lattice} or {box}
    lattice = Xmax is defined in lattice units
    box = Xmax is defined in simulation box units :pre
@ -27,12 +29,17 @@ keyword = {units} :ul
 [Examples:]

 fix 5 all dt/reset 10 1.0e-5 0.01 0.1
-fix 5 all dt/reset 10 0.01 2.0 0.2 units box :pre
+fix 5 all dt/reset 10 0.01 2.0 0.2 units box
+fix 5 all dt/reset 5 NULL 0.001 0.5 emax 30 units box :pre

 [Description:]

 Reset the timestep size every N steps during a run, so that no atom
-moves further than Xmax, based on current atom velocities and forces.
+moves further than the specified {Xmax} distance, based on current
+atom velocities and forces.  Optionally an additional criterion is
+imposed by the {emax} keyword, so that no atom's kinetic energy
+changes by more than the specified {Emax}.
+
 This can be useful when starting from a configuration with overlapping
 atoms, where forces will be large.  Or it can be useful when running
 an impact simulation where one or more high-energy atoms collide with
@ -48,7 +55,12 @@ current velocity and force.  Since performing this calculation exactly
 would require the solution to a quartic equation, a cheaper estimate
 is generated.  The estimate is conservative in that the atom's
 displacement is guaranteed not to exceed {Xmax}, though it may be
-smaller.
+smaller. 
+
+In addition if the {emax} keyword is used, the specified {Emax} value
+is enforced as a limit on how much an atom's kinetic energy can
+change.  If the timestep required is even smaller than for the {Xmax}
+displacement, then the smaller timestep is used.

 Given this putative timestep for each atom, the minimum timestep value
 across all atoms is computed.  Then the {Tmin} and {Tmax} bounds are
@ -87,4 +99,5 @@ minimization"_minimize.html.

 [Default:]

-The option defaults is units = lattice.
+The option defaults are units = lattice, and no emax kinetic energy
+limit.
--- a/doc/src/lammps.book
+++ b/doc/src/lammps.book
@ -582,6 +582,7 @@ dihedral_opls.html
 dihedral_quadratic.html
 dihedral_spherical.html
 dihedral_table.html
+dihedral_table_cut.html
 dihedral_zero.html

 lammps_commands_improper.html
--- a/doc/src/molecule.txt
+++ b/doc/src/molecule.txt
@ -98,19 +98,20 @@ molecule (header keyword = inertia).
 NOTE: The molecule command can be used to define molecules with bonds,
 angles, dihedrals, imporopers, or special bond lists of neighbors
 within a molecular topology, so that you can later add the molecules
-to your simulation, via one or more of the commands listed above.  If
-such molecules do not already exist when LAMMPS creates the simulation
-box, via the "create_box"_create_box.html or
-"read_data"_read_data.html command, when you later add them you may
-overflow the pre-allocated data structures which store molecular
-topology information with each atom, and an error will be generated.
-Both the "create_box"_create_box.html command and the data files read
-by the "read_data"_read_data.html command have "extra" options which
+to your simulation, via one or more of the commands listed above.
+Since this topology-related information is stored in data structures
+that are pre-allocated when LAMMPS creates the simulation box (e.g.
+when using the "create_box"_create_box.html command or the
+"read_data"_read_data.html command), suitable space has to be reserved
+at that point so you do not overflow those data structures when adding
+molecules later.  Both the "create_box"_create_box.html command and
+the "read_data"_read_data.html command have "extra" options which
 insure space is allocated for storing topology info for molecules that
 are added later.

-The format of an individual molecule file is similar to the data file
-read by the "read_data"_read_data.html commands, and is as follows.
+The format of an individual molecule file is similar (but not
+identical) to the data file read by the "read_data"_read_data.html
+command, and is as follows.

 A molecule file has a header and a body.  The header appears first.
 The first line of the header is always skipped; it typically contains
@ -455,7 +456,11 @@ of SHAKE clusters.

 :line

-[Restrictions:] none
+[Restrictions:]
+
+This command must come after the simulation box is defined by a
+"read_data"_read_data.html, "read_restart"_read_restart.html, or
+"create_box"_create_box.html command.

 [Related commands:]

--- a/doc/src/pair_born.txt
+++ b/doc/src/pair_born.txt
@ -12,12 +12,14 @@ pair_style born/omp command :h3
 pair_style born/gpu command :h3
 pair_style born/coul/long command :h3
 pair_style born/coul/long/cs command :h3
+pair_style born/coul/long/cs/gpu command :h3
 pair_style born/coul/long/gpu command :h3
 pair_style born/coul/long/omp command :h3
 pair_style born/coul/msm command :h3
 pair_style born/coul/msm/omp command :h3
 pair_style born/coul/wolf command :h3
 pair_style born/coul/wolf/cs command :h3
+pair_style born/coul/wolf/cs/gpu command :h3
 pair_style born/coul/wolf/gpu command :h3
 pair_style born/coul/wolf/omp command :h3
 pair_style born/coul/dsf command :h3
--- a/doc/src/pair_coul.txt
+++ b/doc/src/pair_coul.txt
@ -20,6 +20,7 @@ pair_style coul/dsf/kk command :h3
 pair_style coul/dsf/omp command :h3
 pair_style coul/long command :h3
 pair_style coul/long/cs command :h3
+pair_style coul/long/cs/gpu command :h3
 pair_style coul/long/omp command :h3
 pair_style coul/long/gpu command :h3
 pair_style coul/long/kk command :h3
--- a/doc/src/pair_gw.txt
+++ b/doc/src/pair_gw.txt
@ -95,9 +95,9 @@ This pair style can only be used via the {pair} keyword of the

 [Restrictions:]

-This pair style is part of the USER-MISC package. It is only enabled
-if LAMMPS was built with that package.  See
-the "Making LAMMPS"_Section_start.html#start_3 section for more info.
+This pair style is part of the MANYBODY package. It is only enabled if
+LAMMPS was built with that package.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.

 This pair style requires the "newton"_newton.html setting to be "on"
 for pair interactions.
@ -117,4 +117,5 @@ appropriate units if your simulation doesn't use "metal" units.
 :line

 :link(Gao)
-[(Gao)] Gao and Weber, Nuclear Instruments and Methods in Physics Research B 191 (2012) 504.
+[(Gao)] Gao and Weber, Nuclear Instruments and Methods in Physics
+Research B 191 (2012) 504.
--- a/doc/src/pair_reaxc.txt
+++ b/doc/src/pair_reaxc.txt
@ -47,13 +47,14 @@ the "(Aktulga)"_#Aktulga paper. The {reax/c} style was initially
 implemented as a stand-alone C code and is now integrated into LAMMPS
 as a package.

-The {reax/c/kk} style is a Kokkos version of the ReaxFF potential that is
-derived from the {reax/c} style. The Kokkos version can run on GPUs and
-can also use OpenMP multithreading. For more information about the Kokkos package,
-see "Section 4"_Section_packages.html#kokkos and "Section 5.3.3"_accelerate_kokkos.html.
-One important consideration when using the {reax/c/kk} style is the choice of either
-half or full neighbor lists. This setting can be changed using the Kokkos "package"_package.html
-command.
+The {reax/c/kk} style is a Kokkos version of the ReaxFF potential that
+is derived from the {reax/c} style. The Kokkos version can run on GPUs
+and can also use OpenMP multithreading. For more information about the
+Kokkos package, see "Section 4"_Section_packages.html#kokkos and
+"Section 5.3.3"_accelerate_kokkos.html.  One important consideration
+when using the {reax/c/kk} style is the choice of either half or full
+neighbor lists. This setting can be changed using the Kokkos
+"package"_package.html command.

 The {reax/c} style differs from the "pair_style reax"_pair_reax.html
 command in the lo-level implementation details.  The {reax} style is a
@ -80,9 +81,8 @@ parameterizations for different classes of materials.  You can submit
 a contact request at the Materials Computation Center (MCC) website
 "https://www.mri.psu.edu/materials-computation-center/connect-mcc"_https://www.mri.psu.edu/materials-computation-center/connect-mcc,
 describing the material(s) you are interested in modeling with ReaxFF.
-They can tell
-you what is currently available or what it would take to create a
-suitable ReaxFF parameterization.
+They can tell you what is currently available or what it would take to
+create a suitable ReaxFF parameterization.

 The {cfile} setting can be specified as NULL, in which case default
 settings are used. A control file can be specified which defines
@ -120,28 +120,31 @@ assign to each atom will be used for computing the electrostatic
 interactions in the system.
 See the "fix qeq/reax"_fix_qeq_reax.html command for details.

-Using the optional keyword {lgvdw} with the value {yes} turns on
-the low-gradient correction of the ReaxFF/C for long-range
-London Dispersion, as described in the "(Liu)"_#Liu_2011 paper. Force field
+Using the optional keyword {lgvdw} with the value {yes} turns on the
+low-gradient correction of the ReaxFF/C for long-range London
+Dispersion, as described in the "(Liu)"_#Liu_2011 paper. Force field
 file {ffield.reax.lg} is designed for this correction, and is trained
 for several energetic materials (see "Liu"). When using lg-correction,
 recommended value for parameter {thb} is 0.01, which can be set in the
 control file.  Note: Force field files are different for the original
-or lg corrected pair styles, using wrong ffield file generates an error message.
+or lg corrected pair styles; using the wrong ffield file generates an
+error message.

 Using the optional keyword {enobonds} with the value {yes}, the energy
 of atoms with no bonds (i.e. isolated atoms) is included in the total
 potential energy and the per-atom energy of that atom.  If the value
-{no} is specified then the energy of atoms with no bonds is set to zero.
-The latter behavior is usual not desired, as it causes discontinuities
-in the potential energy when the bonding of an atom drops to zero.
+{no} is specified then the energy of atoms with no bonds is set to
+zero.  The latter behavior is usually not desired, as it causes
+discontinuities in the potential energy when the bonding of an atom
+drops to zero.

 Optional keywords {safezone} and {mincap} are used for allocating
-reax/c arrays.  Increasing these values can avoid memory problems, such
-as segmentation faults and bondchk failed errors, that could occur under
-certain conditions. These keywords aren't used by the Kokkos version, which
-instead uses a more robust memory allocation scheme that checks if the sizes of
-the arrays have been exceeded and automatically allocates more memory.
+reax/c arrays.  Increasing these values can avoid memory problems,
+such as segmentation faults and bondchk failed errors, that could
+occur under certain conditions. These keywords aren't used by the
+Kokkos version, which instead uses a more robust memory allocation
+scheme that checks if the sizes of the arrays have been exceeded and
+automatically allocates more memory.

 The thermo variable {evdwl} stores the sum of all the ReaxFF potential
 energy contributions, with the exception of the Coulombic and charge
@ -153,7 +156,8 @@ This pair style tallies a breakdown of the total ReaxFF potential
 energy into sub-categories, which can be accessed via the "compute
 pair"_compute_pair.html command as a vector of values of length 14.
 The 14 values correspond to the following sub-categories (the variable
-names in italics match those used in the original FORTRAN ReaxFF code):
+names in italics match those used in the original FORTRAN ReaxFF
+code):

 {eb} = bond energy
 {ea} = atom energy
@ -340,8 +344,8 @@ reax"_pair_reax.html

 [Default:]

-The keyword defaults are checkqeq = yes, enobonds = yes, lgvdw = no, safezone = 1.2,
-mincap = 50.
+The keyword defaults are checkqeq = yes, enobonds = yes, lgvdw = no,
+safezone = 1.2, mincap = 50.

 :line

--- a/doc/src/pair_sw.txt
+++ b/doc/src/pair_sw.txt
@ -192,8 +192,8 @@ This pair style can only be used via the {pair} keyword of the
 [Restrictions:]

 This pair style is part of the MANYBODY package.  It is only enabled
-if LAMMPS was built with that package.  See
-the "Making LAMMPS"_Section_start.html#start_3 section for more info.
+if LAMMPS was built with that package.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.

 This pair style requires the "newton"_newton.html setting to be "on"
 for pair interactions.
--- a/doc/src/variable.txt
+++ b/doc/src/variable.txt
@ -296,17 +296,24 @@ list of runs (e.g. 1000) without having to list N strings in the input
 script.

 For the {string} style, a single string is assigned to the variable.
-The only difference between this and using the {index} style with a
-single string is that a variable with {string} style can be redefined.
-E.g. by another command later in the input script, or if the script is
-read again in a loop.
+Two differences between this and using the {index} style exist:
+a variable with {string} style can be redefined, e.g. by another command later
+in the input script, or if the script is read again in a loop. The other
+difference is that {string} performs variable substitution even if the
+string parameter is quoted.

 For the {format} style, an equal-style variable is specified along
 with a C-style format string, e.g. "%f" or "%.10g", which must be
 appropriate for formatting a double-precision floating-point value.
-This allows an equal-style variable to be formatted specifically for
-output as a string, e.g. by the "print"_print.html command, if the
-default format "%.15g" has too much precision.
+The default format is "%.15g".  This variable style allows an
+equal-style variable to be formatted precisely when it is evaluated.
+
+If you simply wish to print a variable value with desired precision to
+the screen or logfile via the "print"_print.html or "fix
+print"_fix_print.html commands, you can also do this by specifying an
+"immediate" variable with a trailing colon and format string, as part
+of the string argument of those commands.  This is explained in
+"Section 3.2"_Section_commands.html#cmd_2.

 For the {getenv} style, a single string is assigned to the variable
 which should be the name of an environment variable.  When the
--- a/examples/COUPLE/lammps_quest/lmpqst.cpp
+++ b/examples/COUPLE/lammps_quest/lmpqst.cpp
@ -6,10 +6,10 @@
 //         in.lammps = LAMMPS input script
 //         in.quest = Quest input script

-#include "mpi.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <mpi.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include "stdint.h"

 #include "many2one.h"
--- a/examples/COUPLE/lammps_spparks/lmpspk.cpp
+++ b/examples/COUPLE/lammps_spparks/lmpspk.cpp
@ -7,10 +7,10 @@
 //         Sfactor = multiplier on strain effect
 //         in.spparks = SPPARKS input script

-#include "mpi.h"
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <mpi.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>

 #include "lammps_data_write.h"
 #include "many2many.h"
--- a/examples/COUPLE/library/error.cpp
+++ b/examples/COUPLE/library/error.cpp
@ -1,6 +1,6 @@
 #include <mpi.h>
-#include <stdio.h>
-#include <stdlib.h>
+#include <cstdio>
+#include <cstdlib>
 #include "error.h"

 /* ---------------------------------------------------------------------- */
--- a/examples/COUPLE/library/files.cpp
+++ b/examples/COUPLE/library/files.cpp
@ -1,5 +1,5 @@
-#include <stdio.h>
-#include <string.h>
+#include <cstdio>
+#include <cstring>
 #include "files.h"

 #define MAXLINE 256
--- a/examples/COUPLE/library/irregular.cpp
+++ b/examples/COUPLE/library/irregular.cpp
@ -1,6 +1,6 @@
-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
 #include "irregular.h"
 #include "memory.h"
 #include "error.h"
--- a/examples/COUPLE/library/lammps_data_write.cpp
+++ b/examples/COUPLE/library/lammps_data_write.cpp
@ -1,6 +1,6 @@
 #include <mpi.h>
-#include <stdlib.h>
-#include <string.h>
+#include <cstdlib>
+#include <cstring>
 #include "lammps_data_write.h"
 #include "memory.h"
 #include "error.h"
--- a/examples/COUPLE/library/lammps_data_write.h
+++ b/examples/COUPLE/library/lammps_data_write.h
@ -1,7 +1,7 @@
 #ifndef LAMMPS_DATA_WRITE_H
 #define LAMMPS_DATA_WRITE_H

-#include <stdio.h>
+#include <cstdio>
 #include "send2one.h"

 class LAMMPSDataWrite : public Send2One {
--- a/examples/COUPLE/library/many2many.cpp
+++ b/examples/COUPLE/library/many2many.cpp
@ -1,6 +1,6 @@
 #include <mpi.h>
-#include <stdlib.h>
-#include <stdio.h>
+#include <cstdlib>
+#include <cstdio>
 #include "many2many.h"
 #include "irregular.h"
 #include "memory.h"
--- a/examples/COUPLE/library/many2one.cpp
+++ b/examples/COUPLE/library/many2one.cpp
@ -1,6 +1,6 @@
-#include "mpi.h"
-#include "stdio.h"
-#include "stdlib.h"
+#include <mpi.h>
+#include <cstdio>
+#include <cstdlib>
 #include "many2one.h"
 #include "memory.h"

--- a/examples/COUPLE/library/memory.cpp
+++ b/examples/COUPLE/library/memory.cpp
@ -1,6 +1,6 @@
 #include <mpi.h>
-#include <stdlib.h>
-#include <stdio.h>
+#include <cstdlib>
+#include <cstdio>
 #include "memory.h"
 #include "error.h"

--- a/examples/COUPLE/library/one2many.cpp
+++ b/examples/COUPLE/library/one2many.cpp
@ -1,5 +1,5 @@
 #include <mpi.h>
-#include <stdlib.h>
+#include <cstdlib>
 #include "one2many.h"
 #include "memory.h"

--- a/examples/COUPLE/library/send2one.cpp
+++ b/examples/COUPLE/library/send2one.cpp
@ -1,6 +1,6 @@
-#include "mpi.h"
-#include "stdlib.h"
-#include "stdio.h"
+#include <mpi.h>
+#include <cstdlib>
+#include <cstdio>
 #include "send2one.h"
 #include "memory.h"
 #include "error.h"
--- a/examples/COUPLE/multiple/multiple.cpp
+++ b/examples/COUPLE/multiple/multiple.cpp
@ -23,10 +23,10 @@
 //         Tdelta = incremental temperature for each of N runs
 // See README for compilation instructions

-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
-#include "mpi.h"
+#include <mpi.h>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>

 #include "lammps.h"         // these are LAMMPS include files
 #include "input.h"
--- a/examples/COUPLE/simple/simple.cpp
+++ b/examples/COUPLE/simple/simple.cpp
@ -19,15 +19,16 @@
 //         in.lammps = LAMMPS input script
 // See README for compilation instructions

-#include "stdio.h"
-#include "stdlib.h"
-#include "string.h"
-#include "mpi.h"
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <mpi.h>

-#include "lammps.h"         // these are LAMMPS include files
-#include "input.h"
-#include "atom.h"
-#include "library.h"
+// these are LAMMPS include files
+#include <lammps/lammps.h>
+#include <lammps/input.h>
+#include <lammps/atom.h>
+#include <lammps/library.h>

 using namespace LAMMPS_NS;

--- a/examples/latte/data.graphene.boxrel
+++ b/examples/latte/data.graphene.boxrel
@ -0,0 +1,49 @@
+ LAMMPS Description
+ 
+          32 atoms
+ 
+           1 atom types
+ 
+   0.0000000000000000        10.000000000000000      xlo xhi
+   0.0000000000000000        8.0000000000000000      ylo yhi
+   0.0000000000000000        20.000000000000000      zlo zhi
+   4.8985871965894128E-016   1.2246467991473533E-015   1.2246467991473533E-015 xy xz yz
+ 
+ Masses
+ 
+              1   12.010000000000000     
+ 
+ Atoms
+ 
+    1    1    1   0.0   4.93100   4.25000   0.00500
+    2    1    1   0.0   8.62100   2.12100   0.14000
+    3    1    1   0.0   3.70700   2.12600   0.14700
+    4    1    1   0.0   7.38200   4.25400   0.07800
+    5    1    1   0.0   2.47900   4.25400   0.08000
+    6    1    1   0.0   6.15800   6.37400  -0.01000
+    7    1    1   0.0   1.23700   6.38300   0.06600
+    8    1    1   0.0   1.24000   2.12100   0.14600
+    9    1    1   0.0   6.15500   2.12600   0.12900
+   10    1    1   0.0   0.00700   4.25200   0.12200
+   11    1    1   0.0   8.62100   6.38500   0.04100
+   12    1    1   0.0   3.70000   6.37400  -0.01000
+   13    1    1   0.0   0.00600   1.41600   0.13000
+   14    1    1   0.0   4.93000   1.40800   0.14700
+   15    1    1   0.0   8.61800   3.54600   0.11500
+   16    1    1   0.0   3.70800   3.55300   0.08400
+   17    1    1   0.0   7.39400   5.68000   0.03500
+   18    1    1   0.0   2.46500   5.68000   0.03500
+   19    1    1   0.0   6.16000   7.80500   0.02700
+   20    1    1   0.0   1.23800   7.81100   0.06000
+   21    1    1   0.0   2.47300   1.41800   0.16100
+   22    1    1   0.0   7.38900   1.41700   0.14800
+   23    1    1   0.0   1.24200   3.54700   0.12600
+   24    1    1   0.0   6.15300   3.55300   0.07400
+   25    1    1   0.0   0.00700   5.67800   0.09700
+   26    1    1   0.0   4.93100   5.66800  -0.03100
+   27    1    1   0.0   8.62000   7.81300   0.03900
+   28    1    1   0.0   3.70100   7.80200   0.03700
+   29    1    1   0.0   0.00700  -0.01000   0.08900
+   30    1    1   0.0   4.93100  -0.01500   0.16100
+   31    1    1   0.0   2.47300  -0.01200   0.14400
+   32    1    1   0.0   7.38900  -0.01300   0.14800
--- a/examples/latte/in.graphene.boxrel
+++ b/examples/latte/in.graphene.boxrel
@ -0,0 +1,44 @@
+# Simple graphene model with LATTE
+
+units		metal
+atom_style	full
+atom_modify     sort 0 0.0    # turn off sorting of the coordinates
+
+read_data       data.graphene.boxrel
+
+# replicate system if requested
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable        nrep equal v_x*v_y*v_z
+if              "${nrep} > 1" then "replicate $x $y $z"
+
+# initialize system
+
+velocity	all create 0.0 87287 loop geom
+
+pair_style      zero 1.0
+pair_coeff	* *  
+
+neighbor	1.0 bin
+neigh_modify    every 1 delay 0 check yes 
+
+timestep        0.00025
+
+fix 1 all box/relax iso 0.0 vmax 0.001
+
+fix             2 all latte NULL
+fix_modify      2 energy yes
+
+thermo_style    custom   etotal
+
+# minimization
+
+thermo          1
+fix 3 all print 1 "Total Energy ="
+min_style cg
+min_modify dmax 0.1
+min_modify line quadratic
+minimize        1.0e-4 1.0e-4 10000 10000
--- a/examples/latte/in.latte.water.min
+++ b/examples/latte/in.latte.water.min
@ -37,5 +37,6 @@ thermo_style    custom step temp pe etotal press
 # minimization

 thermo          10
-min_style fire
-minimize        1.0e-9 1.0e-9 500 500
+
+min_style       fire
+minimize        1.0e-4 1.0e-4 500 500
--- a/examples/latte/latte.in
+++ b/examples/latte/latte.in
@ -11,7 +11,6 @@ LATTE INPUT FILE
 CONTROL{
  xControl= 1
  BASISTYPE= NONORTHO
-  COORDSFILE= "./coords.dat"
  PARAMPATH= "./TBparam"
  KBT= 0.0
  ENTROPYKIND= 1
@ -32,9 +31,3 @@ CONTROL{
  XBODISORDER= 5
  KON= 0
 }
-
-#Controls for QMD (if using lammps MAXITER must be set to -1)
-MDCONTROL{
-  MAXITER= -1
-}
-
--- a/examples/latte/log.19Sep17.latte.sucrose.g++.1
+++ b/examples/latte/log.19Sep17.latte.sucrose.g++.1
@ -1,406 +0,0 @@
- The log file for latte_lib
- 
- CONTROL{                                          }                                                 
-  
- WARNING: variable JobName= is missing. I will use a default value instead ...
- WARNING: variable PARAMPATH= is missing. I will use a default value instead ...
- WARNING: variable DEBUGON= is missing. I will use a default value instead ...
- WARNING: variable FERMIM= is missing. I will use a default value instead ...
- WARNING: variable CGORLIB= is missing. I will use a default value instead ...
- WARNING: variable NORECS= is missing. I will use a default value instead ...
- WARNING: variable VDWON= is missing. I will use a default value instead ...
- WARNING: variable ORDERNMOL= is missing. I will use a default value instead ...
- WARNING: variable LCNON= is missing. I will use a default value instead ...
- WARNING: variable LCNITER= is missing. I will use a default value instead ...
- WARNING: variable MDON= is missing. I will use a default value instead ...
- WARNING: variable PBCON= is missing. I will use a default value instead ...
- WARNING: variable RESTART= is missing. I will use a default value instead ...
- WARNING: variable NGPU= is missing. I will use a default value instead ...
- WARNING: variable COMPFORCE= is missing. I will use a default value instead ...
- WARNING: variable DOSFIT= is missing. I will use a default value instead ...
- WARNING: variable INTS2FIT= is missing. I will use a default value instead ...
- WARNING: variable NFITSTEP= is missing. I will use a default value instead ...
- WARNING: variable QFIT= is missing. I will use a default value instead ...
- WARNING: variable PPFITON= is missing. I will use a default value instead ...
- WARNING: variable ALLFITON= is missing. I will use a default value instead ...
- WARNING: variable PPSTEP= is missing. I will use a default value instead ...
- WARNING: variable BISTEP= is missing. I will use a default value instead ...
- WARNING: variable PP2FIT= is missing. I will use a default value instead ...
- WARNING: variable BINT2FIT= is missing. I will use a default value instead ...
- WARNING: variable PPNMOL= is missing. I will use a default value instead ...
- WARNING: variable PPNGEOM= is missing. I will use a default value instead ...
- WARNING: variable PARREP= is missing. I will use a default value instead ...
- WARNING: variable VERBOSE= is missing. I will use a default value instead ...
- WARNING: variable MIXER= is missing. I will use a default value instead ...
- WARNING: variable RESTARTLIB= is missing. I will use a default value instead ...
- WARNING: variable CGTOL= is missing. I will use a default value instead ...
- WARNING: variable ELEC_ETOL= is missing. I will use a default value instead ...
- WARNING: variable COULACC= is missing. I will use a default value instead ...
- WARNING: variable COULCUT= is missing. I will use a default value instead ...
- WARNING: variable COULR1= is missing. I will use a default value instead ...
- WARNING: variable CHTOL= is missing. I will use a default value instead ...
- WARNING: variable BETA= is missing. I will use a default value instead ...
- WARNING: variable MCSIGMA= is missing. I will use a default value instead ...
- WARNING: variable PPBETA= is missing. I will use a default value instead ...
- WARNING: variable PPSIGMA= is missing. I will use a default value instead ...
- WARNING: variable ER= is missing. I will use a default value instead ...
- WARNING: variable INITIALIZED= is missing. I will use a default value instead ...
-  
-  
- ############### Parameters used for this run ################
-  CONTROL{                                          
-  xControl=           1
-  DEBUGON=           0
-  FERMIM=           6
-  CGORLIB=           1
-  NORECS=           1
-  ENTROPYKIND=           1
-  PPOTON=           1
-  VDWON=           0
-  SPINON=           0
-  ELECTRO=           1
-  ELECMETH=           0
-  MAXSCF=         450
-  MINSP2ITER=          22
-  FULLQCONV=           1
-  QITER=           3
-  ORDERNMOL=           0
-  SPARSEON=           1
-  THRESHOLDON=           1
-  FILLINSTOP=         100
-  BLKSZ=           4
-  MSPARSE=        1500
-  LCNON=           0
-  LCNITER=           4
-  RELAX=           0
-  MAXITER=      100000
-  MDON=           1
-  PBCON=           1
-  RESTART=           0
-  CHARGE=           0
-  XBO=           1
-  XBODISON=           1
-  XBODISORDER=           5
-  NGPU=           2
-  KON=           0
-  COMPFORCE=           1
-  DOSFIT=           0
-  INTS2FIT=           1
-  NFITSTEP=        5000
-  QFIT=           0
-  PPFITON=           0
-  ALLFITON=           0
-  PPSTEP=         500
-  BISTEP=         500
-  PP2FIT=           2
-  BINT2FIT=           6
-  PPNMOL=          10
-  PPNGEOM=         200
-  PARREP=           0
-  VERBOSE=           0
-  MIXER=           0
-  RESTARTLIB=           0
-  CGTOL=   9.9999999747524271E-007
-  KBT=   0.0000000000000000     
-  SPINTOL=   1.0000000000000000E-004
-  ELEC_ETOL=   1.0000000474974513E-003
-  ELEC_QTOL=   1.0000000000000000E-008
-  COULACC=   9.9999999747524271E-007
-  COULCUT=  -500.00000000000000     
-  COULR1=   500.00000000000000     
-  BREAKTOL=   9.9999999999999995E-007
-  QMIX=  0.25000000000000000     
-  SPINMIX=  0.25000000000000000     
-  MDMIX=  0.25000000000000000     
-  NUMTHRESH=   9.9999999999999995E-007
-  CHTOL=   9.9999997764825821E-003
-  SKIN=   1.0000000000000000     
-  RLXFTOL=   9.9999999999999995E-008
-  BETA=   1000.0000000000000     
-  MCSIGMA=  0.20000000298023224     
-  PPBETA=   1000.0000000000000     
-  PPSIGMA=   9.9999997764825821E-003
-  ER=   1.0000000000000000     
-  JobName=MyJob                                                                                               
-  BASISTYPE=NONORTHO                                                                                            
-  SP2CONV=REL                                                                                                 
-  RELAXTYPE=SD                                                                                                  
-  PARAMPATH=./TBparam                                                                                           
-  COORDSFILE=./coords.dat                                                                                        
-  INITIALIZED= F
-  }                                                 
-  
- ./TBparam/electrons.dat
- MDCONTROL{                                        }                                                 
-  
- WARNING: variable RNDIST= is missing. I will use a default value instead ...
- WARNING: variable SEEDINIT= is missing. I will use a default value instead ...
- WARNING: variable NPTTYPE= is missing. I will use a default value instead ...
- WARNING: variable UDNEIGH= is missing. I will use a default value instead ...
- WARNING: variable DUMPFREQ= is missing. I will use a default value instead ...
- WARNING: variable RSFREQ= is missing. I will use a default value instead ...
- WARNING: variable WRTFREQ= is missing. I will use a default value instead ...
- WARNING: variable TOINITTEMP5= is missing. I will use a default value instead ...
- WARNING: variable THERMPER= is missing. I will use a default value instead ...
- WARNING: variable THERMRUN= is missing. I will use a default value instead ...
- WARNING: variable NVTON= is missing. I will use a default value instead ...
- WARNING: variable NPTON= is missing. I will use a default value instead ...
- WARNING: variable AVEPER= is missing. I will use a default value instead ...
- WARNING: variable SEED= is missing. I will use a default value instead ...
- WARNING: variable SHOCKON= is missing. I will use a default value instead ...
- WARNING: variable SHOCKSTART= is missing. I will use a default value instead ...
- WARNING: variable SHOCKDIR= is missing. I will use a default value instead ...
- WARNING: variable MDADAPT= is missing. I will use a default value instead ...
- WARNING: variable GETHUG= is missing. I will use a default value instead ...
- WARNING: variable RSLEVEL= is missing. I will use a default value instead ...
- WARNING: variable DT= is missing. I will use a default value instead ...
- WARNING: variable TEMPERATURE= is missing. I will use a default value instead ...
- WARNING: variable FRICTION= is missing. I will use a default value instead ...
- WARNING: variable PTARGET= is missing. I will use a default value instead ...
- WARNING: variable UPARTICLE= is missing. I will use a default value instead ...
- WARNING: variable USHOCK= is missing. I will use a default value instead ...
- WARNING: variable C0= is missing. I will use a default value instead ...
- WARNING: variable E0= is missing. I will use a default value instead ...
- WARNING: variable V0= is missing. I will use a default value instead ...
- WARNING: variable P0= is missing. I will use a default value instead ...
- WARNING: variable DUMMY= is missing. I will use a default value instead ...
-  
-  
- ############### Parameters used for this run ################
-  MDCONTROL{                                        
-  MAXITER=          -1
-  UDNEIGH=           1
-  DUMPFREQ=         250
-  RSFREQ=         500
-  WRTFREQ=          25
-  TOINITTEMP5=           1
-  THERMPER=         500
-  THERMRUN=       50000
-  NVTON=           0
-  NPTON=           0
-  AVEPER=        1000
-  SEED=          54
-  SHOCKON=           0
-  SHOCKSTART=      100000
-  SHOCKDIR=           1
-  MDADAPT=           0
-  GETHUG=           0
-  RSLEVEL=           0
-  DT=  0.25000000000000000     
-  TEMPERATURE=   300.00000000000000     
-  FRICTION=   1000.0000000000000     
-  PTARGET=   0.0000000000000000     
-  UPARTICLE=   500.00000000000000     
-  USHOCK=  -4590.0000000000000     
-  C0=   1300.0000000000000     
-  E0=  -795.72497558593750     
-  V0=   896.98486328125000     
-  P0=   8.3149001002311707E-002
-  RNDIST=GAUSSIAN                                                                                            
-  SEEDINIT=UNIFORM                                                                                             
-  NPTTYPE=ISO                                                                                                 
-  DUMMY= F
-  }                                                 
-  
- LIBCALLS           0
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15165627147849        13.850829743067372        0.0000000000000000        3.9653384620309846     
- LIBCALLS           1
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15142147081917        13.850596160685321        0.0000000000000000        3.9653428217526296     
- LIBCALLS           2
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15072431717670        13.849902902335046        0.0000000000000000        3.9653556077235628     
- LIBCALLS           3
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14958682134301        13.848772166382796        0.0000000000000000        3.9653762812719782     
- LIBCALLS           4
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14804481054080        13.847240065975685        0.0000000000000000        3.9654039257311324     
- LIBCALLS           5
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14614669298459        13.845355347298943        0.0000000000000000        3.9654372593625880     
- LIBCALLS           6
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14395200541782        13.843177681164811        0.0000000000000000        3.9654747563744728     
- LIBCALLS           7
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14152950027858        13.840775605612510        0.0000000000000000        3.9655146828204026     
- LIBCALLS           8
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13895477239572        13.838224210058369        0.0000000000000000        3.9655551214573213     
- LIBCALLS           9
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13630808318862        13.835602658269416        0.0000000000000000        3.9655940696401335     
- LIBCALLS          10
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13367156672246        13.832991646694552        0.0000000000000000        3.9656294961085377     
- LIBCALLS          11
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13112695791978        13.830470890853416        0.0000000000000000        3.9656594331001127     
- LIBCALLS          12
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12875304084571        13.828116721514562        0.0000000000000000        3.9656820468287637     
- LIBCALLS          13
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12662314462005        13.825999860613845        0.0000000000000000        3.9656956633599689     
- LIBCALLS          14
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12480303363179        13.824183432931337        0.0000000000000000        3.9656988576578489     
- LIBCALLS          15
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12334906554690        13.822721254684298        0.0000000000000000        3.9656905013961525     
- LIBCALLS          16
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12230649281338        13.821656427050725        0.0000000000000000        3.9656697961568699     
- LIBCALLS          17
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12170820445976        13.821020251989051        0.0000000000000000        3.9656362957330207     
- LIBCALLS          18
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12157378544725        13.820831478957400        0.0000000000000000        3.9655899465557289     
- LIBCALLS          19
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12190902409918        13.821095885466233        0.0000000000000000        3.9655310732858191     
- LIBCALLS          20
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12270578464654        13.821806190548854        0.0000000000000000        3.9654603894825375     
- LIBCALLS          21
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12394226924755        13.822942298269552        0.0000000000000000        3.9653789701528157     
- LIBCALLS          22
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12558369933174        13.824471866833779        0.0000000000000000        3.9652882392864672     
- LIBCALLS          23
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12758334335854        13.826351196916939        0.0000000000000000        3.9651899208403507     
- LIBCALLS          24
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12988392857540        13.828526429544008        0.0000000000000000        3.9650859962581815     
- LIBCALLS          25
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13241933900565        13.830935038404082        0.0000000000000000        3.9649786471076300     
- LIBCALLS          26
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13511663668885        13.833507593821677        0.0000000000000000        3.9648702062183578     
- LIBCALLS          27
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13789821166085        13.836169765592846        0.0000000000000000        3.9647630647732250     
- LIBCALLS          28
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14068416314257        13.838844520440762        0.0000000000000000        3.9646596094056243     
- LIBCALLS          29
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14339478125902        13.841454456993119        0.0000000000000000        3.9645621614306648     
- LIBCALLS          30
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14595299166797        13.843924209084781        0.0000000000000000        3.9644728862209537     
- LIBCALLS          31
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14828672908391        13.846182838096166        0.0000000000000000        3.9643937231592781     
- LIBCALLS          32
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15033121417270        13.848166127650318        0.0000000000000000        3.9643263326484774     
- LIBCALLS          33
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15203097820654        13.849818691045462        0.0000000000000000        3.9642720350529470     
- LIBCALLS          34
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15334158494318        13.851095804201121        0.0000000000000000        3.9642317563508436     
- LIBCALLS          35
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15423101277941        13.851964884709183        0.0000000000000000        3.9642060118064197     
- LIBCALLS          36
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15468060067406        13.852406550643760        0.0000000000000000        3.9641948735126151     
- LIBCALLS          37
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15468556770435        13.852415210893483        0.0000000000000000        3.9641979705462513     
- LIBCALLS          38
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15425506702360        13.851999160128511        0.0000000000000000        3.9642145018322728     
- LIBCALLS          39
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15341177086162        13.851180175004831        0.0000000000000000        3.9642432622019754     
- LIBCALLS          40
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15219100341108        13.849992631968849        0.0000000000000000        3.9642826797086155     
- LIBCALLS          41
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15063948253476        13.848482189284203        0.0000000000000000        3.9643308764467280     
- LIBCALLS          42
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14881366363778        13.846704095034502        0.0000000000000000        3.9643857194231229     
- LIBCALLS          43
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14677783841711        13.844721197666447        0.0000000000000000        3.9644449063996254     
- LIBCALLS          44
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14460195130079        13.842601745208173        0.0000000000000000        3.9645060327113080     
- LIBCALLS          45
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14235930197236        13.840417063344470        0.0000000000000000        3.9645666751650537     
- LIBCALLS          46
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14012416839108        13.838239201362184        0.0000000000000000        3.9646244709241216     
- LIBCALLS          47
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13796944534135        13.836138629087953        0.0000000000000000        3.9646771958199687     
- LIBCALLS          48
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13596436459642        13.834182058508610        0.0000000000000000        3.9647228360374207     
- LIBCALLS          49
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13417236277201        13.832430452024822        0.0000000000000000        3.9647596471475066     
- LIBCALLS          50
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13264918465853        13.830937266579358        0.0000000000000000        3.9647862263274365     
- LIBCALLS          51
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13144121811348        13.829746970164395        0.0000000000000000        3.9648015300858930     
- LIBCALLS          52
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13058418584075        13.828893856279002        0.0000000000000000        3.9648049379175174     
- LIBCALLS          53
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13010212355317        13.828401171909800        0.0000000000000000        3.9647962482159476     
- LIBCALLS          54
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13000675986638        13.828280567696357        0.0000000000000000        3.9647757005033171     
- LIBCALLS          55
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13029725443062        13.828531873218640        0.0000000000000000        3.9647439679967813     
- LIBCALLS          56
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13096031859556        13.829143196581525        0.0000000000000000        3.9647021412055241     
- LIBCALLS          57
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13197071275096        13.830091344339912        0.0000000000000000        3.9646517009757813     
- LIBCALLS          58
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13329208290526        13.831342554670950        0.0000000000000000        3.9645944691057076     
- LIBCALLS          59
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13487817952188        13.832853532802908        0.0000000000000000        3.9645325717081379     
- LIBCALLS          60
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13667431785007        13.834572772174083        0.0000000000000000        3.9644683636269380     
- LIBCALLS          61
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13861917436014        13.836442137716100        0.0000000000000000        3.9644043716683206     
- LIBCALLS          62
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14064674344610        13.838398678492441        0.0000000000000000        3.9643432117931376     
- LIBCALLS          63
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14268847880851        13.840376626541268        0.0000000000000000        3.9642875107994442     
- LIBCALLS          64
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14467552446979        13.842309527587247        0.0000000000000000        3.9642398279114381     
- LIBCALLS          65
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14654097615647        13.844132438475109        0.0000000000000000        3.9642025589783412     
- LIBCALLS          66
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14822207995957        13.845784117078871        0.0000000000000000        3.9641778771678413     
- LIBCALLS          67
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14966231911774        13.847209123749478        0.0000000000000000        3.9641676470155103     
- LIBCALLS          68
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15081329445576        13.848359751049152        0.0000000000000000        3.9641733618391299     
- LIBCALLS          69
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15163634076458        13.849197700537186        0.0000000000000000        3.9641960937768981     
- LIBCALLS          70
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15210380659516        13.849695432596437        0.0000000000000000        3.9642364336978391     
- LIBCALLS          71
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15219997215792        13.849837127658775        0.0000000000000000        3.9642944914660605     
- LIBCALLS          72
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15192153900722        13.849619213627008        0.0000000000000000        3.9643698667021590     
- LIBCALLS          73
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15127769530471        13.849050434626310        0.0000000000000000        3.9644616585289247     
- LIBCALLS          74
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.15028974592457        13.848151458176057        0.0000000000000000        3.9645684873567908     
- LIBCALLS          75
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14899032381624        13.846954040343237        0.0000000000000000        3.9646885325372980     
- LIBCALLS          76
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14742221364327        13.845499789571511        0.0000000000000000        3.9648195821504211     
- LIBCALLS          77
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14563684020112        13.843838588134755        0.0000000000000000        3.9649591055666282     
- LIBCALLS          78
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14369246883172        13.842026744273829        0.0000000000000000        3.9651043223068876     
- LIBCALLS          79
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14165219754119        13.840124957235691        0.0000000000000000        3.9652522794782556     
- LIBCALLS          80
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13958181195608        13.838196181062383        0.0000000000000000        3.9653999492835532     
- LIBCALLS          81
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13754757713065        13.836303471774007        0.0000000000000000        3.9655443071963385     
- LIBCALLS          82
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13561405478509        13.834507896249461        0.0000000000000000        3.9656824354232736     
- LIBCALLS          83
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13384198639028        13.832866571528193        0.0000000000000000        3.9658115908515681     
- LIBCALLS          84
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13228634940748        13.831430891696755        0.0000000000000000        3.9659292903699495     
- LIBCALLS          85
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13099461122306        13.830244986101496        0.0000000000000000        3.9660333724384569     
- LIBCALLS          86
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13000526350720        13.829344440260281        0.0000000000000000        3.9661220782532145     
- LIBCALLS          87
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12934661713206        13.828755299191645        0.0000000000000000        3.9661940662588862     
- LIBCALLS          88
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12903595764971        13.828493364127572        0.0000000000000000        3.9662484623936765     
- LIBCALLS          89
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12907904533250        13.828563786156602        0.0000000000000000        3.9662848954537067     
- LIBCALLS          90
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.12946994320248        13.828960955791626        0.0000000000000000        3.9663034756730777     
- LIBCALLS          91
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13019123489619        13.829668684955367        0.0000000000000000        3.9663048073711558     
- LIBCALLS          92
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13121457766835        13.830660675785223        0.0000000000000000        3.9662899643566578     
- LIBCALLS          93
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13250159637499        13.831901269302985        0.0000000000000000        3.9662604605307470     
- LIBCALLS          94
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13400508153813        13.833346464674193        0.0000000000000000        3.9662181906403653     
- LIBCALLS          95
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13567049003717        13.834945196074795        0.0000000000000000        3.9661653991148187     
- LIBCALLS          96
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13743766487022        13.836640848231452        0.0000000000000000        3.9661045863001441     
- LIBCALLS          97
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.13924277096038        13.838372983906890        0.0000000000000000        3.9660384593805307     
- LIBCALLS          98
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14102036682124        13.840079246589914        0.0000000000000000        3.9659698320311318     
- LIBCALLS          99
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14270555407057        13.841697390518378        0.0000000000000000        3.9659015537535014     
- LIBCALLS         100
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -261.14423615166146        13.843167378892108        0.0000000000000000        3.9658364191978137     
--- a/examples/latte/log.19Sep17.latte.water.g++.1
+++ b/examples/latte/log.19Sep17.latte.water.g++.1
@ -1,406 +0,0 @@
- The log file for latte_lib
- 
- CONTROL{                                          }                                                 
-  
- WARNING: variable JobName= is missing. I will use a default value instead ...
- WARNING: variable PARAMPATH= is missing. I will use a default value instead ...
- WARNING: variable DEBUGON= is missing. I will use a default value instead ...
- WARNING: variable FERMIM= is missing. I will use a default value instead ...
- WARNING: variable CGORLIB= is missing. I will use a default value instead ...
- WARNING: variable NORECS= is missing. I will use a default value instead ...
- WARNING: variable VDWON= is missing. I will use a default value instead ...
- WARNING: variable ORDERNMOL= is missing. I will use a default value instead ...
- WARNING: variable LCNON= is missing. I will use a default value instead ...
- WARNING: variable LCNITER= is missing. I will use a default value instead ...
- WARNING: variable MDON= is missing. I will use a default value instead ...
- WARNING: variable PBCON= is missing. I will use a default value instead ...
- WARNING: variable RESTART= is missing. I will use a default value instead ...
- WARNING: variable NGPU= is missing. I will use a default value instead ...
- WARNING: variable COMPFORCE= is missing. I will use a default value instead ...
- WARNING: variable DOSFIT= is missing. I will use a default value instead ...
- WARNING: variable INTS2FIT= is missing. I will use a default value instead ...
- WARNING: variable NFITSTEP= is missing. I will use a default value instead ...
- WARNING: variable QFIT= is missing. I will use a default value instead ...
- WARNING: variable PPFITON= is missing. I will use a default value instead ...
- WARNING: variable ALLFITON= is missing. I will use a default value instead ...
- WARNING: variable PPSTEP= is missing. I will use a default value instead ...
- WARNING: variable BISTEP= is missing. I will use a default value instead ...
- WARNING: variable PP2FIT= is missing. I will use a default value instead ...
- WARNING: variable BINT2FIT= is missing. I will use a default value instead ...
- WARNING: variable PPNMOL= is missing. I will use a default value instead ...
- WARNING: variable PPNGEOM= is missing. I will use a default value instead ...
- WARNING: variable PARREP= is missing. I will use a default value instead ...
- WARNING: variable VERBOSE= is missing. I will use a default value instead ...
- WARNING: variable MIXER= is missing. I will use a default value instead ...
- WARNING: variable RESTARTLIB= is missing. I will use a default value instead ...
- WARNING: variable CGTOL= is missing. I will use a default value instead ...
- WARNING: variable ELEC_ETOL= is missing. I will use a default value instead ...
- WARNING: variable COULACC= is missing. I will use a default value instead ...
- WARNING: variable COULCUT= is missing. I will use a default value instead ...
- WARNING: variable COULR1= is missing. I will use a default value instead ...
- WARNING: variable CHTOL= is missing. I will use a default value instead ...
- WARNING: variable BETA= is missing. I will use a default value instead ...
- WARNING: variable MCSIGMA= is missing. I will use a default value instead ...
- WARNING: variable PPBETA= is missing. I will use a default value instead ...
- WARNING: variable PPSIGMA= is missing. I will use a default value instead ...
- WARNING: variable ER= is missing. I will use a default value instead ...
- WARNING: variable INITIALIZED= is missing. I will use a default value instead ...
-  
-  
- ############### Parameters used for this run ################
-  CONTROL{                                          
-  xControl=           1
-  DEBUGON=           0
-  FERMIM=           6
-  CGORLIB=           1
-  NORECS=           1
-  ENTROPYKIND=           1
-  PPOTON=           1
-  VDWON=           0
-  SPINON=           0
-  ELECTRO=           1
-  ELECMETH=           0
-  MAXSCF=         450
-  MINSP2ITER=          22
-  FULLQCONV=           1
-  QITER=           3
-  ORDERNMOL=           0
-  SPARSEON=           1
-  THRESHOLDON=           1
-  FILLINSTOP=         100
-  BLKSZ=           4
-  MSPARSE=        1500
-  LCNON=           0
-  LCNITER=           4
-  RELAX=           0
-  MAXITER=      100000
-  MDON=           1
-  PBCON=           1
-  RESTART=           0
-  CHARGE=           0
-  XBO=           1
-  XBODISON=           1
-  XBODISORDER=           5
-  NGPU=           2
-  KON=           0
-  COMPFORCE=           1
-  DOSFIT=           0
-  INTS2FIT=           1
-  NFITSTEP=        5000
-  QFIT=           0
-  PPFITON=           0
-  ALLFITON=           0
-  PPSTEP=         500
-  BISTEP=         500
-  PP2FIT=           2
-  BINT2FIT=           6
-  PPNMOL=          10
-  PPNGEOM=         200
-  PARREP=           0
-  VERBOSE=           0
-  MIXER=           0
-  RESTARTLIB=           0
-  CGTOL=   9.9999999747524271E-007
-  KBT=   0.0000000000000000     
-  SPINTOL=   1.0000000000000000E-004
-  ELEC_ETOL=   1.0000000474974513E-003
-  ELEC_QTOL=   1.0000000000000000E-008
-  COULACC=   9.9999999747524271E-007
-  COULCUT=  -500.00000000000000     
-  COULR1=   500.00000000000000     
-  BREAKTOL=   9.9999999999999995E-007
-  QMIX=  0.25000000000000000     
-  SPINMIX=  0.25000000000000000     
-  MDMIX=  0.25000000000000000     
-  NUMTHRESH=   9.9999999999999995E-007
-  CHTOL=   9.9999997764825821E-003
-  SKIN=   1.0000000000000000     
-  RLXFTOL=   9.9999999999999995E-008
-  BETA=   1000.0000000000000     
-  MCSIGMA=  0.20000000298023224     
-  PPBETA=   1000.0000000000000     
-  PPSIGMA=   9.9999997764825821E-003
-  ER=   1.0000000000000000     
-  JobName=MyJob                                                                                               
-  BASISTYPE=NONORTHO                                                                                            
-  SP2CONV=REL                                                                                                 
-  RELAXTYPE=SD                                                                                                  
-  PARAMPATH=./TBparam                                                                                           
-  COORDSFILE=./coords.dat                                                                                        
-  INITIALIZED= F
-  }                                                 
-  
- ./TBparam/electrons.dat
- MDCONTROL{                                        }                                                 
-  
- WARNING: variable RNDIST= is missing. I will use a default value instead ...
- WARNING: variable SEEDINIT= is missing. I will use a default value instead ...
- WARNING: variable NPTTYPE= is missing. I will use a default value instead ...
- WARNING: variable UDNEIGH= is missing. I will use a default value instead ...
- WARNING: variable DUMPFREQ= is missing. I will use a default value instead ...
- WARNING: variable RSFREQ= is missing. I will use a default value instead ...
- WARNING: variable WRTFREQ= is missing. I will use a default value instead ...
- WARNING: variable TOINITTEMP5= is missing. I will use a default value instead ...
- WARNING: variable THERMPER= is missing. I will use a default value instead ...
- WARNING: variable THERMRUN= is missing. I will use a default value instead ...
- WARNING: variable NVTON= is missing. I will use a default value instead ...
- WARNING: variable NPTON= is missing. I will use a default value instead ...
- WARNING: variable AVEPER= is missing. I will use a default value instead ...
- WARNING: variable SEED= is missing. I will use a default value instead ...
- WARNING: variable SHOCKON= is missing. I will use a default value instead ...
- WARNING: variable SHOCKSTART= is missing. I will use a default value instead ...
- WARNING: variable SHOCKDIR= is missing. I will use a default value instead ...
- WARNING: variable MDADAPT= is missing. I will use a default value instead ...
- WARNING: variable GETHUG= is missing. I will use a default value instead ...
- WARNING: variable RSLEVEL= is missing. I will use a default value instead ...
- WARNING: variable DT= is missing. I will use a default value instead ...
- WARNING: variable TEMPERATURE= is missing. I will use a default value instead ...
- WARNING: variable FRICTION= is missing. I will use a default value instead ...
- WARNING: variable PTARGET= is missing. I will use a default value instead ...
- WARNING: variable UPARTICLE= is missing. I will use a default value instead ...
- WARNING: variable USHOCK= is missing. I will use a default value instead ...
- WARNING: variable C0= is missing. I will use a default value instead ...
- WARNING: variable E0= is missing. I will use a default value instead ...
- WARNING: variable V0= is missing. I will use a default value instead ...
- WARNING: variable P0= is missing. I will use a default value instead ...
- WARNING: variable DUMMY= is missing. I will use a default value instead ...
-  
-  
- ############### Parameters used for this run ################
-  MDCONTROL{                                        
-  MAXITER=          -1
-  UDNEIGH=           1
-  DUMPFREQ=         250
-  RSFREQ=         500
-  WRTFREQ=          25
-  TOINITTEMP5=           1
-  THERMPER=         500
-  THERMRUN=       50000
-  NVTON=           0
-  NPTON=           0
-  AVEPER=        1000
-  SEED=          54
-  SHOCKON=           0
-  SHOCKSTART=      100000
-  SHOCKDIR=           1
-  MDADAPT=           0
-  GETHUG=           0
-  RSLEVEL=           0
-  DT=  0.25000000000000000     
-  TEMPERATURE=   300.00000000000000     
-  FRICTION=   1000.0000000000000     
-  PTARGET=   0.0000000000000000     
-  UPARTICLE=   500.00000000000000     
-  USHOCK=  -4590.0000000000000     
-  C0=   1300.0000000000000     
-  E0=  -795.72497558593750     
-  V0=   896.98486328125000     
-  P0=   8.3149001002311707E-002
-  RNDIST=GAUSSIAN                                                                                            
-  SEEDINIT=UNIFORM                                                                                             
-  NPTTYPE=ISO                                                                                                 
-  DUMMY= F
-  }                                                 
-  
- LIBCALLS           0
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -110.94281402417451        9.3197859655447317        0.0000000000000000        3.3331152608769714     
- LIBCALLS           1
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -111.00875524736128        9.3653691493930946        0.0000000000000000        3.3307590218500454     
- LIBCALLS           2
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -111.20542679804305        9.5022104076319209        0.0000000000000000        3.3237269236958826     
- LIBCALLS           3
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -111.52938059528239        9.7304811436977623        0.0000000000000000        3.3121168872278743     
- LIBCALLS           4
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -111.97463249071366        10.050121693432235        0.0000000000000000        3.2961492065207088     
- LIBCALLS           5
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.53270518796754        10.460328095449432        0.0000000000000000        3.2761112890303719     
- LIBCALLS           6
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.19233973551384        10.958848347453728        0.0000000000000000        3.2524094948032394     
- LIBCALLS           7
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.93936061504219        11.541120618354967        0.0000000000000000        3.2255715906285793     
- LIBCALLS           8
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.75657630591589        12.199315594286325        0.0000000000000000        3.1962412869596100     
- LIBCALLS           9
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -115.62363727592754        12.921383532128770        0.0000000000000000        3.1652236023838971     
- LIBCALLS          10
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.51738028417616        13.690253224922545        0.0000000000000000        3.1333864449223818     
- LIBCALLS          11
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -117.41167836078414        14.483370804317431        0.0000000000000000        3.1018474945925432     
- LIBCALLS          12
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -118.27888830961329        15.272791625586624        0.0000000000000000        3.0716022180609772     
- LIBCALLS          13
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.09006809777934        16.026020995592610        0.0000000000000000        3.0437832241644842     
- LIBCALLS          14
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.81665859965702        16.707725410478066        0.0000000000000000        3.0194382402972129     
- LIBCALLS          15
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.43171665196000        17.282293509806884        0.0000000000000000        2.9995944159949395     
- LIBCALLS          16
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.91202932933264        17.717025741135480        0.0000000000000000        2.9850159611897484     
- LIBCALLS          17
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.23935305628714        17.985521384886379        0.0000000000000000        2.9763132734231292     
- LIBCALLS          18
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.40195013006486        18.070687763205626        0.0000000000000000        2.9738279411203812     
- LIBCALLS          19
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.39540873020161        17.966785565900089        0.0000000000000000        2.9776410698341418     
- LIBCALLS          20
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.22299732491055        17.680085363043698        0.0000000000000000        2.9875419962840417     
- LIBCALLS          21
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.89520311723561        17.228004261852682        0.0000000000000000        3.0030824758482719     
- LIBCALLS          22
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.42892991839108        16.636927104987372        0.0000000000000000        3.0235548851138652     
- LIBCALLS          23
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.84603562384113        15.939176953031323        0.0000000000000000        3.0480682132279808     
- LIBCALLS          24
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.17151378155378        15.169713318754383        0.0000000000000000        3.0757033760823562     
- LIBCALLS          25
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -118.43237009319661        14.363090728730079        0.0000000000000000        3.1053593079625457     
- LIBCALLS          26
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -117.65587959220025        13.551051330611342        0.0000000000000000        3.1359367589132958     
- LIBCALLS          27
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.86794783202731        12.760928656005802        0.0000000000000000        3.1665525874091585     
- LIBCALLS          28
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.09314111752745        12.014864684105008        0.0000000000000000        3.1962157162544820     
- LIBCALLS          29
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -115.35329645548983        11.329720850249741        0.0000000000000000        3.2241713466126849     
- LIBCALLS          30
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.66766945168203        10.717501941208962        0.0000000000000000        3.2497326120829619     
- LIBCALLS          31
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.05267853351812        10.186102377105355        0.0000000000000000        3.2723439005172468     
- LIBCALLS          32
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.52195471723405        9.7402032028335377        0.0000000000000000        3.2915777178346559     
- LIBCALLS          33
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.08654808143162        9.3821857555240076        0.0000000000000000        3.3070881064986164     
- LIBCALLS          34
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.75494140290169        9.1129669843369658        0.0000000000000000        3.3186769594405297     
- LIBCALLS          35
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.53346080566452        8.9326971516334606        0.0000000000000000        3.3261797960311763     
- LIBCALLS          36
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.42631053676025        8.8412887543407273        0.0000000000000000        3.3295101207595583     
- LIBCALLS          37
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.43567911088179        8.8387604511711384        0.0000000000000000        3.3286360397306387     
- LIBCALLS          38
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.56180874683180        8.9253908783870841        0.0000000000000000        3.3235794828927934     
- LIBCALLS          39
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.80290981416660        9.1016780459478674        0.0000000000000000        3.3144303393175201     
- LIBCALLS          40
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.15529209572232        9.3681021116147463        0.0000000000000000        3.3012719922659173     
- LIBCALLS          41
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.61284717182851        9.7246892073080176        0.0000000000000000        3.2843276907821406     
- LIBCALLS          42
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.16711238367500        10.170382433756300        0.0000000000000000        3.2638758866524444     
- LIBCALLS          43
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.80697882175535        10.702240750749448        0.0000000000000000        3.2402928278295451     
- LIBCALLS          44
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -115.51862249254057        11.314512276989859        0.0000000000000000        3.2140189987358694     
- LIBCALLS          45
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.28534475502829        11.997664972113199        0.0000000000000000        3.1855791836729437     
- LIBCALLS          46
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -117.08723294353808        12.737504349188432        0.0000000000000000        3.1557205936583181     
- LIBCALLS          47
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -117.90172272355942        13.514542609912253        0.0000000000000000        3.1252466759266087     
- LIBCALLS          48
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -118.70392627447073        14.303827027310493        0.0000000000000000        3.0950533786893732     
- LIBCALLS          49
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.46728361372288        15.075425279261220        0.0000000000000000        3.0661202668284480     
- LIBCALLS          50
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.16480071670361        15.795723720235596        0.0000000000000000        3.0394030522382605     
- LIBCALLS          51
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.77012122199473        16.429579578207949        0.0000000000000000        3.0158910566711334     
- LIBCALLS          52
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.25943485841766        16.943195338409559        0.0000000000000000        2.9964108616830281     
- LIBCALLS          53
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.61275582007269        17.307379355481601        0.0000000000000000        2.9817016064731785     
- LIBCALLS          54
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.81557415209883        17.500688554193868        0.0000000000000000        2.9722905637821611     
- LIBCALLS          55
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.85979389563140        17.511877645177901        0.0000000000000000        2.9685356305551474     
- LIBCALLS          56
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.74454585055143        17.341170281709367        0.0000000000000000        2.9705149057151141     
- LIBCALLS          57
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.47625724150488        17.000096879575938        0.0000000000000000        2.9780008785307088     
- LIBCALLS          58
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.06771474420596        16.509959464438374        0.0000000000000000        2.9906138266349656     
- LIBCALLS          59
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.53702830874704        15.899266098308772        0.0000000000000000        3.0078351734174715     
- LIBCALLS          60
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.90667912574422        15.200652842845301        0.0000000000000000        3.0288733658622142     
- LIBCALLS          61
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.20142467775943        14.447825469624703        0.0000000000000000        3.0529481020908245     
- LIBCALLS          62
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -118.44747494197328        13.672949108115853        0.0000000000000000        3.0790791220573088     
- LIBCALLS          63
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -117.67063237406208        12.904741667499017        0.0000000000000000        3.1063745183559131     
- LIBCALLS          64
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.89550228683500        12.167344616151606        0.0000000000000000        3.1339818740985033     
- LIBCALLS          65
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.14487351718614        11.479908971904207        0.0000000000000000        3.1610748652786995     
- LIBCALLS          66
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -115.43917601644073        10.856755674815151        0.0000000000000000        3.1869042214936911     
- LIBCALLS          67
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.79630542914917        10.307930318909381        0.0000000000000000        3.2107896540741994     
- LIBCALLS          68
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.23118520942130        9.8399835349372715        0.0000000000000000        3.2322754400486997     
- LIBCALLS          69
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.75645667348935        9.4568320682906393        0.0000000000000000        3.2508686207040949     
- LIBCALLS          70
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.38220191758144        9.1605931457952803        0.0000000000000000        3.2662052636761625     
- LIBCALLS          71
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.11651461323785        8.9523172650382463        0.0000000000000000        3.2778578161416640     
- LIBCALLS          72
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.96490300473705        8.8325758589074610        0.0000000000000000        3.2856373346184280     
- LIBCALLS          73
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -112.93101384064629        8.8018792766284140        0.0000000000000000        3.2893376450243901     
- LIBCALLS          74
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.01657988020818        8.8609123616606951        0.0000000000000000        3.2887786713823335     
- LIBCALLS          75
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.22122702505257        9.0105808374276855        0.0000000000000000        3.2838806809960044     
- LIBCALLS          76
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.54255812607462        9.2518619694254909        0.0000000000000000        3.2746170980725564     
- LIBCALLS          77
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -113.97595003796289        9.5854566564348804        0.0000000000000000        3.2610495238703536     
- LIBCALLS          78
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -114.51445216471619        10.011242264155852        0.0000000000000000        3.2433103887056101     
- LIBCALLS          79
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -115.14835871057100        10.527538366743359        0.0000000000000000        3.2217018278255036     
- LIBCALLS          80
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -115.86512618816471        11.130220642932718        0.0000000000000000        3.1966546818138903     
- LIBCALLS          81
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -116.64916580084807        11.811746817430592        0.0000000000000000        3.1687509169099037     
- LIBCALLS          82
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -117.48162972769103        12.560201275368994        0.0000000000000000        3.1387793445426220     
- LIBCALLS          83
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -118.34080112521505        13.358507776606700        0.0000000000000000        3.1076005013428842     
- LIBCALLS          84
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.20206255799097        14.183999576696523        0.0000000000000000        3.0762625451098367     
- LIBCALLS          85
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.03875955947012        15.008549885925623        0.0000000000000000        3.0458557745855401     
- LIBCALLS          86
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.82281065648482        15.799445052997022        0.0000000000000000        3.0175902569508040     
- LIBCALLS          87
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.52638053902615        16.521105731022047        0.0000000000000000        2.9925661691795984     
- LIBCALLS          88
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -122.12297505178334        17.137613862262167        0.0000000000000000        2.9718740800190462     
- LIBCALLS          89
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -122.58954501498538        17.615819283155187        0.0000000000000000        2.9563457612376758     
- LIBCALLS          90
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -122.90768650775293        17.928615619513138        0.0000000000000000        2.9466637669908935     
- LIBCALLS          91
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -123.06510359278838        18.057846294334183        0.0000000000000000        2.9432773288779130     
- LIBCALLS          92
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -123.05653995529889        17.996310208253615        0.0000000000000000        2.9463730237128352     
- LIBCALLS          93
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -122.88443709725219        17.748486968230267        0.0000000000000000        2.9557418006906766     
- LIBCALLS          94
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -122.55804625906457        17.329857520510558        0.0000000000000000        2.9710497340098647     
- LIBCALLS          95
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -122.09316916859144        16.764989519228550        0.0000000000000000        2.9916333369114647     
- LIBCALLS          96
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -121.51050736457847        16.084787212290774        0.0000000000000000        3.0167038701280053     
- LIBCALLS          97
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.83475656442954        15.323405512114466        0.0000000000000000        3.0451593241515909     
- LIBCALLS          98
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -120.09218577985371        14.515310319889227        0.0000000000000000        3.0759929793994090     
- LIBCALLS          99
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -119.30969482099719        13.692843612811791        0.0000000000000000        3.1081426979179545     
- LIBCALLS         100
- Energy Components (TRRHOH, EREP, ENTE, ECOUL)  -118.51358261827596        12.884492109393644        0.0000000000000000        3.1405428597121636     
--- a/examples/latte/log.19Sep17.latte.water.min.g++.1
+++ b/examples/latte/log.19Sep17.latte.water.min.g++.1
@ -1,152 +0,0 @@
-LAMMPS (1 Sep 2017)
-# simple water model with LATTE
-
-units		metal
-atom_style	full
-atom_modify     sort 0 0.0    # turn off sorting of the coordinates
-
-read_data       data.water
-  orthogonal box = (0 0 0) to (6.267 6.267 6.267)
-  1 by 1 by 1 MPI processor grid
-  reading atoms ...
-  24 atoms
-  0 = max # of 1-2 neighbors
-  0 = max # of 1-3 neighbors
-  0 = max # of 1-4 neighbors
-  1 = max # of special neighbors
-
-# replicate system if requested
-
-variable	x index 1
-variable	y index 1
-variable	z index 1
-
-variable        nrep equal v_x*v_y*v_z
-if              "${nrep} > 1" then "replicate $x $y $z"
-
-# initialize system
-
-velocity	all create 0.0 87287 loop geom
-
-pair_style      zero 1.0
-pair_coeff	* *
-
-neighbor	1.0 bin
-neigh_modify    every 1 delay 0 check yes
-
-timestep        0.00025
-
-fix		1 all nve
-
-fix             2 all latte NULL
-fix_modify      2 energy yes
-
-thermo_style    custom step temp pe etotal press
-
-# minimization
-
-thermo          10
-min_style fire
-minimize        1.0e-9 1.0e-9 500 500
-Neighbor list info ...
-  update every 1 steps, delay 0 steps, check yes
-  max neighbors/atom: 2000, page size: 100000
-  master list distance cutoff = 2
-  ghost atom cutoff = 2
-  binsize = 1, bins = 7 7 7
-  1 neighbor lists, perpetual/occasional/extra = 1 0 0
-  (1) pair zero, perpetual
-      attributes: half, newton on
-      pair build: half/bin/newton
-      stencil: half/bin/3d/newton
-      bin: standard
-Per MPI rank memory allocation (min/avg/max) = 5.629 | 5.629 | 5.629 Mbytes
-Step Temp PotEng TotEng Press 
-       0            0   -104.95614   -104.95614    48229.712 
-      10    349.44219   -105.50971   -104.47083    62149.591 
-      20    1253.6752   -107.00898   -103.28182    116444.44 
-      30    134.63588   -107.56184   -107.16157    59854.143 
-      40    2.4043703   -108.15301   -108.14586     32685.77 
-      50    162.13426   -108.40551   -107.92349    62104.273 
-      60    134.03149   -108.70118   -108.30271    49400.525 
-      70    64.159014   -108.78034    -108.5896    37243.303 
-      80    240.49926   -109.10766   -108.39266    42158.884 
-      90   0.60467192   -109.61818   -109.61639    14107.515 
-     100    1.4691163   -109.65556   -109.65119    21596.775 
-     110    30.500628   -109.69267     -109.602    16104.639 
-     120    120.62379   -109.83749   -109.47888     9474.971 
-     130    8.4742975   -109.99986   -109.97467    10104.102 
-     140    3.4732679   -110.01209   -110.00176    11990.442 
-     150    24.749482   -110.04313   -109.96955    10851.569 
-     160    4.1106505   -110.13288   -110.12066    8257.3969 
-     170 0.0065628716   -110.18061   -110.18059    7876.8748 
-     180    2.0542078    -110.1837   -110.17759    7996.0533 
-     190    20.134782   -110.21071   -110.15085    7556.1811 
-     200    2.3397267    -110.3244   -110.31745     3767.062 
-     210    4.3544709   -110.34438   -110.33143     4889.145 
-     220    1.1872367   -110.37457   -110.37104    4162.6543 
-     230    2.2798399   -110.38081   -110.37403    4321.0943 
-     240    11.835907   -110.39611   -110.36092    4187.5757 
-     250   0.13741849   -110.41453   -110.41412    3720.7527 
-     260    4.2283185   -110.42036   -110.40779    3743.3494 
-     270   0.47243724   -110.44349   -110.44208    3172.1866 
-     280   0.06090137   -110.45428    -110.4541    3065.9348 
-     290    5.3413962   -110.46285   -110.44697    3121.2924 
-     300    8.2032986   -110.48519    -110.4608    2705.5001 
-     310    2.0783529   -110.48807   -110.48189    2740.7989 
-     320    16.629185   -110.51002   -110.46058    2581.7434 
-     330   0.19723065   -110.53444   -110.53385    1942.0228 
-     340    6.2758334   -110.54361   -110.52495    1924.0965 
-     350    1.4539052   -110.59108   -110.58676   -449.41056 
-     360    0.0514233   -110.60143   -110.60128    1284.8259 
-     370    1.7240145   -110.60394   -110.59881    1468.0004 
-     380     13.28516   -110.62337   -110.58387    1573.4714 
-     390    1.2247432   -110.63525   -110.63161    1113.4557 
-     400    0.3946985   -110.63694   -110.63576    1083.0801 
-     410    2.9831433     -110.641   -110.63213     1112.419 
-     420  0.068550589   -110.66029   -110.66009    897.09211 
-     430   0.83976182   -110.66259   -110.66009    918.69832 
-     440    4.4760907   -110.66844   -110.65513    915.24435 
-     450    1.2841241   -110.67482     -110.671    953.30422 
-     460    2.5707455   -110.68509   -110.67745    775.21273 
-     470   0.99721544   -110.68646    -110.6835    812.74984 
-     480    6.8379261   -110.69468   -110.67435     787.9705 
-     490   0.18134438   -110.69628   -110.69574    675.52792 
-     500    2.0946523   -110.69918   -110.69295    696.82065 
-Loop time of 31.775 on 1 procs for 500 steps with 24 atoms
-
-884.8% CPU use with 1 MPI tasks x no OpenMP threads
-
-Minimization stats:
-  Stopping criterion = max iterations
-  Energy initial, next-to-last, final = 
-         -104.95614332     -110.698546127     -110.699182193
-  Force two-norm initial, final = 19.119 0.234621
-  Force max component initial, final = 11.7759 0.0903198
-  Final line search alpha, max atom move = 0 0
-  Iterations, force evaluations = 500 500
-
-MPI task timing breakdown:
-Section |  min time  |  avg time  |  max time  |%varavg| %total
---------------------------------------------------------------
-Pair    | 0.00016952 | 0.00016952 | 0.00016952 |   0.0 |  0.00
-Bond    | 2.8372e-05 | 2.8372e-05 | 2.8372e-05 |   0.0 |  0.00
-Neigh   | 3.0994e-05 | 3.0994e-05 | 3.0994e-05 |   0.0 |  0.00
-Comm    | 0.00060034 | 0.00060034 | 0.00060034 |   0.0 |  0.00
-Output  | 0.00057817 | 0.00057817 | 0.00057817 |   0.0 |  0.00
-Modify  | 31.771     | 31.771     | 31.771     |   0.0 | 99.99
-Other   |            | 0.002469   |            |       |  0.01
-
-Nlocal:    24 ave 24 max 24 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    71 ave 71 max 71 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    27 ave 27 max 27 min
-Histogram: 1 0 0 0 0 0 0 0 0 0
-
-Total # of neighbors = 27
-Ave neighs/atom = 1.125
-Ave special neighs/atom = 0
-Neighbor list builds = 2
-Dangerous builds = 0
-Total wall time: 0:00:31
--- a/examples/latte/log.21Jun18.latte.graphene.boxrelax.g++.1
+++ b/examples/latte/log.21Jun18.latte.graphene.boxrelax.g++.1
@ -0,0 +1,170 @@
+LAMMPS (11 May 2018)
+# Simple water model with LATTE
+
+units		metal
+atom_style	full
+atom_modify     sort 0 0.0    # turn off sorting of the coordinates
+
+read_data       data.graphene.boxrel
+  triclinic box = (0 0 0) to (10 8 20) with tilt (4.89859e-16 1.22465e-15 1.22465e-15)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  32 atoms
+  0 = max # of 1-2 neighbors
+  0 = max # of 1-3 neighbors
+  0 = max # of 1-4 neighbors
+  1 = max # of special neighbors
+
+# replicate system if requested
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable        nrep equal v_x*v_y*v_z
+if              "${nrep} > 1" then "replicate $x $y $z"
+
+# initialize system
+
+velocity	all create 0.0 87287 loop geom
+
+pair_style      zero 1.0
+pair_coeff	* *
+
+neighbor	1.0 bin
+neigh_modify    every 1 delay 0 check yes
+
+timestep        0.00025
+
+fix 1 all box/relax iso 0.0 vmax 0.001
+
+fix             2 all latte NULL
+fix_modify      2 energy yes
+
+thermo_style    custom   etotal
+
+# minimization
+
+thermo          1
+fix 3 all print 1 "Total Energy ="
+min_style cg
+min_modify dmax 0.1
+min_modify line quadratic
+minimize        1.0e-4 1.0e-4 10000 10000
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 2
+  ghost atom cutoff = 2
+  binsize = 1, bins = 11 9 20
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton/tri
+      stencil: half/bin/3d/newton/tri
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 6.779 | 6.779 | 6.779 Mbytes
+TotEng 
+  -247.46002 
+  -247.67224 
+  -247.87937 
+  -248.08148 
+  -248.27865 
+  -248.47096 
+  -248.65851 
+  -248.84137 
+  -249.01964 
+  -249.19342 
+  -249.36281 
+  -249.52791 
+  -249.68883 
+   -249.8457 
+  -249.99865 
+   -250.1478 
+  -250.29332 
+  -250.43535 
+  -250.57409 
+  -250.70972 
+  -250.84247 
+  -250.97258 
+  -251.10035 
+   -251.2261 
+  -251.35021 
+  -251.47314 
+  -251.59543 
+  -251.71776 
+  -251.84096 
+   -251.9661 
+  -252.09459 
+  -252.22833 
+  -252.37003 
+  -252.52371 
+  -252.69578 
+  -252.89752 
+  -253.15197 
+  -253.52044 
+  -254.31418 
+   -255.6175 
+   -256.8162 
+   -258.1227 
+  -259.38401 
+  -260.74831 
+  -262.03991 
+   -263.5463 
+  -264.70486 
+  -267.69144 
+  -267.88682 
+  -269.03519 
+  -270.60187 
+  -270.65382 
+  -270.74279 
+  -271.55883 
+  -271.81248 
+  -271.87529 
+  -273.01494 
+  -273.23948 
+  -273.28719 
+  -273.35272 
+  -273.41591 
+  -273.46274 
+  -273.54755 
+  -273.58318 
+  -273.73111 
+  -273.75754 
+Loop time of 39.4155 on 1 procs for 65 steps with 32 atoms
+
+1582.4% CPU use with 1 MPI tasks x no OpenMP threads
+
+Minimization stats:
+  Stopping criterion = energy tolerance
+  Energy initial, next-to-last, final = 
+        -247.460020579     -273.731112592     -273.757543461
+  Force two-norm initial, final = 201.608 9.43485
+  Force max component initial, final = 188.924 2.41297
+  Final line search alpha, max atom move = 0.000223273 0.00053875
+  Iterations, force evaluations = 65 65
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.00012159 | 0.00012159 | 0.00012159 |   0.0 |  0.00
+Bond    | 5.1975e-05 | 5.1975e-05 | 5.1975e-05 |   0.0 |  0.00
+Neigh   | 4.1962e-05 | 4.1962e-05 | 4.1962e-05 |   0.0 |  0.00
+Comm    | 0.00026107 | 0.00026107 | 0.00026107 |   0.0 |  0.00
+Output  | 0.0013342  | 0.0013342  | 0.0013342  |   0.0 |  0.00
+Modify  | 39.412     | 39.412     | 39.412     |   0.0 | 99.99
+Other   |            | 0.00127    |            |       |  0.00
+
+Nlocal:    32 ave 32 max 32 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    100 ave 100 max 100 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    48 ave 48 max 48 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 48
+Ave neighs/atom = 1.5
+Ave special neighs/atom = 0
+Neighbor list builds = 1
+Dangerous builds = 0
+Total wall time: 0:00:40
--- a/examples/latte/log.21Jun18.latte.sucrose.g++.1
+++ b/examples/latte/log.21Jun18.latte.sucrose.g++.1
@ -0,0 +1,103 @@
+LAMMPS (11 May 2018)
+# simple sucrose model with LATTE
+
+units		metal
+atom_style	full
+atom_modify     sort 0 0.0    # turn off sorting of the coordinates
+
+read_data       data.sucrose
+  orthogonal box = (0 0 0) to (17.203 18.009 21.643)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  45 atoms
+  0 = max # of 1-2 neighbors
+  0 = max # of 1-3 neighbors
+  0 = max # of 1-4 neighbors
+  1 = max # of special neighbors
+
+# replicate system if requested
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable        nrep equal v_x*v_y*v_z
+if              "${nrep} > 1" then "replicate $x $y $z"
+
+# initialize system
+
+velocity	all create 0.0 87287 loop geom
+
+pair_style      zero 1.0
+pair_coeff	* *
+
+neighbor	1.0 bin
+neigh_modify    every 1 delay 0 check yes
+
+timestep        0.00025
+
+fix		1 all nve
+
+fix             2 all latte NULL
+fix_modify      2 energy yes
+
+thermo_style    custom step temp pe etotal press
+
+# dynamics
+
+thermo          10
+run		100
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 2
+  ghost atom cutoff = 2
+  binsize = 1, bins = 18 19 22
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 0.5064 | 0.5064 | 0.5064 Mbytes
+Step Temp PotEng TotEng Press 
+       0            0   -251.26617   -251.26617    16.617234 
+      10  0.025263709   -251.26631   -251.26617    8.0576708 
+      20  0.034232467   -251.26636   -251.26617    1.6673442 
+      30  0.059079556    -251.2665   -251.26617    11.058458 
+      40  0.055499766   -251.26648   -251.26617    14.837775 
+      50  0.058499509    -251.2665   -251.26617    6.7183113 
+      60  0.071094535   -251.26657   -251.26617    6.6133687 
+      70  0.084309439   -251.26665   -251.26617    12.372721 
+      80    0.1089929   -251.26679   -251.26617    8.8355516 
+      90   0.11378257   -251.26681   -251.26617    5.1177922 
+     100   0.13003966   -251.26691   -251.26617    8.2431185 
+Loop time of 27.8386 on 1 procs for 100 steps with 45 atoms
+
+Performance: 0.078 ns/day, 309.318 hours/ns, 3.592 timesteps/s
+1799.6% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 8.3685e-05 | 8.3685e-05 | 8.3685e-05 |   0.0 |  0.00
+Bond    | 7.4148e-05 | 7.4148e-05 | 7.4148e-05 |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0.00016689 | 0.00016689 | 0.00016689 |   0.0 |  0.00
+Output  | 0.00032401 | 0.00032401 | 0.00032401 |   0.0 |  0.00
+Modify  | 27.837     | 27.837     | 27.837     |   0.0 |100.00
+Other   |            | 0.0005403  |            |       |  0.00
+
+Nlocal:    45 ave 45 max 45 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    0 ave 0 max 0 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    59 ave 59 max 59 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 59
+Ave neighs/atom = 1.31111
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:28
--- a/examples/latte/log.21Jun18.latte.water.g++.1
+++ b/examples/latte/log.21Jun18.latte.water.g++.1
@ -0,0 +1,103 @@
+LAMMPS (11 May 2018)
+# simple water model with LATTE
+
+units		metal
+atom_style	full
+atom_modify     sort 0 0.0    # turn off sorting of the coordinates
+
+read_data       data.water
+  orthogonal box = (0 0 0) to (6.267 6.267 6.267)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  24 atoms
+  0 = max # of 1-2 neighbors
+  0 = max # of 1-3 neighbors
+  0 = max # of 1-4 neighbors
+  1 = max # of special neighbors
+
+# replicate system if requested
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable        nrep equal v_x*v_y*v_z
+if              "${nrep} > 1" then "replicate $x $y $z"
+
+# initialize system
+
+velocity	all create 0.0 87287 loop geom
+
+pair_style      zero 1.0
+pair_coeff	* *
+
+neighbor	1.0 bin
+neigh_modify    every 1 delay 0 check yes
+
+timestep        0.00025
+
+fix		1 all nve
+
+fix             2 all latte NULL
+fix_modify      2 energy yes
+
+thermo_style    custom step temp pe etotal press
+
+# dynamics
+
+thermo          10
+run		100
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 2
+  ghost atom cutoff = 2
+  binsize = 1, bins = 7 7 7
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 5.629 | 5.629 | 5.629 Mbytes
+Step Temp PotEng TotEng Press 
+       0            0   -104.95594   -104.95594    48236.006 
+      10     336.5303   -105.96026   -104.95976    97997.303 
+      20    529.06385   -106.53021   -104.95731    131520.49 
+      30    753.62616    -107.1995   -104.95898    49297.371 
+      40     716.6565   -107.08802   -104.95741    28307.272 
+      50    824.04417   -107.40822   -104.95835    102167.48 
+      60    933.56056   -107.73478   -104.95932    92508.792 
+      70    851.18518   -107.48766   -104.95711     13993.28 
+      80    999.80265   -107.93146   -104.95906    36700.417 
+      90    998.77707   -107.92569   -104.95634     107233.7 
+     100    1281.4446   -108.76961   -104.95989    49703.193 
+Loop time of 10.6388 on 1 procs for 100 steps with 24 atoms
+
+Performance: 0.203 ns/day, 118.209 hours/ns, 9.400 timesteps/s
+6459.7% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 7.6771e-05 | 7.6771e-05 | 7.6771e-05 |   0.0 |  0.00
+Bond    | 7.5817e-05 | 7.5817e-05 | 7.5817e-05 |   0.0 |  0.00
+Neigh   | 4.6015e-05 | 4.6015e-05 | 4.6015e-05 |   0.0 |  0.00
+Comm    | 0.00031829 | 0.00031829 | 0.00031829 |   0.0 |  0.00
+Output  | 0.00032401 | 0.00032401 | 0.00032401 |   0.0 |  0.00
+Modify  | 10.637     | 10.637     | 10.637     |   0.0 | 99.99
+Other   |            | 0.00052    |            |       |  0.00
+
+Nlocal:    24 ave 24 max 24 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    77 ave 77 max 77 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    31 ave 31 max 31 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 31
+Ave neighs/atom = 1.29167
+Ave special neighs/atom = 0
+Neighbor list builds = 2
+Dangerous builds = 0
+Total wall time: 0:00:10
--- a/examples/latte/log.21Jun18.latte.water.min.g++.1
+++ b/examples/latte/log.21Jun18.latte.water.min.g++.1
@ -0,0 +1,108 @@
+LAMMPS (11 May 2018)
+# simple water model with LATTE
+
+units		metal
+atom_style	full
+atom_modify     sort 0 0.0    # turn off sorting of the coordinates
+
+read_data       data.water
+  orthogonal box = (0 0 0) to (6.267 6.267 6.267)
+  1 by 1 by 1 MPI processor grid
+  reading atoms ...
+  24 atoms
+  0 = max # of 1-2 neighbors
+  0 = max # of 1-3 neighbors
+  0 = max # of 1-4 neighbors
+  1 = max # of special neighbors
+
+# replicate system if requested
+
+variable	x index 1
+variable	y index 1
+variable	z index 1
+
+variable        nrep equal v_x*v_y*v_z
+if              "${nrep} > 1" then "replicate $x $y $z"
+
+# initialize system
+
+velocity	all create 0.0 87287 loop geom
+
+pair_style      zero 1.0
+pair_coeff	* *
+
+neighbor	1.0 bin
+neigh_modify    every 1 delay 0 check yes
+
+timestep        0.00025
+
+fix		1 all nve
+
+fix             2 all latte NULL
+fix_modify      2 energy yes
+
+thermo_style    custom step temp pe etotal press
+
+# minimization
+
+thermo          10
+
+min_style       fire
+minimize        1.0e-4 1.0e-4 500 500
+Neighbor list info ...
+  update every 1 steps, delay 0 steps, check yes
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 2
+  ghost atom cutoff = 2
+  binsize = 1, bins = 7 7 7
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair zero, perpetual
+      attributes: half, newton on
+      pair build: half/bin/newton
+      stencil: half/bin/3d/newton
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 5.629 | 5.629 | 5.629 Mbytes
+Step Temp PotEng TotEng Press 
+       0            0   -104.95594   -104.95594    48236.006 
+      10     349.4534   -105.50948   -104.47056    62157.729 
+      20    1253.6636   -107.00863   -103.28151    116456.71 
+      30    134.64051   -107.56155   -107.16127    59864.196 
+      40    2.4044989    -108.1527   -108.14556    32695.648 
+      47    137.26885   -108.30413   -107.89603    60177.442 
+Loop time of 6.42677 on 1 procs for 47 steps with 24 atoms
+
+6481.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+Minimization stats:
+  Stopping criterion = energy tolerance
+  Energy initial, next-to-last, final = 
+        -104.955944301     -108.302982895     -108.304126127
+  Force two-norm initial, final = 19.119 3.44609
+  Force max component initial, final = 11.7758 1.3408
+  Final line search alpha, max atom move = 0 0
+  Iterations, force evaluations = 47 47
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 4.6253e-05 | 4.6253e-05 | 4.6253e-05 |   0.0 |  0.00
+Bond    | 3.1948e-05 | 3.1948e-05 | 3.1948e-05 |   0.0 |  0.00
+Neigh   | 0          | 0          | 0          |   0.0 |  0.00
+Comm    | 0.00014353 | 0.00014353 | 0.00014353 |   0.0 |  0.00
+Output  | 0.00012302 | 0.00012302 | 0.00012302 |   0.0 |  0.00
+Modify  | 6.426      | 6.426      | 6.426      |   0.0 | 99.99
+Other   |            | 0.0004699  |            |       |  0.01
+
+Nlocal:    24 ave 24 max 24 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:    71 ave 71 max 71 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:    37 ave 37 max 37 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 37
+Ave neighs/atom = 1.54167
+Ave special neighs/atom = 0
+Neighbor list builds = 0
+Dangerous builds = 0
+Total wall time: 0:00:06
--- a/lib/atc/Function.cpp
+++ b/lib/atc/Function.cpp
@ -1,4 +1,6 @@
+#ifndef _WIN32
 #include <alloca.h>
+#endif
 #include "Function.h"
 #include "ATC_Error.h"
 #include "LammpsInterface.h"
@ -59,9 +61,13 @@ namespace ATC {
  {
    string type = args[0];
    int narg = nargs -1;
+#ifdef _WIN32
+    double *dargs = (double *) _alloca(sizeof(double) * narg);
+#else
    double *dargs = (double *) alloca(sizeof(double) * narg);
+#endif
    for (int i = 0; i < narg; ++i) dargs[i] = atof(args[i+1]);
-  
+
    return function(type, narg, dargs);
  }

@ -193,7 +199,11 @@ XT_Function_Mgr * XT_Function_Mgr::myInstance_ = NULL;
  {
    string type = args[0];
    int narg = nargs -1;
+#ifdef _WIN32
+    double *dargs = (double *) _alloca(sizeof(double) * narg);
+#else
    double *dargs = (double *) alloca(sizeof(double) * narg);
+#endif
    for (int i = 0; i < narg; ++i) dargs[i] = atof(args[i+1]);
  
    return function(type, narg, dargs);
--- a/lib/gpu/Makefile.linux_multi
+++ b/lib/gpu/Makefile.linux_multi
@ -0,0 +1,53 @@
+# /* ----------------------------------------------------------------------
+#  Generic Linux Makefile for CUDA (multi-architecture build)
+#     - Change CUDA_ARCH to list the architectures of your GPUs
+# ------------------------------------------------------------------------- */
+
+# which file will be copied to Makefile.lammps
+
+EXTRAMAKE = Makefile.lammps.standard
+
+ifeq ($(CUDA_HOME),)
+CUDA_HOME = /usr/local/cuda
+endif
+
+NVCC = nvcc
+
+# Kepler CUDA
+#CUDA_ARCH = -arch=sm_35
+# newer CUDA
+#CUDA_ARCH = -arch=sm_13
+# older CUDA
+#CUDA_ARCH = -arch=sm_10 -DCUDA_PRE_THREE
+
+CUDA_ARCH = -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61
+
+# this setting should match the one in the LAMMPS Makefile:
+# one of LAMMPS_SMALLBIG (default), LAMMPS_BIGBIG, or LAMMPS_SMALLSMALL
+
+LMP_INC = -DLAMMPS_SMALLBIG
+
+# precision for GPU calculations (choose exactly one)
+# -D_SINGLE_SINGLE  # Single precision for all calculations
+# -D_DOUBLE_DOUBLE  # Double precision for all calculations
+# -D_SINGLE_DOUBLE  # Single precision, but accumulation of forces, etc. in double
+
+CUDA_PRECISION = -D_SINGLE_DOUBLE
+
+CUDA_INCLUDE = -I$(CUDA_HOME)/include
+CUDA_LIB = -L$(CUDA_HOME)/lib64
+CUDA_OPTS = -DUNIX -O3 -Xptxas -v --use_fast_math $(LMP_INC) -Xcompiler "-fPIC -std=c++98"
+
+CUDR_CPP = mpicxx -DMPI_GERYON -DUCL_NO_EXIT -DMPICH_IGNORE_CXX_SEEK -DOMPI_SKIP_MPICXX=1 -fPIC
+CUDR_OPTS = -O2 $(LMP_INC) # -xHost -no-prec-div -ansi-alias
+
+BIN_DIR = ./
+OBJ_DIR = ./
+LIB_DIR = ./
+AR = ar
+BSH = /bin/sh
+
+CUDPP_OPT = -DUSE_CUDPP -Icudpp_mini
+
+include Nvidia.makefile_multi
+
--- a/lib/gpu/Nvidia.makefile
+++ b/lib/gpu/Nvidia.makefile
@ -77,7 +77,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
       $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o \
       $(OBJ_DIR)/lal_zbl.o $(OBJ_DIR)/lal_zbl_ext.o \
       $(OBJ_DIR)/lal_lj_cubic.o $(OBJ_DIR)/lal_lj_cubic_ext.o \
-       $(OBJ_DIR)/lal_ufm.o $(OBJ_DIR)/lal_ufm_ext.o 
+       $(OBJ_DIR)/lal_ufm.o $(OBJ_DIR)/lal_ufm_ext.o \
+       $(OBJ_DIR)/lal_dipole_long_lj.o $(OBJ_DIR)/lal_dipole_long_lj_ext.o \
+       $(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o \
+       $(OBJ_DIR)/lal_coul_long_cs.o $(OBJ_DIR)/lal_coul_long_cs_ext.o \
+       $(OBJ_DIR)/lal_born_coul_long_cs.o $(OBJ_DIR)/lal_born_coul_long_cs_ext.o \
+       $(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o

 CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
@ -133,7 +138,12 @@ CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
       $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye_cubin.h \
       $(OBJ_DIR)/zbl.cubin $(OBJ_DIR)/zbl_cubin.h \
       $(OBJ_DIR)/lj_cubic.cubin $(OBJ_DIR)/lj_cubic_cubin.h \
-       $(OBJ_DIR)/ufm.cubin $(OBJ_DIR)/ufm_cubin.h
+       $(OBJ_DIR)/ufm.cubin $(OBJ_DIR)/ufm_cubin.h \
+       $(OBJ_DIR)/dipole_long_lj.cubin $(OBJ_DIR)/dipole_long_lj_cubin.h \
+       $(OBJ_DIR)/lj_expand_coul_long.cubin $(OBJ_DIR)/lj_expand_coul_long_cubin.h \
+       $(OBJ_DIR)/coul_long_cs.cubin $(OBJ_DIR)/coul_long_cs_cubin.h \
+       $(OBJ_DIR)/born_coul_long_cs.cubin $(OBJ_DIR)/born_coul_long_cs_cubin.h \
+       $(OBJ_DIR)/born_coul_wolf_cs.cubin $(OBJ_DIR)/born_coul_wolf_cs_cubin.h

 all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)

@ -809,6 +819,66 @@ $(OBJ_DIR)/lal_lj_cubic.o: $(ALL_H) lal_lj_cubic.h lal_lj_cubic.cpp $(OBJ_DIR)/l
 $(OBJ_DIR)/lal_lj_cubic_ext.o: $(ALL_H) lal_lj_cubic.h lal_lj_cubic_ext.cpp lal_base_atomic.h
 	$(CUDR) -o $@ -c lal_lj_cubic_ext.cpp -I$(OBJ_DIR)

+$(OBJ_DIR)/dipole_long_lj.cubin: lal_dipole_long_lj.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_dipole_long_lj.cu
+
+$(OBJ_DIR)/dipole_long_lj_cubin.h: $(OBJ_DIR)/dipole_long_lj.cubin $(OBJ_DIR)/dipole_long_lj.cubin
+	$(BIN2C) -c -n dipole_long_lj $(OBJ_DIR)/dipole_long_lj.cubin > $(OBJ_DIR)/dipole_long_lj_cubin.h
+
+$(OBJ_DIR)/lal_dipole_long_lj.o: $(ALL_H) lal_dipole_long_lj.h lal_dipole_long_lj.cpp $(OBJ_DIR)/dipole_long_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
+	$(CUDR) -o $@ -c lal_dipole_long_lj.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dipole_long_lj_ext.o: $(ALL_H) lal_dipole_long_lj.h lal_dipole_long_lj_ext.cpp lal_base_dipole.h
+	$(CUDR) -o $@ -c lal_dipole_long_lj_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_coul_long.cubin: lal_lj_expand_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_lj_expand_coul_long.cu
+
+$(OBJ_DIR)/lj_expand_coul_long_cubin.h: $(OBJ_DIR)/lj_expand_coul_long.cubin $(OBJ_DIR)/lj_expand_coul_long.cubin
+	$(BIN2C) -c -n lj_expand_coul_long $(OBJ_DIR)/lj_expand_coul_long.cubin > $(OBJ_DIR)/lj_expand_coul_long_cubin.h
+
+$(OBJ_DIR)/lal_lj_expand_coul_long.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long.cpp $(OBJ_DIR)/lj_expand_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_expand_coul_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_expand_coul_long_ext.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_expand_coul_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_long_cs.cubin: lal_coul_long_cs.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_coul_long_cs.cu
+
+$(OBJ_DIR)/coul_long_cs_cubin.h: $(OBJ_DIR)/coul_long_cs.cubin $(OBJ_DIR)/coul_long_cs.cubin
+	$(BIN2C) -c -n coul_long_cs $(OBJ_DIR)/coul_long_cs.cubin > $(OBJ_DIR)/coul_long_cs_cubin.h
+
+$(OBJ_DIR)/lal_coul_long_cs.o: $(ALL_H) lal_coul_long_cs.h lal_coul_long_cs.cpp $(OBJ_DIR)/coul_long_cs_cubin.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_coul_long.o
+	$(CUDR) -o $@ -c lal_coul_long_cs.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_long_cs_ext.o: $(ALL_H) lal_coul_long_cs.h lal_coul_long_cs_ext.cpp lal_coul_long.h
+	$(CUDR) -o $@ -c lal_coul_long_cs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/born_coul_long_cs.cubin: lal_born_coul_long_cs.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_long_cs.cu
+
+$(OBJ_DIR)/born_coul_long_cs_cubin.h: $(OBJ_DIR)/born_coul_long_cs.cubin $(OBJ_DIR)/born_coul_long_cs.cubin
+	$(BIN2C) -c -n born_coul_long_cs $(OBJ_DIR)/born_coul_long_cs.cubin > $(OBJ_DIR)/born_coul_long_cs_cubin.h
+
+$(OBJ_DIR)/lal_born_coul_long_cs.o: $(ALL_H) lal_born_coul_long_cs.h lal_born_coul_long_cs.cpp $(OBJ_DIR)/born_coul_long_cs_cubin.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_born_coul_long.o
+	$(CUDR) -o $@ -c lal_born_coul_long_cs.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_coul_long_cs_ext.o: $(ALL_H) lal_born_coul_long_cs.h lal_born_coul_long_cs_ext.cpp lal_born_coul_long.h
+	$(CUDR) -o $@ -c lal_born_coul_long_cs_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/born_coul_wolf_cs.cubin: lal_born_coul_wolf_cs.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --cubin -DNV_KERNEL -o $@ lal_born_coul_wolf_cs.cu
+
+$(OBJ_DIR)/born_coul_wolf_cs_cubin.h: $(OBJ_DIR)/born_coul_wolf_cs.cubin $(OBJ_DIR)/born_coul_wolf_cs.cubin
+	$(BIN2C) -c -n born_coul_wolf_cs $(OBJ_DIR)/born_coul_wolf_cs.cubin > $(OBJ_DIR)/born_coul_wolf_cs_cubin.h
+
+$(OBJ_DIR)/lal_born_coul_wolf_cs.o: $(ALL_H) lal_born_coul_wolf_cs.h lal_born_coul_wolf_cs.cpp $(OBJ_DIR)/born_coul_wolf_cs_cubin.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_born_coul_wolf.o
+	$(CUDR) -o $@ -c lal_born_coul_wolf_cs.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o: $(ALL_H) lal_born_coul_wolf_cs.h lal_born_coul_wolf_cs_ext.cpp lal_born_coul_wolf.h
+	$(CUDR) -o $@ -c lal_born_coul_wolf_cs_ext.cpp -I$(OBJ_DIR)
+
 $(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
 	$(CUDR) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_CUDADR $(CUDA_LIB) -lcuda 

--- a/lib/gpu/Nvidia.makefile_multi
+++ b/lib/gpu/Nvidia.makefile_multi
@ -0,0 +1,854 @@
+CUDA  = $(NVCC) $(CUDA_INCLUDE) $(CUDA_OPTS) -Icudpp_mini $(CUDA_ARCH) \
+             $(CUDA_PRECISION)
+CUDR  = $(CUDR_CPP) $(CUDR_OPTS) $(CUDA_PRECISION) $(CUDA_INCLUDE) \
+         $(CUDPP_OPT)
+CUDA_LINK = $(CUDA_LIB) -lcudart
+BIN2C = $(CUDA_HOME)/bin/bin2c
+
+GPU_LIB = $(LIB_DIR)/libgpu.a
+
+# Headers for Geryon
+UCL_H  = $(wildcard ./geryon/ucl*.h)
+NVC_H  = $(wildcard ./geryon/nvc*.h) $(UCL_H)
+NVD_H  = $(wildcard ./geryon/nvd*.h) $(UCL_H) lal_preprocessor.h
+# Headers for Pair Stuff
+PAIR_H  = lal_atom.h lal_answer.h lal_neighbor_shared.h \
+          lal_neighbor.h lal_precision.h lal_device.h \
+          lal_balance.h lal_pppm.h
+
+ALL_H = $(NVD_H) $(PAIR_H)
+
+EXECS = $(BIN_DIR)/nvc_get_devices
+ifdef CUDPP_OPT
+CUDPP = $(OBJ_DIR)/cudpp.o $(OBJ_DIR)/cudpp_plan.o \
+        $(OBJ_DIR)/cudpp_maximal_launch.o $(OBJ_DIR)/cudpp_plan_manager.o \
+        $(OBJ_DIR)/radixsort_app.cu_o $(OBJ_DIR)/scan_app.cu_o
+endif
+OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_ans.o \
+       $(OBJ_DIR)/lal_neighbor.o $(OBJ_DIR)/lal_neighbor_shared.o \
+       $(OBJ_DIR)/lal_device.o $(OBJ_DIR)/lal_base_atomic.o \
+       $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_base_ellipsoid.o \
+       $(OBJ_DIR)/lal_base_dipole.o $(OBJ_DIR)/lal_base_three.o \
+       $(OBJ_DIR)/lal_base_dpd.o \
+       $(OBJ_DIR)/lal_pppm.o $(OBJ_DIR)/lal_pppm_ext.o \
+       $(OBJ_DIR)/lal_gayberne.o $(OBJ_DIR)/lal_gayberne_ext.o \
+       $(OBJ_DIR)/lal_re_squared.o $(OBJ_DIR)/lal_re_squared_ext.o \
+       $(OBJ_DIR)/lal_lj.o $(OBJ_DIR)/lal_lj_ext.o \
+       $(OBJ_DIR)/lal_lj96.o $(OBJ_DIR)/lal_lj96_ext.o \
+       $(OBJ_DIR)/lal_lj_expand.o $(OBJ_DIR)/lal_lj_expand_ext.o \
+       $(OBJ_DIR)/lal_lj_coul.o $(OBJ_DIR)/lal_lj_coul_ext.o \
+       $(OBJ_DIR)/lal_lj_coul_long.o $(OBJ_DIR)/lal_lj_coul_long_ext.o \
+       $(OBJ_DIR)/lal_lj_dsf.o $(OBJ_DIR)/lal_lj_dsf_ext.o \
+       $(OBJ_DIR)/lal_lj_class2_long.o $(OBJ_DIR)/lal_lj_class2_long_ext.o \
+       $(OBJ_DIR)/lal_coul_long.o $(OBJ_DIR)/lal_coul_long_ext.o \
+       $(OBJ_DIR)/lal_morse.o $(OBJ_DIR)/lal_morse_ext.o \
+       $(OBJ_DIR)/lal_charmm_long.o $(OBJ_DIR)/lal_charmm_long_ext.o \
+       $(OBJ_DIR)/lal_lj_sdk.o $(OBJ_DIR)/lal_lj_sdk_ext.o \
+       $(OBJ_DIR)/lal_lj_sdk_long.o $(OBJ_DIR)/lal_lj_sdk_long_ext.o \
+       $(OBJ_DIR)/lal_eam.o $(OBJ_DIR)/lal_eam_ext.o \
+       $(OBJ_DIR)/lal_eam_fs_ext.o $(OBJ_DIR)/lal_eam_alloy_ext.o \
+       $(OBJ_DIR)/lal_buck.o $(OBJ_DIR)/lal_buck_ext.o \
+       $(OBJ_DIR)/lal_buck_coul.o $(OBJ_DIR)/lal_buck_coul_ext.o \
+       $(OBJ_DIR)/lal_buck_coul_long.o $(OBJ_DIR)/lal_buck_coul_long_ext.o \
+       $(OBJ_DIR)/lal_table.o $(OBJ_DIR)/lal_table_ext.o \
+       $(OBJ_DIR)/lal_yukawa.o $(OBJ_DIR)/lal_yukawa_ext.o \
+       $(OBJ_DIR)/lal_born.o $(OBJ_DIR)/lal_born_ext.o \
+       $(OBJ_DIR)/lal_born_coul_wolf.o $(OBJ_DIR)/lal_born_coul_wolf_ext.o \
+       $(OBJ_DIR)/lal_born_coul_long.o $(OBJ_DIR)/lal_born_coul_long_ext.o \
+       $(OBJ_DIR)/lal_dipole_lj.o $(OBJ_DIR)/lal_dipole_lj_ext.o \
+       $(OBJ_DIR)/lal_dipole_lj_sf.o $(OBJ_DIR)/lal_dipole_lj_sf_ext.o \
+       $(OBJ_DIR)/lal_colloid.o $(OBJ_DIR)/lal_colloid_ext.o \
+       $(OBJ_DIR)/lal_gauss.o $(OBJ_DIR)/lal_gauss_ext.o \
+       $(OBJ_DIR)/lal_yukawa_colloid.o $(OBJ_DIR)/lal_yukawa_colloid_ext.o \
+       $(OBJ_DIR)/lal_lj_coul_debye.o $(OBJ_DIR)/lal_lj_coul_debye_ext.o \
+       $(OBJ_DIR)/lal_coul_dsf.o $(OBJ_DIR)/lal_coul_dsf_ext.o \
+       $(OBJ_DIR)/lal_sw.o $(OBJ_DIR)/lal_sw_ext.o \
+       $(OBJ_DIR)/lal_vashishta.o $(OBJ_DIR)/lal_vashishta_ext.o \
+       $(OBJ_DIR)/lal_beck.o $(OBJ_DIR)/lal_beck_ext.o \
+       $(OBJ_DIR)/lal_mie.o $(OBJ_DIR)/lal_mie_ext.o \
+       $(OBJ_DIR)/lal_soft.o $(OBJ_DIR)/lal_soft_ext.o \
+       $(OBJ_DIR)/lal_lj_coul_msm.o $(OBJ_DIR)/lal_lj_coul_msm_ext.o \
+       $(OBJ_DIR)/lal_lj_gromacs.o $(OBJ_DIR)/lal_lj_gromacs_ext.o \
+       $(OBJ_DIR)/lal_dpd.o $(OBJ_DIR)/lal_dpd_ext.o \
+       $(OBJ_DIR)/lal_tersoff.o $(OBJ_DIR)/lal_tersoff_ext.o \
+       $(OBJ_DIR)/lal_tersoff_zbl.o $(OBJ_DIR)/lal_tersoff_zbl_ext.o \
+       $(OBJ_DIR)/lal_tersoff_mod.o $(OBJ_DIR)/lal_tersoff_mod_ext.o \
+       $(OBJ_DIR)/lal_coul.o $(OBJ_DIR)/lal_coul_ext.o \
+       $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o \
+       $(OBJ_DIR)/lal_zbl.o $(OBJ_DIR)/lal_zbl_ext.o \
+       $(OBJ_DIR)/lal_lj_cubic.o $(OBJ_DIR)/lal_lj_cubic_ext.o \
+       $(OBJ_DIR)/lal_ufm.o $(OBJ_DIR)/lal_ufm_ext.o \
+       $(OBJ_DIR)/lal_dipole_long_lj.o $(OBJ_DIR)/lal_dipole_long_lj_ext.o \
+       $(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o
+
+CBNS = $(OBJ_DIR)/device.cubin $(OBJ_DIR)/device_cubin.h \
+       $(OBJ_DIR)/atom.cubin $(OBJ_DIR)/atom_cubin.h \
+       $(OBJ_DIR)/neighbor_cpu.cubin $(OBJ_DIR)/neighbor_cpu_cubin.h \
+       $(OBJ_DIR)/neighbor_gpu.cubin $(OBJ_DIR)/neighbor_gpu_cubin.h \
+       $(OBJ_DIR)/pppm_f.cubin $(OBJ_DIR)/pppm_f_cubin.h \
+       $(OBJ_DIR)/pppm_d.cubin $(OBJ_DIR)/pppm_d_cubin.h \
+       $(OBJ_DIR)/ellipsoid_nbor.cubin $(OBJ_DIR)/ellipsoid_nbor_cubin.h \
+       $(OBJ_DIR)/gayberne.cubin $(OBJ_DIR)/gayberne_lj.cubin \
+       $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h \
+       $(OBJ_DIR)/re_squared.cubin $(OBJ_DIR)/re_squared_lj.cubin \
+       $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h \
+       $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj_cubin.h \
+       $(OBJ_DIR)/lj96.cubin $(OBJ_DIR)/lj96_cubin.h \
+       $(OBJ_DIR)/lj_expand.cubin $(OBJ_DIR)/lj_expand_cubin.h \
+       $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul_cubin.h \
+       $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long_cubin.h \
+       $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf_cubin.h \
+       $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long_cubin.h \
+       $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long_cubin.h \
+       $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse_cubin.h \
+       $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long_cubin.h \
+       $(OBJ_DIR)/lj_sdk.cubin $(OBJ_DIR)/lj_sdk_cubin.h \
+       $(OBJ_DIR)/lj_sdk_long.cubin $(OBJ_DIR)/lj_sdk_long_cubin.h \
+       $(OBJ_DIR)/eam.cubin $(OBJ_DIR)/eam_cubin.h \
+       $(OBJ_DIR)/buck.cubin $(OBJ_DIR)/buck_cubin.h \
+       $(OBJ_DIR)/buck_coul_long.cubin $(OBJ_DIR)/buck_coul_long_cubin.h \
+       $(OBJ_DIR)/buck_coul.cubin $(OBJ_DIR)/buck_coul_cubin.h \
+       $(OBJ_DIR)/table.cubin $(OBJ_DIR)/table_cubin.h \
+       $(OBJ_DIR)/yukawa.cubin $(OBJ_DIR)/yukawa_cubin.h \
+       $(OBJ_DIR)/born.cubin $(OBJ_DIR)/born_cubin.h \
+       $(OBJ_DIR)/born_coul_wolf.cubin $(OBJ_DIR)/born_coul_wolf_cubin.h \
+       $(OBJ_DIR)/born_coul_long.cubin $(OBJ_DIR)/born_coul_long_cubin.h \
+       $(OBJ_DIR)/dipole_lj.cubin $(OBJ_DIR)/dipole_lj_cubin.h \
+       $(OBJ_DIR)/dipole_lj_sf.cubin $(OBJ_DIR)/dipole_lj_sf_cubin.h \
+       $(OBJ_DIR)/colloid.cubin $(OBJ_DIR)/colloid_cubin.h \
+       $(OBJ_DIR)/gauss.cubin $(OBJ_DIR)/gauss_cubin.h \
+       $(OBJ_DIR)/yukawa_colloid.cubin $(OBJ_DIR)/yukawa_colloid_cubin.h \
+       $(OBJ_DIR)/lj_coul_debye.cubin $(OBJ_DIR)/lj_coul_debye_cubin.h \
+       $(OBJ_DIR)/coul_dsf.cubin $(OBJ_DIR)/coul_dsf_cubin.h \
+       $(OBJ_DIR)/sw.cubin $(OBJ_DIR)/sw_cubin.h \
+       $(OBJ_DIR)/vashishta.cubin $(OBJ_DIR)/vashishta_cubin.h \
+       $(OBJ_DIR)/beck.cubin $(OBJ_DIR)/beck_cubin.h \
+       $(OBJ_DIR)/mie.cubin $(OBJ_DIR)/mie_cubin.h \
+       $(OBJ_DIR)/soft.cubin $(OBJ_DIR)/soft_cubin.h \
+       $(OBJ_DIR)/lj_coul_msm.cubin $(OBJ_DIR)/lj_coul_msm_cubin.h \
+       $(OBJ_DIR)/lj_gromacs.cubin $(OBJ_DIR)/lj_gromacs_cubin.h \
+       $(OBJ_DIR)/dpd.cubin $(OBJ_DIR)/dpd_cubin.h \
+       $(OBJ_DIR)/tersoff.cubin $(OBJ_DIR)/tersoff_cubin.h \
+       $(OBJ_DIR)/tersoff_zbl.cubin $(OBJ_DIR)/tersoff_zbl_cubin.h \
+       $(OBJ_DIR)/tersoff_mod.cubin $(OBJ_DIR)/tersoff_mod_cubin.h \
+       $(OBJ_DIR)/coul.cubin $(OBJ_DIR)/coul_cubin.h \
+       $(OBJ_DIR)/coul_debye.cubin $(OBJ_DIR)/coul_debye_cubin.h \
+       $(OBJ_DIR)/zbl.cubin $(OBJ_DIR)/zbl_cubin.h \
+       $(OBJ_DIR)/lj_cubic.cubin $(OBJ_DIR)/lj_cubic_cubin.h \
+       $(OBJ_DIR)/ufm.cubin $(OBJ_DIR)/ufm_cubin.h \
+       $(OBJ_DIR)/dipole_long_lj.cubin $(OBJ_DIR)/dipole_long_lj_cubin.h \
+       $(OBJ_DIR)/lj_expand_coul_long.cubin $(OBJ_DIR)/lj_expand_coul_long_cubin.h
+
+all: $(OBJ_DIR) $(GPU_LIB) $(EXECS)
+
+$(OBJ_DIR):
+	mkdir -p $@
+
+$(OBJ_DIR)/cudpp.o: cudpp_mini/cudpp.cpp
+	$(CUDR) -o $@ -c cudpp_mini/cudpp.cpp -Icudpp_mini
+
+$(OBJ_DIR)/cudpp_plan.o: cudpp_mini/cudpp_plan.cpp
+	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan.cpp -Icudpp_mini
+
+$(OBJ_DIR)/cudpp_maximal_launch.o: cudpp_mini/cudpp_maximal_launch.cpp
+	$(CUDR) -o $@ -c cudpp_mini/cudpp_maximal_launch.cpp -Icudpp_mini
+
+$(OBJ_DIR)/cudpp_plan_manager.o: cudpp_mini/cudpp_plan_manager.cpp
+	$(CUDR) -o $@ -c cudpp_mini/cudpp_plan_manager.cpp -Icudpp_mini
+
+$(OBJ_DIR)/radixsort_app.cu_o: cudpp_mini/radixsort_app.cu
+	$(CUDA) -o $@ -c cudpp_mini/radixsort_app.cu
+
+$(OBJ_DIR)/scan_app.cu_o: cudpp_mini/scan_app.cu
+	$(CUDA) -o $@ -c cudpp_mini/scan_app.cu
+
+$(OBJ_DIR)/atom.cubin: lal_atom.cu lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_atom.cu
+
+$(OBJ_DIR)/atom_cubin.h: $(OBJ_DIR)/atom.cubin
+	$(BIN2C) -c -n atom $(OBJ_DIR)/atom.cubin > $(OBJ_DIR)/atom_cubin.h
+
+$(OBJ_DIR)/lal_atom.o: lal_atom.cpp lal_atom.h $(NVD_H) $(OBJ_DIR)/atom_cubin.h
+	$(CUDR) -o $@ -c lal_atom.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_ans.o: lal_answer.cpp lal_answer.h $(NVD_H)
+	$(CUDR) -o $@ -c lal_answer.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/neighbor_cpu.cubin: lal_neighbor_cpu.cu lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_neighbor_cpu.cu
+
+$(OBJ_DIR)/neighbor_cpu_cubin.h: $(OBJ_DIR)/neighbor_cpu.cubin
+	$(BIN2C) -c -n neighbor_cpu $(OBJ_DIR)/neighbor_cpu.cubin > $(OBJ_DIR)/neighbor_cpu_cubin.h
+
+$(OBJ_DIR)/neighbor_gpu.cubin: lal_neighbor_gpu.cu lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_neighbor_gpu.cu
+
+$(OBJ_DIR)/neighbor_gpu_cubin.h: $(OBJ_DIR)/neighbor_gpu.cubin
+	$(BIN2C) -c -n neighbor_gpu $(OBJ_DIR)/neighbor_gpu.cubin > $(OBJ_DIR)/neighbor_gpu_cubin.h
+
+$(OBJ_DIR)/lal_neighbor_shared.o: lal_neighbor_shared.cpp lal_neighbor_shared.h $(OBJ_DIR)/neighbor_cpu_cubin.h $(OBJ_DIR)/neighbor_gpu_cubin.h $(NVD_H)
+	$(CUDR) -o $@ -c lal_neighbor_shared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_neighbor.o: lal_neighbor.cpp lal_neighbor.h lal_neighbor_shared.h $(NVD_H)
+	$(CUDR) -o $@ -c lal_neighbor.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/device.cubin: lal_device.cu lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_device.cu
+
+$(OBJ_DIR)/device_cubin.h: $(OBJ_DIR)/device.cubin
+	$(BIN2C) -c -n device $(OBJ_DIR)/device.cubin > $(OBJ_DIR)/device_cubin.h
+
+$(OBJ_DIR)/lal_device.o: lal_device.cpp lal_device.h $(ALL_H) $(OBJ_DIR)/device_cubin.h
+	$(CUDR) -o $@ -c lal_device.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_atomic.o: $(ALL_H) lal_base_atomic.h lal_base_atomic.cpp
+	$(CUDR) -o $@ -c lal_base_atomic.cpp
+
+$(OBJ_DIR)/lal_base_charge.o: $(ALL_H) lal_base_charge.h lal_base_charge.cpp
+	$(CUDR) -o $@ -c lal_base_charge.cpp
+
+$(OBJ_DIR)/lal_base_ellipsoid.o: $(ALL_H) lal_base_ellipsoid.h lal_base_ellipsoid.cpp $(OBJ_DIR)/ellipsoid_nbor_cubin.h
+	$(CUDR) -o $@ -c lal_base_ellipsoid.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_base_dipole.o: $(ALL_H) lal_base_dipole.h lal_base_dipole.cpp
+	$(CUDR) -o $@ -c lal_base_dipole.cpp
+
+$(OBJ_DIR)/lal_base_three.o: $(ALL_H) lal_base_three.h lal_base_three.cpp
+	$(CUDR) -o $@ -c lal_base_three.cpp
+
+$(OBJ_DIR)/lal_base_dpd.o: $(ALL_H) lal_base_dpd.h lal_base_dpd.cpp
+	$(CUDR) -o $@ -c lal_base_dpd.cpp
+
+$(OBJ_DIR)/pppm_f.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=float -Dgrdtyp4=float4 -o $@ lal_pppm.cu
+
+$(OBJ_DIR)/pppm_f_cubin.h: $(OBJ_DIR)/pppm_f.cubin
+	$(BIN2C) -c -n pppm_f $(OBJ_DIR)/pppm_f.cubin > $(OBJ_DIR)/pppm_f_cubin.h
+
+$(OBJ_DIR)/pppm_d.cubin: lal_pppm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -Dgrdtyp=double -Dgrdtyp4=double4 -o $@ lal_pppm.cu
+
+$(OBJ_DIR)/pppm_d_cubin.h: $(OBJ_DIR)/pppm_d.cubin
+	$(BIN2C) -c -n pppm_d $(OBJ_DIR)/pppm_d.cubin > $(OBJ_DIR)/pppm_d_cubin.h
+
+$(OBJ_DIR)/lal_pppm.o: $(ALL_H) lal_pppm.h lal_pppm.cpp $(OBJ_DIR)/pppm_f_cubin.h $(OBJ_DIR)/pppm_d_cubin.h
+	$(CUDR) -o $@ -c lal_pppm.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_pppm_ext.o: $(ALL_H) lal_pppm.h lal_pppm_ext.cpp
+	$(CUDR) -o $@ -c lal_pppm_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/ellipsoid_nbor.cubin: lal_ellipsoid_nbor.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_ellipsoid_nbor.cu
+
+$(OBJ_DIR)/ellipsoid_nbor_cubin.h: $(OBJ_DIR)/ellipsoid_nbor.cubin
+	$(BIN2C) -c -n ellipsoid_nbor $(OBJ_DIR)/ellipsoid_nbor.cubin > $(OBJ_DIR)/ellipsoid_nbor_cubin.h
+
+$(OBJ_DIR)/gayberne.cubin: lal_gayberne.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_gayberne.cu
+
+$(OBJ_DIR)/gayberne_lj.cubin: lal_gayberne_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_gayberne_lj.cu
+
+$(OBJ_DIR)/gayberne_cubin.h: $(OBJ_DIR)/gayberne.cubin
+	$(BIN2C) -c -n gayberne $(OBJ_DIR)/gayberne.cubin > $(OBJ_DIR)/gayberne_cubin.h
+
+$(OBJ_DIR)/gayberne_lj_cubin.h: $(OBJ_DIR)/gayberne_lj.cubin
+	$(BIN2C) -c -n gayberne_lj $(OBJ_DIR)/gayberne_lj.cubin > $(OBJ_DIR)/gayberne_lj_cubin.h
+
+$(OBJ_DIR)/lal_gayberne.o: $(ALL_H) lal_gayberne.h lal_gayberne.cpp $(OBJ_DIR)/gayberne_cubin.h $(OBJ_DIR)/gayberne_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
+	$(CUDR) -o $@ -c lal_gayberne.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_gayberne_ext.o: $(ALL_H) $(OBJ_DIR)/lal_gayberne.o lal_gayberne_ext.cpp
+	$(CUDR) -o $@ -c lal_gayberne_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/re_squared.cubin: lal_re_squared.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_re_squared.cu
+
+$(OBJ_DIR)/re_squared_lj.cubin: lal_re_squared_lj.cu lal_precision.h lal_ellipsoid_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_re_squared_lj.cu
+
+$(OBJ_DIR)/re_squared_cubin.h: $(OBJ_DIR)/re_squared.cubin
+	$(BIN2C) -c -n re_squared $(OBJ_DIR)/re_squared.cubin > $(OBJ_DIR)/re_squared_cubin.h
+
+$(OBJ_DIR)/re_squared_lj_cubin.h: $(OBJ_DIR)/re_squared_lj.cubin
+	$(BIN2C) -c -n re_squared_lj $(OBJ_DIR)/re_squared_lj.cubin > $(OBJ_DIR)/re_squared_lj_cubin.h
+
+$(OBJ_DIR)/lal_re_squared.o: $(ALL_H) lal_re_squared.h lal_re_squared.cpp $(OBJ_DIR)/re_squared_cubin.h $(OBJ_DIR)/re_squared_lj_cubin.h $(OBJ_DIR)/lal_base_ellipsoid.o
+	$(CUDR) -o $@ -c lal_re_squared.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_re_squared_ext.o: $(ALL_H) $(OBJ_DIR)/lal_re_squared.o lal_re_squared_ext.cpp
+	$(CUDR) -o $@ -c lal_re_squared_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj.cubin: lal_lj.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_lj.cu
+
+$(OBJ_DIR)/lj_cubin.h: $(OBJ_DIR)/lj.cubin $(OBJ_DIR)/lj.cubin
+	$(BIN2C) -c -n lj $(OBJ_DIR)/lj.cubin > $(OBJ_DIR)/lj_cubin.h
+
+$(OBJ_DIR)/lal_lj.o: $(ALL_H) lal_lj.h lal_lj.cpp $(OBJ_DIR)/lj_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_lj.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_ext.o: $(ALL_H) lal_lj.h lal_lj_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_lj_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_coul.cubin: lal_lj_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_lj_coul.cu
+
+$(OBJ_DIR)/lj_coul_cubin.h: $(OBJ_DIR)/lj_coul.cubin $(OBJ_DIR)/lj_coul.cubin
+	$(BIN2C) -c -n lj_coul $(OBJ_DIR)/lj_coul.cubin > $(OBJ_DIR)/lj_coul_cubin.h
+
+$(OBJ_DIR)/lal_lj_coul.o: $(ALL_H) lal_lj_coul.h lal_lj_coul.cpp $(OBJ_DIR)/lj_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_coul.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_coul_ext.o: $(ALL_H) lal_lj_coul.h lal_lj_coul_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_coul_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_class2_long.cubin: lal_lj_class2_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_lj_class2_long.cu
+
+$(OBJ_DIR)/lj_class2_long_cubin.h: $(OBJ_DIR)/lj_class2_long.cubin $(OBJ_DIR)/lj_class2_long.cubin
+	$(BIN2C) -c -n lj_class2_long $(OBJ_DIR)/lj_class2_long.cubin > $(OBJ_DIR)/lj_class2_long_cubin.h
+
+$(OBJ_DIR)/lal_lj_class2_long.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long.cpp $(OBJ_DIR)/lj_class2_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_class2_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_class2_long_ext.o: $(ALL_H) lal_lj_class2_long.h lal_lj_class2_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_class2_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/coul_long.cubin: lal_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_coul_long.cu
+
+$(OBJ_DIR)/coul_long_cubin.h: $(OBJ_DIR)/coul_long.cubin $(OBJ_DIR)/coul_long.cubin
+	$(BIN2C) -c -n coul_long $(OBJ_DIR)/coul_long.cubin > $(OBJ_DIR)/coul_long_cubin.h
+
+$(OBJ_DIR)/lal_coul_long.o: $(ALL_H) lal_coul_long.h lal_coul_long.cpp $(OBJ_DIR)/coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_coul_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_coul_long_ext.o: $(ALL_H) lal_coul_long.h lal_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_coul_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_coul_long.cubin: lal_lj_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_lj_coul_long.cu
+
+$(OBJ_DIR)/lj_coul_long_cubin.h: $(OBJ_DIR)/lj_coul_long.cubin $(OBJ_DIR)/lj_coul_long.cubin
+	$(BIN2C) -c -n lj_coul_long $(OBJ_DIR)/lj_coul_long.cubin > $(OBJ_DIR)/lj_coul_long_cubin.h
+
+$(OBJ_DIR)/lal_lj_coul_long.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long.cpp $(OBJ_DIR)/lj_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_coul_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_coul_long_ext.o: $(ALL_H) lal_lj_coul_long.h lal_lj_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_coul_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_dsf.cubin: lal_lj_dsf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_lj_dsf.cu
+
+$(OBJ_DIR)/lj_dsf_cubin.h: $(OBJ_DIR)/lj_dsf.cubin $(OBJ_DIR)/lj_dsf.cubin
+	$(BIN2C) -c -n lj_dsf $(OBJ_DIR)/lj_dsf.cubin > $(OBJ_DIR)/lj_dsf_cubin.h
+
+$(OBJ_DIR)/lal_lj_dsf.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf.cpp $(OBJ_DIR)/lj_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_lj_dsf.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_dsf_ext.o: $(ALL_H) lal_lj_dsf.h lal_lj_dsf_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_dsf_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/morse.cubin: lal_morse.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_morse.cu
+
+$(OBJ_DIR)/morse_cubin.h: $(OBJ_DIR)/morse.cubin $(OBJ_DIR)/morse.cubin
+	$(BIN2C) -c -n morse $(OBJ_DIR)/morse.cubin > $(OBJ_DIR)/morse_cubin.h
+
+$(OBJ_DIR)/lal_morse.o: $(ALL_H) lal_morse.h lal_morse.cpp $(OBJ_DIR)/morse_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -o $@ -c lal_morse.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_morse_ext.o: $(ALL_H) lal_morse.h lal_morse_ext.cpp lal_base_atomic.h
+	$(CUDR) -o $@ -c lal_morse_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/charmm_long.cubin: lal_charmm_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_charmm_long.cu
+
+$(OBJ_DIR)/charmm_long_cubin.h: $(OBJ_DIR)/charmm_long.cubin $(OBJ_DIR)/charmm_long.cubin
+	$(BIN2C) -c -n charmm_long $(OBJ_DIR)/charmm_long.cubin > $(OBJ_DIR)/charmm_long_cubin.h
+
+$(OBJ_DIR)/lal_charmm_long.o: $(ALL_H) lal_charmm_long.h lal_charmm_long.cpp $(OBJ_DIR)/charmm_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -o $@ -c lal_charmm_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_charmm_long_ext.o: $(ALL_H) lal_charmm_long.h lal_charmm_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_charmm_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj96.cubin: lal_lj96.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj96_cubin.h: $(OBJ_DIR)/lj96.cubin
+	$(BIN2C) -c -n lj96 $< > $@
+
+$(OBJ_DIR)/lal_lj96.o: $(ALL_H) lal_lj96.h lal_lj96.cpp $(OBJ_DIR)/lj96_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_lj96.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj96_ext.o: $(ALL_H) lal_lj96.h lal_lj96_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_lj96_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_expand.cubin: lal_lj_expand.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_expand_cubin.h: $(OBJ_DIR)/lj_expand.cubin
+	$(BIN2C) -c -n lj_expand $< > $@
+
+$(OBJ_DIR)/lal_lj_expand.o: $(ALL_H) lal_lj_expand.h lal_lj_expand.cpp $(OBJ_DIR)/lj_expand_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_lj_expand.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_expand_ext.o: $(ALL_H) lal_lj_expand.h lal_lj_expand_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_lj_expand_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_sdk.cubin: lal_lj_sdk.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_sdk_cubin.h: $(OBJ_DIR)/lj_sdk.cubin
+	$(BIN2C) -c -n lj_sdk $< > $@
+
+$(OBJ_DIR)/lal_lj_sdk.o: $(ALL_H) lal_lj_sdk.h lal_lj_sdk.cpp $(OBJ_DIR)/lj_sdk_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_lj_sdk.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_sdk_ext.o: $(ALL_H) lal_lj_sdk.h lal_lj_sdk_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_lj_sdk_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_sdk_long.cubin: lal_lj_sdk_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ lal_lj_sdk_long.cu
+
+$(OBJ_DIR)/lj_sdk_long_cubin.h: $(OBJ_DIR)/lj_sdk_long.cubin
+	$(BIN2C) -c -n lj_sdk_long $(OBJ_DIR)/lj_sdk_long.cubin > $(OBJ_DIR)/lj_sdk_long_cubin.h
+
+$(OBJ_DIR)/lal_lj_sdk_long.o: $(ALL_H) lal_lj_sdk_long.h lal_lj_sdk_long.cpp $(OBJ_DIR)/lj_sdk_long_cubin.h $(OBJ_DIR)/lal_base_charge.o # charge base object, consistent with lal_base_charge.h in the _ext rule
+	$(CUDR) -o $@ -c lal_lj_sdk_long.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_lj_sdk_long_ext.o: $(ALL_H) lal_lj_sdk_long.h lal_lj_sdk_long_ext.cpp lal_base_charge.h
+	$(CUDR) -o $@ -c lal_lj_sdk_long_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/eam.cubin: lal_eam.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/eam_cubin.h: $(OBJ_DIR)/eam.cubin
+	$(BIN2C) -c -n eam $< > $@
+
+$(OBJ_DIR)/lal_eam.o: $(ALL_H) lal_eam.h lal_eam.cpp $(OBJ_DIR)/eam_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_eam.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_eam_ext.o: $(ALL_H) lal_eam.h lal_eam_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_eam_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_eam_fs_ext.o: $(ALL_H) lal_eam.h lal_eam_fs_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_eam_fs_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_eam_alloy_ext.o: $(ALL_H) lal_eam.h lal_eam_alloy_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_eam_alloy_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/buck.cubin: lal_buck.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/buck_cubin.h: $(OBJ_DIR)/buck.cubin
+	$(BIN2C) -c -n buck $< > $@
+
+$(OBJ_DIR)/lal_buck.o: $(ALL_H) lal_buck.h lal_buck.cpp $(OBJ_DIR)/buck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_buck.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_buck_ext.o: $(ALL_H) lal_buck.h lal_buck_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_buck_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/buck_coul.cubin: lal_buck_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/buck_coul_cubin.h: $(OBJ_DIR)/buck_coul.cubin
+	$(BIN2C) -c -n buck_coul $< > $@
+
+$(OBJ_DIR)/lal_buck_coul.o: $(ALL_H) lal_buck_coul.h lal_buck_coul.cpp $(OBJ_DIR)/buck_coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_buck_coul.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_buck_coul_ext.o: $(ALL_H) lal_buck_coul.h lal_buck_coul_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_buck_coul_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/buck_coul_long.cubin: lal_buck_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/buck_coul_long_cubin.h: $(OBJ_DIR)/buck_coul_long.cubin
+	$(BIN2C) -c -n buck_coul_long $< > $@
+
+$(OBJ_DIR)/lal_buck_coul_long.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long.cpp $(OBJ_DIR)/buck_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_buck_coul_long.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_buck_coul_long_ext.o: $(ALL_H) lal_buck_coul_long.h lal_buck_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_buck_coul_long_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/table.cubin: lal_table.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/table_cubin.h: $(OBJ_DIR)/table.cubin
+	$(BIN2C) -c -n table $< > $@
+
+$(OBJ_DIR)/lal_table.o: $(ALL_H) lal_table.h lal_table.cpp $(OBJ_DIR)/table_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_table.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_table_ext.o: $(ALL_H) lal_table.h lal_table_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_table_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/yukawa.cubin: lal_yukawa.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/yukawa_cubin.h: $(OBJ_DIR)/yukawa.cubin
+	$(BIN2C) -c -n yukawa $< > $@
+
+$(OBJ_DIR)/lal_yukawa.o: $(ALL_H) lal_yukawa.h lal_yukawa.cpp $(OBJ_DIR)/yukawa_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_yukawa.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_yukawa_ext.o: $(ALL_H) lal_yukawa.h lal_yukawa_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_yukawa_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/born.cubin: lal_born.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/born_cubin.h: $(OBJ_DIR)/born.cubin
+	$(BIN2C) -c -n born $< > $@
+
+$(OBJ_DIR)/lal_born.o: $(ALL_H) lal_born.h lal_born.cpp $(OBJ_DIR)/born_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_born.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_born_ext.o: $(ALL_H) lal_born.h lal_born_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_born_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/born_coul_wolf.cubin: lal_born_coul_wolf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/born_coul_wolf_cubin.h: $(OBJ_DIR)/born_coul_wolf.cubin
+	$(BIN2C) -c -n born_coul_wolf $< > $@
+
+$(OBJ_DIR)/lal_born_coul_wolf.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf.cpp $(OBJ_DIR)/born_coul_wolf_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_born_coul_wolf.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_born_coul_wolf_ext.o: $(ALL_H) lal_born_coul_wolf.h lal_born_coul_wolf_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_born_coul_wolf_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/born_coul_long.cubin: lal_born_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/born_coul_long_cubin.h: $(OBJ_DIR)/born_coul_long.cubin
+	$(BIN2C) -c -n born_coul_long $< > $@
+
+$(OBJ_DIR)/lal_born_coul_long.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long.cpp $(OBJ_DIR)/born_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_born_coul_long.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_born_coul_long_ext.o: $(ALL_H) lal_born_coul_long.h lal_born_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_born_coul_long_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/dipole_lj.cubin: lal_dipole_lj.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/dipole_lj_cubin.h: $(OBJ_DIR)/dipole_lj.cubin
+	$(BIN2C) -c -n dipole_lj $< > $@
+
+$(OBJ_DIR)/lal_dipole_lj.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj.cpp $(OBJ_DIR)/dipole_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
+	$(CUDR) -c lal_dipole_lj.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_dipole_lj_ext.o: $(ALL_H) lal_dipole_lj.h lal_dipole_lj_ext.cpp lal_base_dipole.h
+	$(CUDR) -c lal_dipole_lj_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/dipole_lj_sf.cubin: lal_dipole_lj_sf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/dipole_lj_sf_cubin.h: $(OBJ_DIR)/dipole_lj_sf.cubin
+	$(BIN2C) -c -n dipole_lj_sf $< > $@
+
+$(OBJ_DIR)/lal_dipole_lj_sf.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf.cpp $(OBJ_DIR)/dipole_lj_sf_cubin.h $(OBJ_DIR)/lal_base_dipole.o
+	$(CUDR) -c lal_dipole_lj_sf.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_dipole_lj_sf_ext.o: $(ALL_H) lal_dipole_lj_sf.h lal_dipole_lj_sf_ext.cpp lal_base_dipole.h
+	$(CUDR) -c lal_dipole_lj_sf_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/colloid.cubin: lal_colloid.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/colloid_cubin.h: $(OBJ_DIR)/colloid.cubin
+	$(BIN2C) -c -n colloid $< > $@
+
+$(OBJ_DIR)/lal_colloid.o: $(ALL_H) lal_colloid.h lal_colloid.cpp $(OBJ_DIR)/colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_colloid.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_colloid_ext.o: $(ALL_H) lal_colloid.h lal_colloid_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_colloid_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/gauss.cubin: lal_gauss.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/gauss_cubin.h: $(OBJ_DIR)/gauss.cubin
+	$(BIN2C) -c -n gauss $< > $@
+
+$(OBJ_DIR)/lal_gauss.o: $(ALL_H) lal_gauss.h lal_gauss.cpp $(OBJ_DIR)/gauss_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_gauss.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_gauss_ext.o: $(ALL_H) lal_gauss.h lal_gauss_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_gauss_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/yukawa_colloid.cubin: lal_yukawa_colloid.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/yukawa_colloid_cubin.h: $(OBJ_DIR)/yukawa_colloid.cubin
+	$(BIN2C) -c -n yukawa_colloid $< > $@
+
+$(OBJ_DIR)/lal_yukawa_colloid.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid.cpp $(OBJ_DIR)/yukawa_colloid_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_yukawa_colloid.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_yukawa_colloid_ext.o: $(ALL_H) lal_yukawa_colloid.h lal_yukawa_colloid_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_yukawa_colloid_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_coul_debye.cubin: lal_lj_coul_debye.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_coul_debye_cubin.h: $(OBJ_DIR)/lj_coul_debye.cubin
+	$(BIN2C) -c -n lj_coul_debye $< > $@
+
+$(OBJ_DIR)/lal_lj_coul_debye.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye.cpp $(OBJ_DIR)/lj_coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_lj_coul_debye.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_coul_debye_ext.o: $(ALL_H) lal_lj_coul_debye.h lal_lj_coul_debye_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_lj_coul_debye_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/coul_dsf.cubin: lal_coul_dsf.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/coul_dsf_cubin.h: $(OBJ_DIR)/coul_dsf.cubin
+	$(BIN2C) -c -n coul_dsf $< > $@
+
+$(OBJ_DIR)/lal_coul_dsf.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf.cpp $(OBJ_DIR)/coul_dsf_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_coul_dsf.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_coul_dsf_ext.o: $(ALL_H) lal_coul_dsf.h lal_coul_dsf_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_coul_dsf_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/vashishta.cubin: lal_vashishta.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/vashishta_cubin.h: $(OBJ_DIR)/vashishta.cubin
+	$(BIN2C) -c -n vashishta $< > $@
+
+$(OBJ_DIR)/lal_vashishta.o: $(ALL_H) lal_vashishta.h lal_vashishta.cpp $(OBJ_DIR)/vashishta_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -c lal_vashishta.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_vashishta_ext.o: $(ALL_H) lal_vashishta.h lal_vashishta_ext.cpp lal_base_three.h
+	$(CUDR) -c lal_vashishta_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/sw.cubin: lal_sw.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/sw_cubin.h: $(OBJ_DIR)/sw.cubin
+	$(BIN2C) -c -n sw $< > $@
+
+$(OBJ_DIR)/lal_sw.o: $(ALL_H) lal_sw.h lal_sw.cpp $(OBJ_DIR)/sw_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -c lal_sw.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_sw_ext.o: $(ALL_H) lal_sw.h lal_sw_ext.cpp lal_base_three.h
+	$(CUDR) -c lal_sw_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/beck.cubin: lal_beck.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/beck_cubin.h: $(OBJ_DIR)/beck.cubin
+	$(BIN2C) -c -n beck $< > $@
+
+$(OBJ_DIR)/lal_beck.o: $(ALL_H) lal_beck.h lal_beck.cpp $(OBJ_DIR)/beck_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_beck.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_beck_ext.o: $(ALL_H) lal_beck.h lal_beck_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_beck_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/mie.cubin: lal_mie.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/mie_cubin.h: $(OBJ_DIR)/mie.cubin
+	$(BIN2C) -c -n mie $< > $@
+
+$(OBJ_DIR)/lal_mie.o: $(ALL_H) lal_mie.h lal_mie.cpp $(OBJ_DIR)/mie_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_mie.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_mie_ext.o: $(ALL_H) lal_mie.h lal_mie_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_mie_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/soft.cubin: lal_soft.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/soft_cubin.h: $(OBJ_DIR)/soft.cubin
+	$(BIN2C) -c -n soft $< > $@
+
+$(OBJ_DIR)/lal_soft.o: $(ALL_H) lal_soft.h lal_soft.cpp $(OBJ_DIR)/soft_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_soft.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_soft_ext.o: $(ALL_H) lal_soft.h lal_soft_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_soft_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_coul_msm.cubin: lal_lj_coul_msm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_coul_msm_cubin.h: $(OBJ_DIR)/lj_coul_msm.cubin
+	$(BIN2C) -c -n lj_coul_msm $< > $@
+
+$(OBJ_DIR)/lal_lj_coul_msm.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm.cpp $(OBJ_DIR)/lj_coul_msm_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_lj_coul_msm.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_coul_msm_ext.o: $(ALL_H) lal_lj_coul_msm.h lal_lj_coul_msm_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_lj_coul_msm_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_gromacs.cubin: lal_lj_gromacs.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_gromacs_cubin.h: $(OBJ_DIR)/lj_gromacs.cubin
+	$(BIN2C) -c -n lj_gromacs $< > $@
+
+$(OBJ_DIR)/lal_lj_gromacs.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs.cpp $(OBJ_DIR)/lj_gromacs_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_lj_gromacs.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_gromacs_ext.o: $(ALL_H) lal_lj_gromacs.h lal_lj_gromacs_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_lj_gromacs_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/dpd.cubin: lal_dpd.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/dpd_cubin.h: $(OBJ_DIR)/dpd.cubin
+	$(BIN2C) -c -n dpd $< > $@
+
+$(OBJ_DIR)/ufm.cubin: lal_ufm.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/ufm_cubin.h: $(OBJ_DIR)/ufm.cubin
+	$(BIN2C) -c -n ufm $< > $@
+
+$(OBJ_DIR)/lal_ufm.o: $(ALL_H) lal_ufm.h lal_ufm.cpp $(OBJ_DIR)/ufm_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_ufm.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_ufm_ext.o: $(ALL_H) lal_ufm.h lal_ufm_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_ufm_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_dpd.o: $(ALL_H) lal_dpd.h lal_dpd.cpp $(OBJ_DIR)/dpd_cubin.h $(OBJ_DIR)/lal_base_dpd.o
+	$(CUDR) -c lal_dpd.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_dpd_ext.o: $(ALL_H) lal_dpd.h lal_dpd_ext.cpp lal_base_dpd.h
+	$(CUDR) -c lal_dpd_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/tersoff.cubin: lal_tersoff.cu lal_precision.h lal_tersoff_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/tersoff_cubin.h: $(OBJ_DIR)/tersoff.cubin
+	$(BIN2C) -c -n tersoff $< > $@
+
+$(OBJ_DIR)/lal_tersoff.o: $(ALL_H) lal_tersoff.h lal_tersoff.cpp $(OBJ_DIR)/tersoff_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -c lal_tersoff.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_tersoff_ext.o: $(ALL_H) lal_tersoff.h lal_tersoff_ext.cpp lal_base_three.h
+	$(CUDR) -c lal_tersoff_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/tersoff_zbl.cubin: lal_tersoff_zbl.cu lal_precision.h lal_tersoff_zbl_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/tersoff_zbl_cubin.h: $(OBJ_DIR)/tersoff_zbl.cubin
+	$(BIN2C) -c -n tersoff_zbl $< > $@
+
+$(OBJ_DIR)/lal_tersoff_zbl.o: $(ALL_H) lal_tersoff_zbl.h lal_tersoff_zbl.cpp $(OBJ_DIR)/tersoff_zbl_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -c lal_tersoff_zbl.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_tersoff_zbl_ext.o: $(ALL_H) lal_tersoff_zbl.h lal_tersoff_zbl_ext.cpp lal_base_three.h
+	$(CUDR) -c lal_tersoff_zbl_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/tersoff_mod.cubin: lal_tersoff_mod.cu lal_precision.h lal_tersoff_mod_extra.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/tersoff_mod_cubin.h: $(OBJ_DIR)/tersoff_mod.cubin
+	$(BIN2C) -c -n tersoff_mod $< > $@
+
+$(OBJ_DIR)/lal_tersoff_mod.o: $(ALL_H) lal_tersoff_mod.h lal_tersoff_mod.cpp $(OBJ_DIR)/tersoff_mod_cubin.h $(OBJ_DIR)/lal_base_three.o
+	$(CUDR) -c lal_tersoff_mod.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_tersoff_mod_ext.o: $(ALL_H) lal_tersoff_mod.h lal_tersoff_mod_ext.cpp lal_base_three.h
+	$(CUDR) -c lal_tersoff_mod_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/coul.cubin: lal_coul.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/coul_cubin.h: $(OBJ_DIR)/coul.cubin
+	$(BIN2C) -c -n coul $< > $@
+
+$(OBJ_DIR)/lal_coul.o: $(ALL_H) lal_coul.h lal_coul.cpp $(OBJ_DIR)/coul_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_coul.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_coul_ext.o: $(ALL_H) lal_coul.h lal_coul_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_coul_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/coul_debye.cubin: lal_coul_debye.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/coul_debye_cubin.h: $(OBJ_DIR)/coul_debye.cubin
+	$(BIN2C) -c -n coul_debye $< > $@
+
+$(OBJ_DIR)/lal_coul_debye.o: $(ALL_H) lal_coul_debye.h lal_coul_debye.cpp $(OBJ_DIR)/coul_debye_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_coul_debye.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_coul_debye_ext.o: $(ALL_H) lal_coul_debye.h lal_coul_debye_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_coul_debye_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/zbl.cubin: lal_zbl.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/zbl_cubin.h: $(OBJ_DIR)/zbl.cubin
+	$(BIN2C) -c -n zbl $< > $@
+
+$(OBJ_DIR)/lal_zbl.o: $(ALL_H) lal_zbl.h lal_zbl.cpp $(OBJ_DIR)/zbl_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_zbl.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_zbl_ext.o: $(ALL_H) lal_zbl.h lal_zbl_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_zbl_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_cubic.cubin: lal_lj_cubic.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_cubic_cubin.h: $(OBJ_DIR)/lj_cubic.cubin
+	$(BIN2C) -c -n lj_cubic $< > $@
+
+$(OBJ_DIR)/lal_lj_cubic.o: $(ALL_H) lal_lj_cubic.h lal_lj_cubic.cpp $(OBJ_DIR)/lj_cubic_cubin.h $(OBJ_DIR)/lal_base_atomic.o
+	$(CUDR) -c lal_lj_cubic.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_cubic_ext.o: $(ALL_H) lal_lj_cubic.h lal_lj_cubic_ext.cpp lal_base_atomic.h
+	$(CUDR) -c lal_lj_cubic_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/dipole_long_lj.cubin: lal_dipole_long_lj.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/dipole_long_lj_cubin.h: $(OBJ_DIR)/dipole_long_lj.cubin
+	$(BIN2C) -c -n dipole_long_lj $< > $@
+
+$(OBJ_DIR)/lal_dipole_long_lj.o: $(ALL_H) lal_dipole_long_lj.h lal_dipole_long_lj.cpp $(OBJ_DIR)/dipole_long_lj_cubin.h $(OBJ_DIR)/lal_base_dipole.o
+	$(CUDR) -c lal_dipole_long_lj.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_dipole_long_lj_ext.o: $(ALL_H) lal_dipole_long_lj.h lal_dipole_long_lj_ext.cpp lal_base_dipole.h
+	$(CUDR) -c lal_dipole_long_lj_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lj_expand_coul_long.cubin: lal_lj_expand_coul_long.cu lal_precision.h lal_preprocessor.h
+	$(CUDA) --fatbin -DNV_KERNEL -o $@ $<
+
+$(OBJ_DIR)/lj_expand_coul_long_cubin.h: $(OBJ_DIR)/lj_expand_coul_long.cubin
+	$(BIN2C) -c -n lj_expand_coul_long $< > $@
+
+$(OBJ_DIR)/lal_lj_expand_coul_long.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long.cpp $(OBJ_DIR)/lj_expand_coul_long_cubin.h $(OBJ_DIR)/lal_base_charge.o
+	$(CUDR) -c lal_lj_expand_coul_long.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_expand_coul_long_ext.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long_ext.cpp lal_base_charge.h
+	$(CUDR) -c lal_lj_expand_coul_long_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(BIN_DIR)/nvc_get_devices: ./geryon/ucl_get_devices.cpp $(NVD_H)
+	$(CUDR) -o $@ $< -DUCL_CUDADR $(CUDA_LIB) -lcuda
+
+$(GPU_LIB): $(OBJS) $(CUDPP)
+	$(AR) -crusv $@ $(OBJS) $(CUDPP)
+	@cp $(EXTRAMAKE) Makefile.lammps
+
+clean: # delete the files named by EXECS, GPU_LIB, OBJS, CUDPP and CBNS plus *.linkinfo; the leading '-' ignores rm errors
+	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CUDPP) $(CBNS) *.linkinfo
+
+veryclean: clean # run clean first, then also delete *~ files
+	-rm -rf *~ *.linkinfo
+
+cleanlib: # same deletions as clean, except the $(CUDPP) files are kept
+	-rm -f $(EXECS) $(GPU_LIB) $(OBJS) $(CBNS) *.linkinfo
--- a/lib/gpu/Opencl.makefile
+++ b/lib/gpu/Opencl.makefile
@ -66,7 +66,12 @@ OBJS = $(OBJ_DIR)/lal_atom.o $(OBJ_DIR)/lal_answer.o \
       $(OBJ_DIR)/lal_coul_debye.o $(OBJ_DIR)/lal_coul_debye_ext.o \
       $(OBJ_DIR)/lal_zbl.o $(OBJ_DIR)/lal_zbl_ext.o \
       $(OBJ_DIR)/lal_lj_cubic.o $(OBJ_DIR)/lal_lj_cubic_ext.o \
-       $(OBJ_DIR)/lal_ufm.o $(OBJ_DIR)/lal_ufm_ext.o
+       $(OBJ_DIR)/lal_ufm.o $(OBJ_DIR)/lal_ufm_ext.o \
+       $(OBJ_DIR)/lal_dipole_long_lj.o $(OBJ_DIR)/lal_dipole_long_lj_ext.o \
+       $(OBJ_DIR)/lal_lj_expand_coul_long.o $(OBJ_DIR)/lal_lj_expand_coul_long_ext.o \
+       $(OBJ_DIR)/lal_coul_long_cs.o $(OBJ_DIR)/lal_coul_long_cs_ext.o \
+       $(OBJ_DIR)/lal_born_coul_long_cs.o $(OBJ_DIR)/lal_born_coul_long_cs_ext.o \
+       $(OBJ_DIR)/lal_born_coul_wolf_cs.o $(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o

 KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/neighbor_cpu_cl.h $(OBJ_DIR)/pppm_cl.h \
@ -95,7 +100,9 @@ KERS = $(OBJ_DIR)/device_cl.h $(OBJ_DIR)/atom_cl.h \
       $(OBJ_DIR)/tersoff_mod_cl.h $(OBJ_DIR)/coul_cl.h \
       $(OBJ_DIR)/coul_debye_cl.h $(OBJ_DIR)/zbl_cl.h \
       $(OBJ_DIR)/lj_cubic_cl.h $(OBJ_DIR)/vashishta_cl.h \
-       $(OBJ_DIR)/ufm_cl.h
+       $(OBJ_DIR)/ufm_cl.h  $(OBJ_DIR)/dipole_long_lj_cl.h \
+       $(OBJ_DIR)/lj_expand_coul_long_cl.h $(OBJ_DIR)/coul_long_cs_cl.h \
+       $(OBJ_DIR)/born_coul_long_cs_cl.h $(OBJ_DIR)/born_coul_wolf_cs_cl.h


 OCL_EXECS = $(BIN_DIR)/ocl_get_devices
@ -588,7 +595,52 @@ $(OBJ_DIR)/lal_ufm.o: $(ALL_H) lal_ufm.h lal_ufm.cpp  $(OBJ_DIR)/ufm_cl.h $(OBJ_
 $(OBJ_DIR)/lal_ufm_ext.o: $(ALL_H) lal_ufm.h lal_ufm_ext.cpp lal_base_atomic.h
 	$(OCL) -o $@ -c lal_ufm_ext.cpp -I$(OBJ_DIR)

-$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp
+$(OBJ_DIR)/dipole_long_lj_cl.h: lal_dipole_long_lj.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh dipole_long_lj $(PRE1_H) lal_dipole_long_lj.cu $(OBJ_DIR)/dipole_long_lj_cl.h;
+
+$(OBJ_DIR)/lal_dipole_long_lj.o: $(ALL_H) lal_dipole_long_lj.h lal_dipole_long_lj.cpp $(OBJ_DIR)/dipole_long_lj_cl.h $(OBJ_DIR)/lal_base_dipole.o # dipole base object, consistent with lal_base_dipole.h in the _ext rule
+	$(OCL) -o $@ -c lal_dipole_long_lj.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lal_dipole_long_lj_ext.o: $(ALL_H) lal_dipole_long_lj.h lal_dipole_long_lj_ext.cpp lal_base_dipole.h
+	$(OCL) -o $@ -c lal_dipole_long_lj_ext.cpp -I$(OBJ_DIR)
+
+$(OBJ_DIR)/lj_expand_coul_long_cl.h: lal_lj_expand_coul_long.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh lj_expand_coul_long $(PRE1_H) $< $@;
+
+$(OBJ_DIR)/lal_lj_expand_coul_long.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long.cpp $(OBJ_DIR)/lj_expand_coul_long_cl.h $(OBJ_DIR)/lal_base_charge.o
+	$(OCL) -c lal_lj_expand_coul_long.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_lj_expand_coul_long_ext.o: $(ALL_H) lal_lj_expand_coul_long.h lal_lj_expand_coul_long_ext.cpp lal_base_charge.h
+	$(OCL) -c lal_lj_expand_coul_long_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/coul_long_cs_cl.h: lal_coul_long_cs.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh coul_long_cs $(PRE1_H) $< $@;
+
+$(OBJ_DIR)/lal_coul_long_cs.o: $(ALL_H) lal_coul_long_cs.h lal_coul_long_cs.cpp $(OBJ_DIR)/coul_long_cs_cl.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_coul_long.o
+	$(OCL) -c lal_coul_long_cs.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_coul_long_cs_ext.o: $(ALL_H) lal_coul_long_cs.h lal_coul_long_cs_ext.cpp lal_coul_long.h
+	$(OCL) -c lal_coul_long_cs_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/born_coul_long_cs_cl.h: lal_born_coul_long_cs.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh born_coul_long_cs $(PRE1_H) $< $@;
+
+$(OBJ_DIR)/lal_born_coul_long_cs.o: $(ALL_H) lal_born_coul_long_cs.h lal_born_coul_long_cs.cpp $(OBJ_DIR)/born_coul_long_cs_cl.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_born_coul_long.o
+	$(OCL) -c lal_born_coul_long_cs.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_born_coul_long_cs_ext.o: $(ALL_H) lal_born_coul_long_cs.h lal_born_coul_long_cs_ext.cpp lal_born_coul_long.h
+	$(OCL) -c lal_born_coul_long_cs_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/born_coul_wolf_cs_cl.h: lal_born_coul_wolf_cs.cu $(PRE1_H)
+	$(BSH) ./geryon/file_to_cstr.sh born_coul_wolf_cs $(PRE1_H) $< $@;
+
+$(OBJ_DIR)/lal_born_coul_wolf_cs.o: $(ALL_H) lal_born_coul_wolf_cs.h lal_born_coul_wolf_cs.cpp $(OBJ_DIR)/born_coul_wolf_cs_cl.h $(OBJ_DIR)/lal_base_charge.o $(OBJ_DIR)/lal_born_coul_wolf.o
+	$(OCL) -c lal_born_coul_wolf_cs.cpp -I$(OBJ_DIR) -o $@
+
+$(OBJ_DIR)/lal_born_coul_wolf_cs_ext.o: $(ALL_H) lal_born_coul_wolf_cs.h lal_born_coul_wolf_cs_ext.cpp lal_born_coul_wolf.h
+	$(OCL) -c lal_born_coul_wolf_cs_ext.cpp -I$(OBJ_DIR) -o $@
+
+$(BIN_DIR)/ocl_get_devices: ./geryon/ucl_get_devices.cpp $(OCL_H) # also rebuilt when any file in $(OCL_H) changes (OCL_H is defined outside this hunk)
 	$(OCL) -o $@ ./geryon/ucl_get_devices.cpp -DUCL_OPENCL $(OCL_LINK) 

 $(OCL_LIB): $(OBJS) $(PTXS)
--- a/lib/gpu/geryon/ocl_device.h
+++ b/lib/gpu/geryon/ocl_device.h
@ -280,6 +280,9 @@ class UCL_Device {
  /// Return the OpenCL type for the device
  inline cl_device_id & cl_device() { return _cl_device; }

+  /// Select the platform that has accelerators
+  inline void set_platform_accelerator(int pid=-1);
+
 private:
  int _num_platforms;          // Number of platforms
  int _platform;               // UCL_Device ID for current platform
@ -311,8 +314,8 @@ UCL_Device::UCL_Device() {
    return;
  } else
    _num_platforms=static_cast<int>(nplatforms);
-
-  set_platform(0);
+  // note that platform 0 may not necessarily be associated with accelerators
+  set_platform_accelerator();
 }

 UCL_Device::~UCL_Device() {
@ -320,6 +323,7 @@ UCL_Device::~UCL_Device() {
 }

 void UCL_Device::clear() {
+  _properties.clear();
  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
@ -529,75 +533,105 @@ int UCL_Device::set(int num) {
  return create_context();
 }

-// List all devices along with all properties
+// List all devices from all platforms along with all properties (calls set_platform(n) for every platform and does not restore the previously selected one)
 void UCL_Device::print_all(std::ostream &out) {
-  if (num_devices() == 0)
-    out << "There is no device supporting OpenCL\n";
-  for (int i=0; i<num_devices(); ++i) {
-    out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
-    out << "  Type of device:                                "
-        << device_type_name(i).c_str() << std::endl;
-    out << "  Double precision support:                      ";
-    if (double_precision(i))
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Total amount of global memory:                 "
-        << gigabytes(i) << " GB\n";
-    out << "  Number of compute units/multiprocessors:       "
-        << _properties[i].compute_units << std::endl;
-    //out << "  Number of cores:                               "
-    //    << cores(i) << std::endl;
-    out << "  Total amount of constant memory:               "
-        << _properties[i].const_mem << " bytes\n";
-    out << "  Total amount of local/shared memory per block: "
-        << _properties[i].shared_mem << " bytes\n";
-    //out << "  Total number of registers available per block: "
-    //    << _properties[i].regsPerBlock << std::endl;
-    //out << "  Warp size:                                     "
-    //    << _properties[i].warpSize << std::endl;
-    out << "  Maximum group size (# of threads per block)    "
-        << _properties[i].work_group_size << std::endl;
-    out << "  Maximum item sizes (# threads for each dim)    "
-        << _properties[i].work_item_size[0] << " x "
-        << _properties[i].work_item_size[1] << " x "
-        << _properties[i].work_item_size[2] << std::endl;
-    //out << "  Maximum sizes of each dimension of a grid:     "
-    //    << _properties[i].maxGridSize[0] << " x "
-    //    << _properties[i].maxGridSize[1] << " x "
-    //    << _properties[i].maxGridSize[2] << std::endl;
-    //out << "  Maximum memory pitch:                          "
-    //    << _properties[i].memPitch) << " bytes\n";
-    //out << "  Texture alignment:                             "
-    //    << _properties[i].textureAlignment << " bytes\n";
-    out << "  Clock rate:                                    "
-        << clock_rate(i) << " GHz\n";
-    //out << "  Concurrent copy and execution:                 ";
-    out << "  ECC support:                                   ";
-    if (_properties[i].ecc_support)
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Device fission into equal partitions:          ";
-    if (fission_equal(i))
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Device fission by counts:                      ";
-    if (fission_by_counts(i))
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Device fission by affinity:                    ";
-    if (fission_by_affinity(i))
-      out << "Yes\n";
-    else
-      out << "No\n";
-    out << "  Maximum subdevices from fission:               "
-        << max_sub_devices(i) << std::endl;
+  // --- loop through the platforms
+  for (int n=0; n<_num_platforms; n++) {
+
+    set_platform(n);
+
+    out << "\nPlatform " << n << ":\n";
+
+    if (num_devices() == 0)
+      out << "There is no device supporting OpenCL\n";
+    for (int i=0; i<num_devices(); ++i) {
+      out << "\nDevice " << i << ": \"" << name(i).c_str() << "\"\n";
+      out << "  Type of device:                                "
+          << device_type_name(i).c_str() << std::endl;
+      out << "  Double precision support:                      ";
+      if (double_precision(i))
+        out << "Yes\n";
+      else
+        out << "No\n";
+      out << "  Total amount of global memory:                 "
+          << gigabytes(i) << " GB\n";
+      out << "  Number of compute units/multiprocessors:       "
+          << _properties[i].compute_units << std::endl;
+      //out << "  Number of cores:                               "
+      //    << cores(i) << std::endl;
+      out << "  Total amount of constant memory:               "
+          << _properties[i].const_mem << " bytes\n";
+      out << "  Total amount of local/shared memory per block: "
+          << _properties[i].shared_mem << " bytes\n";
+      //out << "  Total number of registers available per block: "
+      //    << _properties[i].regsPerBlock << std::endl;
+      //out << "  Warp size:                                     "
+      //    << _properties[i].warpSize << std::endl;
+      out << "  Maximum group size (# of threads per block)    "
+          << _properties[i].work_group_size << std::endl;
+      out << "  Maximum item sizes (# threads for each dim)    "
+          << _properties[i].work_item_size[0] << " x "
+          << _properties[i].work_item_size[1] << " x "
+          << _properties[i].work_item_size[2] << std::endl;
+      //out << "  Maximum sizes of each dimension of a grid:     "
+      //    << _properties[i].maxGridSize[0] << " x "
+      //    << _properties[i].maxGridSize[1] << " x "
+      //    << _properties[i].maxGridSize[2] << std::endl;
+      //out << "  Maximum memory pitch:                          "
+      //    << _properties[i].memPitch) << " bytes\n";
+      //out << "  Texture alignment:                             "
+      //    << _properties[i].textureAlignment << " bytes\n";
+      out << "  Clock rate:                                    "
+          << clock_rate(i) << " GHz\n";
+      //out << "  Concurrent copy and execution:                 ";
+      out << "  ECC support:                                   ";
+      if (_properties[i].ecc_support)
+        out << "Yes\n";
+      else
+        out << "No\n";
+      out << "  Device fission into equal partitions:          ";
+      if (fission_equal(i))
+        out << "Yes\n";
+      else
+        out << "No\n";
+      out << "  Device fission by counts:                      ";
+      if (fission_by_counts(i))
+        out << "Yes\n";
+      else
+        out << "No\n";
+      out << "  Device fission by affinity:                    ";
+      if (fission_by_affinity(i))
+        out << "Yes\n";
+      else
+        out << "No\n";
+      out << "  Maximum subdevices from fission:               "
+          << max_sub_devices(i) << std::endl;
+    }
  }
 }

+// Select platform pid; if pid < 0, pick the first platform that has a CPU, GPU
+// or accelerator device (the OpenCL device type is a bitfield: test bits, not ==)
+void UCL_Device::set_platform_accelerator(int pid) {
+  if (pid < 0) {
+    int found = 0;
+    for (int n=0; n<_num_platforms; n++) {
+      set_platform(n);
+      for (int i=0; i<num_devices(); i++) {
+        if (_properties[i].device_type & (CL_DEVICE_TYPE_CPU |
+                                          CL_DEVICE_TYPE_GPU |
+                                          CL_DEVICE_TYPE_ACCELERATOR)) {
+          found = 1;
+          break;
+        }
+      }
+      if (found) break; // if no platform matches, the last one tried stays set
+    }
+  } else {
+    set_platform(pid);
+  }
 }

+} // namespace ucl_opencl 
+
 #endif
--- a/lib/gpu/lal_born_coul_long.cpp
+++ b/lib/gpu/lal_born_coul_long.cpp
@ -57,7 +57,7 @@ int BornCoulLongT::init(const int ntypes, double **host_cutsq, double **host_rho
                       const double g_ewald) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,born_coul_long,"k_born_long");
+                            _screen,born_coul_long,"k_born_coul_long");
  if (success!=0)
    return success;

--- a/lib/gpu/lal_born_coul_long.cu
+++ b/lib/gpu/lal_born_coul_long.cu
@ -29,7 +29,7 @@ texture<int2> q_tex;
 #define q_tex q_
 #endif

-__kernel void k_born_long(const __global numtyp4 *restrict x_,
+__kernel void k_born_coul_long(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff1,
                          const __global numtyp4 *restrict coeff2,
                          const int lj_types,
@ -110,7 +110,7 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else forcecoul = (numtyp)0.0;

-        if (rsq < cutsq_sigma[mtype].y) {
+        if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
@ -127,7 +127,7 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
        if (eflag>0) {
          if (rsq < cut_coulsq)
            e_coul += prefactor*(_erfc-factor_coul);
-          if (rsq < coeff1[mtype].w) {
+          if (rsq < cutsq_sigma[mtype].y) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
              + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
@ -149,7 +149,7 @@ __kernel void k_born_long(const __global numtyp4 *restrict x_,
  } // if ii
 }

-__kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
+__kernel void k_born_coul_long_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp4 *restrict coeff1_in,
                               const __global numtyp4 *restrict coeff2_in,
                               const __global numtyp *restrict sp_lj_in,
@ -232,7 +232,7 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
        } else forcecoul = (numtyp)0.0;

-        if (rsq < cutsq_sigma[mtype].y) {
+        if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
          numtyp r = ucl_sqrt(rsq);
          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
          r6inv = r2inv*r2inv*r2inv;
@ -249,7 +249,7 @@ __kernel void k_born_long_fast(const __global numtyp4 *restrict x_,
        if (eflag>0) {
          if (rsq < cut_coulsq)
            e_coul += prefactor*(_erfc-factor_coul);
-          if (rsq < coeff1[mtype].w) {
+          if (rsq < cutsq_sigma[mtype].y) {
            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
              + coeff2[mtype].z*r2inv*r6inv;
            energy+=factor_lj*(e-coeff2[mtype].w);
--- a/lib/gpu/lal_born_coul_long.h
+++ b/lib/gpu/lal_born_coul_long.h
@ -78,7 +78,7 @@ class BornCoulLong : public BaseCharge<numtyp, acctyp> {

  numtyp _cut_coulsq, _qqrd2e, _g_ewald;

- private:
+ protected:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
 };
--- a/lib/gpu/lal_born_coul_long_cs.cpp
+++ b/lib/gpu/lal_born_coul_long_cs.cpp
@ -0,0 +1,95 @@
+/***************************************************************************
+                            born_coul_long_cs.cpp
+                             -------------------
+                           Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the born/coul/long/cs pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "born_coul_long_cs_cl.h"
+#elif defined(USE_CUDART)
+const char *born_coul_long_cs=0;
+#else
+#include "born_coul_long_cs_cubin.h"
+#endif
+
+#include "lal_born_coul_long_cs.h"
+#include <cassert>
+using namespace LAMMPS_AL;
+#define BornCoulLongCST BornCoulLongCS<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+int BornCoulLongCST::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
+                       double **host_born1, double **host_born2, double **host_born3,
+                       double **host_a, double **host_c, double **host_d,
+                       double **host_sigma, double **host_offset,
+                       double *host_special_lj, const int nlocal,
+                       const int nall, const int max_nbors,
+                       const int maxspecial, const double cell_size,
+                       const double gpu_split, FILE *_screen,
+                       double **host_cut_ljsq, const double host_cut_coulsq,
+                       double *host_special_coul, const double qqrd2e,
+                       const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,born_coul_long_cs,"k_born_coul_long_cs");
+  if (success!=0)
+    return success; // non-zero init_atomic result is returned unchanged
+
+  // Use the shared-memory (fast) kernel layout when ntypes fits in max_shared_types and
+  // the block size is at least max_shared_types; lj_types is then padded to max_shared_types
+  int lj_types=ntypes;
+  this->shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    this->shared_types=true;
+  }
+  this->_lj_types=lj_types;
+
+  // Host staging buffer passed to type_pack4 and reused for the special factors below;
+  // only its first lj_types*lj_types entries are zeroed here
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  this->coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,this->coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3); // coeff1 <- rhoinv, born1, born2, born3
+
+  this->coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,this->coeff2,host_write,host_a,host_c,
+                         host_d,host_offset); // coeff2 <- a, c, d, offset
+
+  this->cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,this->cutsq_sigma,host_write,host_cutsq,
+             host_cut_ljsq,host_sigma); // cutsq_sigma <- cutsq, cut_ljsq, sigma
+
+  this->sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i]; // entries 0-3: special_lj factors
+    host_write[i+4]=host_special_coul[i]; // entries 4-7: special_coul factors
+  }
+  ucl_copy(this->sp_lj,host_write,8,false);
+
+  this->_cut_coulsq=host_cut_coulsq;
+  this->_qqrd2e=qqrd2e;
+  this->_g_ewald=g_ewald;
+
+  this->_allocated=true;
+  this->_max_bytes=this->coeff1.row_bytes()+this->coeff2.row_bytes()
+      +this->cutsq_sigma.row_bytes()+this->sp_lj.row_bytes();
+  return 0;
+}
+
+template class BornCoulLongCS<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_born_coul_long_cs.cu
+++ b/lib/gpu/lal_born_coul_long_cs.cu
@ -0,0 +1,325 @@
+// **************************************************************************
+//                            born_coul_long_cs.cu
+//                             -------------------
+//                           Trung Dac Nguyen (Northwestern)
+//
+//  Device code for acceleration of the born/coul/long/cs pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                : June 2018
+//    email                : ndactrung@gmail.com
+// ***************************************************************************/
+
+#ifdef NV_KERNEL
+
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+// Note: EWALD_P is different from that in lal_preprocessor.h, hence the separate CS_EWALD_P;
+//       B0-B5 are the polynomial coefficients used in the _erfc expressions below; acctyp is needed for these parameters
+#define CS_EWALD_P (acctyp)9.95473818e-1
+#define B0        (acctyp)-0.1335096380159268
+#define B1        (acctyp)-2.57839507e-1
+#define B2        (acctyp)-1.37203639e-1
+#define B3        (acctyp)-8.88822059e-3
+#define B4        (acctyp)-5.80844129e-3
+#define B5        (acctyp)1.14652755e-1
+
+#define EPSILON (acctyp)(1.0e-20)
+#define EPS_EWALD (acctyp)(1.0e-6)
+#define EPS_EWALD_SQR (acctyp)(1.0e-12)
+
+__kernel void k_born_coul_long_cs(const __global numtyp4 *restrict x_,
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2,
+                          const int lj_types,
+                          const __global numtyp *restrict sp_lj_in,
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed,
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch,
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict cutsq_sigma,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp sp_lj[8]; // [0..3] read as factor_lj, [4..7] read as factor_coul below
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); // x_[i]; the .w component carries the atom type
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int itype=ix.w;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); // x_[j]
+      int jtype=jx.w;
+
+      // Separation vector between atoms i and j and its squared length
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<cutsq_sigma[mtype].x) { // .x component is compared as cutsq
+        numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
+
+        rsq += EPSILON; // guards the r = 0 case; such a pair must be excluded via special bonds
+        numtyp r2inv = ucl_recip(rsq);
+
+        if (rsq < cut_coulsq) {
+          numtyp r = ucl_sqrt(rsq);
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp;
+          if (factor_coul<(numtyp)1.0) {
+            numtyp grij = g_ewald * (r+EPS_EWALD);
+            numtyp expm2 = ucl_exp(-grij*grij);
+            acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+            numtyp u = (numtyp)1.0 - t;
+            _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+            prefactor /= (r+EPS_EWALD);
+            forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+            // r2inv is recomputed with the EPS_EWALD_SQR shift so that the final
+            // scaling of the total force (force = ... * r2inv below) stays consistent
+            r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
+          } else {
+            numtyp grij = g_ewald * r;
+            numtyp expm2 = ucl_exp(-grij*grij);
+            acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+            numtyp u = (numtyp)1.0 - t;
+            _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+            prefactor /= r;
+            forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2);
+          }
+        } else forcecoul = (numtyp)0.0;
+
+        if (rsq < cutsq_sigma[mtype].y) { // .y component is compared as cut_ljsq
+          numtyp r = ucl_sqrt(rsq);
+          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
+          r6inv = r2inv*r2inv*r2inv;
+          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+            + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
+        } else forceborn = (numtyp)0.0;
+
+        force = (forcecoul + forceborn) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq) {
+            numtyp e = prefactor*_erfc;
+            if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
+          if (rsq < cutsq_sigma[mtype].y) {
+            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+              + coeff2[mtype].z*r2inv*r6inv;
+            energy+=factor_lj*(e-coeff2[mtype].w);
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                    vflag,ans,engv);
+  } // if ii
+}
+
+__kernel void k_born_coul_long_cs_fast(const __global numtyp4 *restrict x_,
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in,
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor,
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch,
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict cutsq_sigma,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[8]; // [0..3] read as factor_lj, [4..7] read as factor_coul below
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    coeff1[tid]=coeff1_in[tid];
+    if (eflag>0) // coeff2 is only read in the energy section below
+      coeff2[tid]=coeff2_in[tid];
+  }
+
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  __syncthreads(); // barrier after the local copies of sp_lj/coeff1/coeff2 are written
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); // x_[i]; the .w component carries the atom type
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int iw=ix.w;
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw); // row offset: tables use a MAX_SHARED_TYPES stride here
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); // x_[j]
+      int mtype=itype+jx.w;
+
+      // Separation vector between atoms i and j and its squared length
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutsq_sigma[mtype].x) { // .x component is compared as cutsq
+        numtyp forcecoul,forceborn,force,r6inv,prefactor,_erfc,rexp;
+
+        rsq += EPSILON; // guards the r = 0 case; such a pair must be excluded via special bonds
+        numtyp r2inv = ucl_recip(rsq);
+
+        if (rsq < cut_coulsq) {
+          numtyp r = ucl_sqrt(rsq);
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp;
+          if (factor_coul<(numtyp)1.0) {
+            numtyp grij = g_ewald * (r+EPS_EWALD);
+            numtyp expm2 = ucl_exp(-grij*grij);
+            acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+            numtyp u = (numtyp)1.0 - t;
+            _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+            prefactor /= (r+EPS_EWALD);
+            forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+            // r2inv is recomputed with the EPS_EWALD_SQR shift so that the final
+            // scaling of the total force (force = ... * r2inv below) stays consistent
+            r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
+          } else {
+            numtyp grij = g_ewald * r;
+            numtyp expm2 = ucl_exp(-grij*grij);
+            acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+            numtyp u = (numtyp)1.0 - t;
+            _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+            prefactor /= r;
+            forcecoul = prefactor*(_erfc + EWALD_F*grij*expm2);
+          }
+        } else forcecoul = (numtyp)0.0;
+
+        if (rsq < cutsq_sigma[mtype].y) { // .y component is compared as cut_ljsq
+          numtyp r = ucl_sqrt(rsq);
+          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
+          r6inv = r2inv*r2inv*r2inv;
+          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+            + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
+        } else forceborn = (numtyp)0.0;
+
+        force = (forcecoul + forceborn) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq) {
+            numtyp e = prefactor*_erfc;
+            if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
+          if (rsq < cutsq_sigma[mtype].y) {
+            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+              + coeff2[mtype].z*r2inv*r6inv;
+            energy+=factor_lj*(e-coeff2[mtype].w);
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                    vflag,ans,engv);
+  } // if ii
+}
+
--- a/lib/gpu/lal_born_coul_long_cs.h
+++ b/lib/gpu/lal_born_coul_long_cs.h
@ -0,0 +1,53 @@
+/***************************************************************************
+                             born_coul_long_cs.h
+                             -------------------
+                         Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the born/coul/long/cs pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifndef LAL_BORN_COUL_LONG_CS_H
+#define LAL_BORN_COUL_LONG_CS_H
+
+#include "lal_born_coul_long.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class BornCoulLongCS : public BornCoulLong<numtyp, acctyp> {
+ public:
+  BornCoulLongCS() {}
+  ~BornCoulLongCS() {}
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
+           double **host_born1, double **host_born2, double **host_born3,
+           double **host_a, double **host_c, double **host_d,
+           double **host_sigma, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_born_coul_long_cs_ext.cpp
+++ b/lib/gpu/lal_born_coul_long_cs_ext.cpp
@ -0,0 +1,132 @@
+/***************************************************************************
+                           born_coul_long_cs_ext.cpp
+                             -------------------
+                           Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to born/coul/long/cs acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_born_coul_long_cs.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static BornCoulLongCS<PRECISION,ACC_PRECISION> BCLCSMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device: world rank 0 initializes first, then the other ranks take turns by gpu_rank
+// ---------------------------------------------------------------------------
+int bornclcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
+                    double **host_born1, double **host_born2, double **host_born3,
+                    double **host_a, double **host_c, double **host_d,
+                    double **sigma, double **offset, double *special_lj,
+                    const int inum, const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size, int &gpu_mode,
+                    FILE *screen, double **host_cut_ljsq, double host_cut_coulsq,
+                    double *host_special_coul, const double qqrd2e,
+                    const double g_ewald) {
+  BCLCSMF.clear();
+  gpu_mode=BCLCSMF.device->gpu_mode();
+  double gpu_split=BCLCSMF.device->particle_split();
+  int first_gpu=BCLCSMF.device->first_device();
+  int last_gpu=BCLCSMF.device->last_device();
+  int world_me=BCLCSMF.device->world_me();
+  int gpu_rank=BCLCSMF.device->gpu_rank();
+  int procs_per_gpu=BCLCSMF.device->procs_per_gpu();
+
+  BCLCSMF.device->init_message(screen,"born/coul/long/cs",first_gpu,last_gpu);
+
+  bool message=false;
+  if (BCLCSMF.device->replica_me()==0 && screen) // only replica 0 with a valid screen prints progress
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
+                          host_born3, host_a, host_c, host_d, sigma, offset,
+                          special_lj, inum, nall, 300, maxspecial, cell_size, // NOTE(review): max_nbors is not forwarded; 300 is passed instead
+                          gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
+                          host_special_coul, qqrd2e, g_ewald);
+
+  BCLCSMF.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0) // world rank 0 already initialized above
+      init_ok=BCLCSMF.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
+                            host_born3, host_a, host_c, host_d, sigma, offset,
+                            special_lj, inum, nall, 300, maxspecial, cell_size, // NOTE(review): same hard-coded 300 as above
+                            gpu_split, screen, host_cut_ljsq, host_cut_coulsq,
+                            host_special_coul, qqrd2e, g_ewald);
+
+    BCLCSMF.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    BCLCSMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void bornclcs_gpu_clear() {
+  BCLCSMF.clear(); // forwards to the file-static BornCoulLongCS instance
+}
+
+int** bornclcs_gpu_compute_n(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           double *sublo, double *subhi, tagint *tag, int **nspecial,
+                           tagint **special, const bool eflag, const bool vflag,
+                           const bool eatom, const bool vatom, int &host_start,
+                           int **ilist, int **jnum,  const double cpu_time,
+                           bool &success, double *host_q, double *boxlo,
+                           double *prd) {
+  return BCLCSMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                          subhi, tag, nspecial, special, eflag, vflag, eatom,
+                          vatom, host_start, ilist, jnum, cpu_time, success,
+                          host_q, boxlo, prd); // all arguments are forwarded unchanged; the result of compute() is returned
+}
+
+void bornclcs_gpu_compute(const int ago, const int inum_full, const int nall,
+                        double **host_x, int *host_type, int *ilist, int *numj,
+                        int **firstneigh, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success, double *host_q,
+                        const int nlocal, double *boxlo, double *prd) {
+  BCLCSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                   firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                   host_q,nlocal,boxlo,prd); // all arguments are forwarded unchanged to the static instance
+}
+
+double bornclcs_gpu_bytes() {
+  return BCLCSMF.host_memory_usage(); // value reported by the file-static BornCoulLongCS instance
+}
+
+
--- a/lib/gpu/lal_born_coul_wolf.cpp
+++ b/lib/gpu/lal_born_coul_wolf.cpp
@ -57,7 +57,7 @@ int BornCoulWolfT::init(const int ntypes, double **host_cutsq, double **host_rho
                        const double alf, const double e_shift, const double f_shift) {
  int success;
  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
-                            _screen,born_coul_wolf,"k_born_wolf");
+                            _screen,born_coul_wolf,"k_born_coul_wolf");
  if (success!=0)
    return success;

--- a/lib/gpu/lal_born_coul_wolf.cu
+++ b/lib/gpu/lal_born_coul_wolf.cu
@ -31,7 +31,7 @@ texture<int2> q_tex;

 #define MY_PIS (acctyp)1.77245385090551602729

-__kernel void k_born_wolf(const __global numtyp4 *restrict x_,
+__kernel void k_born_coul_wolf(const __global numtyp4 *restrict x_,
                          const __global numtyp4 *restrict coeff1,
                          const __global numtyp4 *restrict coeff2,
                          const int lj_types,
@ -165,7 +165,7 @@ __kernel void k_born_wolf(const __global numtyp4 *restrict x_,
  } // if ii
 }

-__kernel void k_born_wolf_fast(const __global numtyp4 *restrict x_,
+__kernel void k_born_coul_wolf_fast(const __global numtyp4 *restrict x_,
                               const __global numtyp4 *restrict coeff1_in,
                               const __global numtyp4 *restrict coeff2_in,
                               const __global numtyp *restrict sp_lj_in,
--- a/lib/gpu/lal_born_coul_wolf.h
+++ b/lib/gpu/lal_born_coul_wolf.h
@ -13,8 +13,8 @@
    email                : nguyentd@ornl.gov
 ***************************************************************************/

-#ifndef LAL_BORN_COUL_LONG_H
-#define LAL_BORN_COUL_LONG_H
+#ifndef LAL_BORN_COUL_WOLF_H
+#define LAL_BORN_COUL_WOLF_H

 #include "lal_base_charge.h"

@ -79,7 +79,7 @@ class BornCoulWolf : public BaseCharge<numtyp, acctyp> {

  numtyp _cut_coulsq,_qqrd2e,_alf,_e_shift,_f_shift;

- private:
+ protected:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
 };
--- a/lib/gpu/lal_born_coul_wolf_cs.cpp
+++ b/lib/gpu/lal_born_coul_wolf_cs.cpp
@ -0,0 +1,97 @@
+/***************************************************************************
+                            born_coul_wolf_cs.cpp
+                             -------------------
+                        Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the born/coul/wolf/cs pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "born_coul_wolf_cs_cl.h"
+#elif defined(USE_CUDART)
+const char *born_coul_wolf_cs=0;
+#else
+#include "born_coul_wolf_cs_cubin.h"
+#endif
+
+#include "lal_born_coul_wolf_cs.h"
+#include <cassert>
+using namespace LAMMPS_AL;
+#define BornCoulWolfCST BornCoulWolfCS<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+// Set up the born/coul/wolf/cs accelerator: compile/load the kernels through
+// init_atomic() and upload the per-type coefficient tables and scalars.
+//
+//   ntypes            number of atom types used to size the type tables
+//   host_cutsq,
+//   host_cut_ljsq,
+//   host_sigma        packed together into cutsq_sigma (.x, .y, .z)
+//   host_rhoinv,
+//   host_born1..3     packed together into coeff1 (.x, .y, .z, .w)
+//   host_a, host_c,
+//   host_d,
+//   host_offset       packed together into coeff2 (.x, .y, .z, .w)
+//   host_special_lj   4 values copied to sp_lj[0..3]
+//   host_special_coul 4 values copied to sp_lj[4..7]
+//   nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, _screen
+//                     forwarded unchanged to init_atomic()
+//   host_cut_coulsq, qqrd2e, alf, e_shift, f_shift
+//                     stored in the corresponding underscore-prefixed members
+//
+// Returns the non-zero code from init_atomic() if that call fails,
+// otherwise 0.
+template <class numtyp, class acctyp>
+int BornCoulWolfCST::init(const int ntypes, double **host_cutsq, double **host_rhoinv,
+                        double **host_born1, double **host_born2, double **host_born3,
+                        double **host_a, double **host_c, double **host_d,
+                        double **host_sigma, double **host_offset,
+                        double *host_special_lj, const int nlocal,
+                        const int nall, const int max_nbors,
+                        const int maxspecial, const double cell_size,
+                        const double gpu_split, FILE *_screen,
+                        double **host_cut_ljsq, const double host_cut_coulsq,
+                        double *host_special_coul, const double qqrd2e,
+                        const double alf, const double e_shift, const double f_shift) {
+  int success;
+  // Kernel source born_coul_wolf_cs and kernel name "k_born_coul_wolf_cs"
+  // select the device code defined in lal_born_coul_wolf_cs.cu
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,
+                            _screen,born_coul_wolf_cs,"k_born_coul_wolf_cs");
+  if (success!=0)
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  // (in that case the table dimension is padded up to max_shared_types)
+  int lj_types=ntypes;
+  this->shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    this->shared_types=true;
+  }
+  this->_lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  // Only the first lj_types*lj_types entries of the buffer are zeroed here
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  // host_write is reused as the staging buffer for each of the packs below
+  this->coeff1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,this->coeff1,host_write,host_rhoinv,
+                         host_born1,host_born2,host_born3);
+
+  this->coeff2.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,this->coeff2,host_write,host_a,host_c,
+                                     host_d,host_offset);
+
+  // Three-array pack: cutsq, cut_ljsq and sigma share one table
+  this->cutsq_sigma.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,this->cutsq_sigma,host_write,host_cutsq,
+                         host_cut_ljsq,host_sigma);
+
+  // sp_lj holds the 4 special_lj factors followed by the 4 special_coul factors
+  this->sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];
+    host_write[i+4]=host_special_coul[i];
+  }
+  ucl_copy(this->sp_lj,host_write,8,false);
+
+  this->_cut_coulsq=host_cut_coulsq;
+  this->_qqrd2e=qqrd2e;
+  this->_alf=alf;
+  this->_e_shift=e_shift;
+  this->_f_shift=f_shift;
+
+  this->_allocated=true;
+  // Sum of the row_bytes() of the four device tables allocated above
+  this->_max_bytes=this->coeff1.row_bytes()+this->coeff2.row_bytes()
+      +this->cutsq_sigma.row_bytes()+this->sp_lj.row_bytes();
+  return 0;
+}
+
+// Explicit instantiation for the precision pair selected at build time
+template class BornCoulWolfCS<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_born_coul_wolf_cs.cu
+++ b/lib/gpu/lal_born_coul_wolf_cs.cu
@ -0,0 +1,306 @@
+// **************************************************************************
+//                            born_coul_wolf_cs.cu
+//                             -------------------
+//                         Trung Dac Nguyen (Northwestern)
+//
+//  Device code for acceleration of the born/coul/wolf/cs pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : ndactrung@gmail.com
+// ***************************************************************************/
+
+#ifdef NV_KERNEL
+
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+#define EPSILON (acctyp)(1.0e-20)
+#define MY_PIS (acctyp)1.77245385090551602729
+
+// Device kernel for born/coul/wolf/cs that reads the per-type tables
+// directly from the pointers passed in (index = itype*lj_types+jtype).
+//
+//   x_           positions; the .w component is read as the atom type
+//   coeff1       per type pair: .x multiplies the exponent, .y/.z/.w weight
+//                the r*exp, r^-6 and r^-8 terms of the Born force
+//   coeff2       per type pair: .x/.y/.z weight the exp, r^-6 and r^-8 terms
+//                of the Born energy, .w is subtracted from that energy
+//   lj_types     row length of the per-type tables
+//   sp_lj_in     8 factors: [0..3] scale the Born term, [4..7] the Coulomb term
+//   dev_nbor, dev_packed, nbor_pitch
+//                neighbor data handed to nbor_info(); dev_packed[nbor] is
+//                read as the (masked) neighbor index
+//   ans, engv    output arrays written by store_answers_q()
+//   eflag, vflag values >0 enable energy / virial accumulation
+//   inum         threads whose ii is >= inum do no work
+//   q_           charges (accessed through q_tex)
+//   cutsq_sigma  per type pair: .x overall cutoff^2, .y Born cutoff^2,
+//                .z value from which r is subtracted in the exponent
+//   cut_coulsq   Coulomb cutoff^2
+//   qqrd2e, alf, e_shift, f_shift
+//                scalars of the Wolf-summation Coulomb term
+//   t_per_atom   passed to atom_info()/nbor_info()/store_answers_q(); also
+//                divides the self-energy term
+__kernel void k_born_coul_wolf_cs(const __global numtyp4 *restrict x_,
+                          const __global numtyp4 *restrict coeff1,
+                          const __global numtyp4 *restrict coeff2,
+                          const int lj_types,
+                          const __global numtyp *restrict sp_lj_in,
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed,
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch,
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict cutsq_sigma,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp alf, const numtyp e_shift,
+                          const numtyp f_shift, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  // Every thread writes all eight special factors into the __local copy
+  __local numtyp sp_lj[8];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  // Per-thread accumulators (f.w is not used)
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int itype=ix.w;
+
+    // Self-energy contribution of atom i, added once per thread.
+    // NOTE(review): the division by t_per_atom presumably compensates for
+    // the t_per_atom threads of one atom being summed in store_answers_q()
+    // -- confirm against that helper.
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      // sbmask(j) picks the special factor; NEIGHMASK then reduces j to the
+      // plain neighbor index
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<cutsq_sigma[mtype].x) { // cutsq
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
+        acctyp r2inv = ucl_recip(rsq);
+
+        numtyp forcecoul,forceborn,force,prefactor,rexp;
+        acctyp v_sh,r6inv;
+
+        // Born part: rexp and r6inv are reused by the energy branch below,
+        // which is guarded by the same cutoff test
+        if (rsq < cutsq_sigma[mtype].y) { // cut_ljsq
+          numtyp r = ucl_sqrt(rsq);
+          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
+          r6inv = r2inv*r2inv*r2inv;
+          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+            + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
+        } else forceborn = (numtyp)0.0;
+
+        // Coulomb part: prefactor = qqrd2e*q_i*q_j/r; v_sh and prefactor are
+        // reused by the energy branch below under the same cutoff test
+        if (rsq < cut_coulsq) {
+          // r obtained as 1/sqrt(r2inv), i.e. sqrt of the shifted rsq
+          numtyp r = ucl_rsqrt(r2inv);
+          acctyp arij = alf * r;
+          acctyp erfcd = ucl_exp(-arij*arij);
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
+
+          const acctyp erfcc = erfc(arij);
+          v_sh = (erfcc - e_shift*r)*prefactor;
+          acctyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = prefactor * dvdrr*rsq;
+          // For special pairs subtract the excluded fraction of prefactor
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
+        } else forcecoul = (numtyp)0.0;
+
+        // Common r2inv factor so that force multiplies del{x,y,z} directly
+        force = (forceborn + forcecoul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq) {
+            acctyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
+          if (rsq < cutsq_sigma[mtype].y) {
+            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+              + coeff2[mtype].z*r2inv*r6inv;
+            energy+=factor_lj*(e-coeff2[mtype].w);
+          }
+        }
+        // Six virial components in the order xx, yy, zz, xy, xz, yz
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                    vflag,ans,engv);
+  } // if ii
+}
+
+// Variant of the born/coul/wolf/cs kernel that first stages coeff1 (and,
+// when eflag>0, coeff2) into __local arrays of
+// MAX_SHARED_TYPES*MAX_SHARED_TYPES entries and indexes them with
+// MAX_SHARED_TYPES*itype+jtype. cutsq_sigma is still read through the
+// pointer argument. There is no lj_types parameter; otherwise the arguments
+// have the same roles as in k_born_coul_wolf_cs:
+//
+//   x_           positions; the .w component is read as the atom type
+//   coeff1_in    source of the staged Born force coefficients
+//   coeff2_in    source of the staged Born energy coefficients
+//   sp_lj_in     8 factors: [0..3] scale the Born term, [4..7] the Coulomb term
+//   dev_nbor, dev_packed, nbor_pitch
+//                neighbor data handed to nbor_info()
+//   ans, engv    output arrays written by store_answers_q()
+//   eflag, vflag values >0 enable energy / virial accumulation
+//   inum         threads whose ii is >= inum skip the pair loop
+//   q_           charges (accessed through q_tex)
+//   cutsq_sigma  per type pair: .x overall cutoff^2, .y Born cutoff^2,
+//                .z value from which r is subtracted in the exponent
+//   cut_coulsq, qqrd2e, alf, e_shift, f_shift
+//                scalars of the Wolf-summation Coulomb term
+//   t_per_atom   passed to atom_info()/nbor_info()/store_answers_q(); also
+//                divides the self-energy term
+__kernel void k_born_coul_wolf_cs_fast(const __global numtyp4 *restrict x_,
+                               const __global numtyp4 *restrict coeff1_in,
+                               const __global numtyp4 *restrict coeff2_in,
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor,
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch,
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict cutsq_sigma,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp alf, const numtyp e_shift,
+                               const numtyp f_shift, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  // Cooperative load: thread tid copies entry tid of each table; coeff2 is
+  // only needed (and only loaded) when energies are requested
+  __local numtyp4 coeff1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 coeff2[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[8];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    coeff1[tid]=coeff1_in[tid];
+    if (eflag>0)
+      coeff2[tid]=coeff2_in[tid];
+  }
+
+  // Per-thread accumulators (f.w is not used)
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  // Barrier placed between the staged loads above and their first use below
+  __syncthreads();
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int iw=ix.w;
+    // Row offset of type i in the MAX_SHARED_TYPES-wide tables
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+
+    // Self-energy contribution of atom i, added once per thread.
+    // NOTE(review): the division by t_per_atom presumably compensates for
+    // the t_per_atom threads of one atom being summed in store_answers_q()
+    // -- confirm against that helper.
+    if (eflag>0) {
+      acctyp e_self = -((acctyp)0.5*e_shift + alf/MY_PIS) *
+        qtmp*qtmp*qqrd2e/(acctyp)t_per_atom;
+      e_coul += (acctyp)2.0*e_self;
+    }
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      // sbmask(j) picks the special factor; NEIGHMASK then reduces j to the
+      // plain neighbor index
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      acctyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutsq_sigma[mtype].x) {
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
+        acctyp r2inv = ucl_recip(rsq);
+
+        numtyp forcecoul,forceborn,force,prefactor,rexp;
+        acctyp v_sh,r6inv;
+
+        // Born part: rexp and r6inv are reused by the energy branch below,
+        // which is guarded by the same cutoff test
+        if (rsq < cutsq_sigma[mtype].y) {
+          r6inv = r2inv*r2inv*r2inv;
+          numtyp r = ucl_sqrt(rsq);
+          rexp = ucl_exp((cutsq_sigma[mtype].z-r)*coeff1[mtype].x);
+          forceborn = (coeff1[mtype].y*r*rexp - coeff1[mtype].z*r6inv
+            + coeff1[mtype].w*r2inv*r6inv)*factor_lj;
+        } else forceborn = (numtyp)0.0;
+
+       // Coulomb part: prefactor = qqrd2e*q_i*q_j/r; v_sh and prefactor are
+       // reused by the energy branch below under the same cutoff test
+       if (rsq < cut_coulsq) {
+          numtyp r = ucl_sqrt(rsq);
+          acctyp arij = alf * r;
+          acctyp erfcd = ucl_exp(-arij*arij);
+          fetch(prefactor,j,q_tex);
+          prefactor *= qqrd2e * qtmp/r;
+
+          const acctyp erfcc = erfc(arij);
+          v_sh = (erfcc - e_shift*r)*prefactor;
+          acctyp dvdrr = (erfcc/rsq + (numtyp)2.0*alf/MY_PIS * erfcd/r) + f_shift;
+          forcecoul = prefactor * dvdrr*rsq;
+          // For special pairs subtract the excluded fraction of prefactor
+          if (factor_coul < (numtyp)1.0) forcecoul -= ((numtyp)1.0-factor_coul)*prefactor;
+        } else forcecoul = (numtyp)0.0;
+
+        // Common r2inv factor so that force multiplies del{x,y,z} directly
+        force = (forceborn + forcecoul) * r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq) {
+            acctyp e=v_sh;
+            if (factor_coul < (numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+            e_coul += e;
+          }
+          if (rsq < cutsq_sigma[mtype].y) {
+            numtyp e=coeff2[mtype].x*rexp - coeff2[mtype].y*r6inv
+              + coeff2[mtype].z*r2inv*r6inv;
+            energy+=factor_lj*(e-coeff2[mtype].w);
+          }
+        }
+        // Six virial components in the order xx, yy, zz, xy, xz, yz
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                    vflag,ans,engv);
+  } // if ii
+}
+
--- a/lib/gpu/lal_born_coul_wolf_cs.h
+++ b/lib/gpu/lal_born_coul_wolf_cs.h
@ -0,0 +1,54 @@
+/***************************************************************************
+                              born_coul_wolf_cs.h
+                             -------------------
+                           Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the born/coul/wolf/cs pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifndef LAL_BORN_COUL_WOLF_CS_H
+#define LAL_BORN_COUL_WOLF_CS_H
+
+#include "lal_born_coul_wolf.h"
+
+namespace LAMMPS_AL {
+
+/// Accelerator class for the born/coul/wolf/cs pair style.
+/** Derives from BornCoulWolf and declares only a constructor, a destructor
+  * and its own init(); every other member comes from the base class. **/
+template <class numtyp, class acctyp>
+class BornCoulWolfCS : public BornCoulWolf<numtyp, acctyp> {
+ public:
+  BornCoulWolfCS() {}
+  ~BornCoulWolfCS() {}
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param ntypes number of atom types used to size the coefficient tables
+    * \param host_cutsq, host_cut_ljsq, host_sigma per-type-pair values
+    *        packed into one device table
+    * \param host_rhoinv, host_born1, host_born2, host_born3 per-type-pair
+    *        values packed into one device table
+    * \param host_a, host_c, host_d, host_offset per-type-pair values packed
+    *        into one device table
+    * \param host_special_lj 4 special factors for the Born term
+    * \param host_special_coul 4 special factors for the Coulomb term
+    * \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param host_cut_coulsq, qqrd2e, alf, e_shift, f_shift scalars stored
+    *        for use by the Coulomb part of the kernel
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_rhoinv,
+           double **host_born1, double **host_born2, double **host_born3,
+           double **host_a, double **host_c, double **host_d,
+           double **host_sigma, double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double alf, const double e_shift,
+           const double f_shift);
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_born_coul_wolf_cs_ext.cpp
+++ b/lib/gpu/lal_born_coul_wolf_cs_ext.cpp
@ -0,0 +1,134 @@
+/***************************************************************************
+                           born_coul_wolf_cs_ext.cpp
+                             -------------------
+                           Trung Dac Nguyen (Northwestern)
+
+  Functions for LAMMPS access to born/coul/wolf/cs acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_born_coul_wolf_cs.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static BornCoulWolfCS<PRECISION,ACC_PRECISION> BornCWCST;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+// Initialize the born/coul/wolf/cs accelerator object BornCWCST.
+//
+// Sequence:
+//   1. clear any previous state and query the device for the GPU mode,
+//      particle split and process layout;
+//   2. world rank 0 runs init() first (the progress message describes this
+//      as "compiling on process 0"), followed by a world barrier;
+//   3. the remaining processes run init() one gpu_rank at a time, with a
+//      gpu barrier after each step;
+//   4. if init() reported success, estimate_gpu_overhead() is called.
+//
+// Parameters:
+//   ntypes .. special_lj, host_cut_ljsq .. f_shift
+//               forwarded unchanged to BornCWCST.init()
+//   inum, nall  forwarded as the nlocal/nall arguments of init()
+//   max_nbors   initial number of rows in the neighbor matrix; forwarded to
+//               init() (previously ignored in favor of a hard-coded 300)
+//   maxspecial, cell_size
+//               forwarded unchanged to init()
+//   gpu_mode    output: set from device->gpu_mode()
+//   screen      if non-null and replica_me()==0, progress messages are
+//               printed to it
+//
+// Returns the status from BornCWCST.init() on this process (0 on success);
+// it stays 0 on a process that never reaches an init() call.
+int borncwcs_gpu_init(const int ntypes, double **cutsq, double **host_rhoinv,
+                    double **host_born1, double **host_born2, double **host_born3,
+                    double **host_a, double **host_c, double **host_d,
+                    double **sigma, double **offset, double *special_lj, const int inum,
+                    const int nall, const int max_nbors, const int maxspecial,
+                    const double cell_size, int &gpu_mode, FILE *screen,
+                    double **host_cut_ljsq, double host_cut_coulsq,
+                    double *host_special_coul, const double qqrd2e,
+                    const double alf, const double e_shift, const double f_shift) {
+  BornCWCST.clear();
+  gpu_mode=BornCWCST.device->gpu_mode();
+  double gpu_split=BornCWCST.device->particle_split();
+  int first_gpu=BornCWCST.device->first_device();
+  int last_gpu=BornCWCST.device->last_device();
+  int world_me=BornCWCST.device->world_me();
+  int gpu_rank=BornCWCST.device->gpu_rank();
+  int procs_per_gpu=BornCWCST.device->procs_per_gpu();
+
+  BornCWCST.device->init_message(screen,"born/coul/wolf/cs",first_gpu,last_gpu);
+
+  // Only one replica with a valid screen prints progress messages
+  bool message=false;
+  if (BornCWCST.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  // World rank 0 initializes first; everyone else waits at the barrier
+  int init_ok=0;
+  if (world_me==0)
+    init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
+                          host_born3, host_a, host_c, host_d, sigma,
+                          offset, special_lj, inum, nall, max_nbors,
+                          maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                          host_cut_coulsq, host_special_coul, qqrd2e,
+                          alf, e_shift, f_shift);
+
+  BornCWCST.device->world_barrier();
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  // Remaining processes initialize in turn, one gpu_rank per iteration
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    // world rank 0 already initialized above, so it is skipped here
+    if (gpu_rank==i && world_me!=0)
+      init_ok=BornCWCST.init(ntypes, cutsq, host_rhoinv, host_born1, host_born2,
+                            host_born3, host_a, host_c, host_d, sigma,
+                            offset, special_lj, inum, nall, max_nbors,
+                            maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                            host_cut_coulsq, host_special_coul, qqrd2e,
+                            alf, e_shift, f_shift);
+
+    BornCWCST.device->gpu_barrier();
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)
+    BornCWCST.estimate_gpu_overhead();
+  return init_ok;
+}
+
+// Library entry point that clears the BornCWCST accelerator object by
+// delegating to its clear() member. Takes no arguments, returns nothing.
+void borncwcs_gpu_clear() {
+  BornCWCST.clear();
+}
+
+// Thin entry point: forwards every argument unchanged to the
+// BornCWCST.compute() overload that also receives sublo/subhi, tag, nspecial
+// and special, and returns the int** that call produces.
+// host_start and success are references and are handed through as-is, so
+// whatever is written to them comes from BornCWCST.compute().
+// NOTE(review): presumably this is the variant used when the neighbor list
+// is built by the library rather than passed in -- confirm against the
+// compute() overloads of the base class.
+int** borncwcs_gpu_compute_n(const int ago, const int inum_full,
+                           const int nall, double **host_x, int *host_type,
+                           double *sublo, double *subhi, tagint *tag, int **nspecial,
+                           tagint **special, const bool eflag, const bool vflag,
+                           const bool eatom, const bool vatom, int &host_start,
+                           int **ilist, int **jnum,  const double cpu_time,
+                           bool &success, double *host_q, double *boxlo,
+                           double *prd) {
+  return BornCWCST.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                          subhi, tag, nspecial, special, eflag, vflag, eatom,
+                          vatom, host_start, ilist, jnum, cpu_time, success,
+                          host_q, boxlo, prd);
+}
+
+// Thin entry point: forwards every argument unchanged to the
+// BornCWCST.compute() overload that takes an existing neighbor list
+// (ilist, numj, firstneigh). Nothing is returned to the caller; results
+// are reported only through the reference arguments host_start and success
+// and whatever state BornCWCST.compute() updates.
+void borncwcs_gpu_compute(const int ago, const int inum_full, const int nall,
+                        double **host_x, int *host_type, int *ilist, int *numj,
+                        int **firstneigh, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        const double cpu_time, bool &success, double *host_q,
+                        const int nlocal, double *boxlo, double *prd) {
+  BornCWCST.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+                   firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                   host_q,nlocal,boxlo,prd);
+}
+
+// Returns the value reported by BornCWCST.host_memory_usage().
+double borncwcs_gpu_bytes() {
+  return BornCWCST.host_memory_usage();
+}
+
+
--- a/lib/gpu/lal_coul_long.h
+++ b/lib/gpu/lal_coul_long.h
@ -72,7 +72,7 @@ class CoulLong : public BaseCharge<numtyp, acctyp> {

  numtyp _cut_coulsq, _qqrd2e, _g_ewald;

- private:
+ protected:
  bool _allocated;
  void loop(const bool _eflag, const bool _vflag);
 };
--- a/lib/gpu/lal_coul_long_cs.cpp
+++ b/lib/gpu/lal_coul_long_cs.cpp
@ -0,0 +1,78 @@
+/***************************************************************************
+                              coul_long_cs.cpp
+                             -------------------
+                           Trung Nguyen (Northwestern)
+
+  Class for acceleration of the coul/long/cs pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : June 2018
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "coul_long_cs_cl.h"
+#elif defined(USE_CUDART)
+const char *coul_long_cs=0;
+#else
+#include "coul_long_cs_cubin.h"
+#endif
+
+#include "lal_coul_long_cs.h"
+#include <cassert>
+using namespace LAMMPS_AL;
+#define CoulLongCST CoulLongCS<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> pair_gpu_device;
+
+// Set up the coul/long/cs accelerator: compile/load the kernels through
+// init_atomic() and upload the per-type scale table and the scalars.
+//
+//   ntypes            number of atom types used to size the scale table
+//   host_scale        per-type-pair values packed into this->scale
+//   nlocal, nall, max_nbors, maxspecial, cell_size, gpu_split, _screen
+//                     forwarded unchanged to init_atomic()
+//   host_cut_coulsq, qqrd2e, g_ewald
+//                     stored in the corresponding underscore-prefixed members
+//   host_special_coul 4 values copied to sp_cl[0..3]
+//
+// Returns the non-zero code from init_atomic() if that call fails,
+// otherwise 0.
+template <class numtyp, class acctyp>
+int CoulLongCST::init(const int ntypes, double **host_scale,
+                    const int nlocal, const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size,
+                    const double gpu_split, FILE *_screen,
+                    const double host_cut_coulsq, double *host_special_coul,
+                    const double qqrd2e, const double g_ewald) {
+  int success;
+  // Kernel source coul_long_cs and kernel name "k_coul_long_cs" select the
+  // device code for this style
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,
+                            gpu_split,_screen,coul_long_cs,"k_coul_long_cs");
+  if (success!=0)
+    return success;
+
+  // If the type count fits within max_shared_types (and the block size is
+  // large enough) pad the table dimension and flag shared_types
+  int lj_types=ntypes;
+  this->shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;
+    this->shared_types=true;
+  }
+  this->_lj_types=lj_types;
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  // Only the first lj_types*lj_types entries of the buffer are zeroed here
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  this->scale.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,lj_types,this->scale,host_write,host_scale);
+
+  // host_write is reused to stage the 4 special Coulomb factors
+  this->sp_cl.alloc(4,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_coul[i];
+  }
+  ucl_copy(this->sp_cl,host_write,4,false);
+
+  this->_cut_coulsq=host_cut_coulsq;
+  this->_qqrd2e=qqrd2e;
+  this->_g_ewald=g_ewald;
+
+  this->_allocated=true;
+  // Sum of the row_bytes() of the two device tables allocated above
+  this->_max_bytes=this->scale.row_bytes()+this->sp_cl.row_bytes();
+  return 0;
+}
+
+// Explicit instantiation for the precision pair selected at build time
+template class CoulLongCS<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_coul_long_cs.cu
+++ b/lib/gpu/lal_coul_long_cs.cu
@ -0,0 +1,367 @@
+// **************************************************************************
+//                               coul_long_cs.cu
+//                             -------------------
+//                           Trung Nguyen (Northwestern)
+//
+//  Device code for acceleration of the coul/long/cs pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                : June 2018
+//    email                : ndactrung@gmail.com
+// ***************************************************************************/
+
+#ifdef NV_KERNEL
+
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+// Note: EWALD_P is different from that in lal_preprocessor.h
+//       acctyp is needed for these parameters
+#define CS_EWALD_P (acctyp)9.95473818e-1
+#define B0        (acctyp)-0.1335096380159268
+#define B1        (acctyp)-2.57839507e-1
+#define B2        (acctyp)-1.37203639e-1
+#define B3        (acctyp)-8.88822059e-3
+#define B4        (acctyp)-5.80844129e-3
+#define B5        (acctyp)1.14652755e-1
+
+#define EPSILON (acctyp)(1.0e-20)
+#define EPS_EWALD (acctyp)(1.0e-6)
+#define EPS_EWALD_SQR (acctyp)(1.0e-12)
+
+#if (ARCH < 300)
+
+// store_answers_lq (ARCH < 300 variant)
+// Combines the partial force (f.x,f.y,f.z), Coulomb energy (e_coul) and, when
+// vflag>0, the six virial terms of the t_per_atom threads that share atom ii,
+// using a halving-stride reduction through the __local array red_acc.
+// The thread with offset==0 then writes the result:
+//   - eflag>0: engv[ii] is set to 0 and engv[ii+inum] to 0.5*e_coul
+//   - vflag>0: the next six inum-strided engv slots get 0.5*virial[i]
+//   - ans[ii] receives f
+// NOTE(review): there is no barrier between reduction steps; this presumably
+// relies on the cooperating threads running in lockstep - confirm.
+// Do not place comments inside the macro body: every line is continued
+// with a trailing backslash.
+#define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    __local acctyp red_acc[6][BLOCK_PAIR];                                  \
+                                                                            \
+    red_acc[0][tid]=f.x;                                                    \
+    red_acc[1][tid]=f.y;                                                    \
+    red_acc[2][tid]=f.z;                                                    \
+    red_acc[3][tid]=e_coul;                                                 \
+                                                                            \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      if (offset < s) {                                                     \
+        for (int r=0; r<4; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+                                                                            \
+    f.x=red_acc[0][tid];                                                    \
+    f.y=red_acc[1][tid];                                                    \
+    f.z=red_acc[2][tid];                                                    \
+    e_coul=red_acc[3][tid];                                                 \
+                                                                            \
+    if (vflag>0) {                                                          \
+      for (int r=0; r<6; r++)                                               \
+        red_acc[r][tid]=virial[r];                                          \
+                                                                            \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+        if (offset < s) {                                                   \
+          for (int r=0; r<6; r++)                                           \
+            red_acc[r][tid] += red_acc[r][tid+s];                           \
+        }                                                                   \
+      }                                                                     \
+                                                                            \
+      for (int r=0; r<6; r++)                                               \
+        virial[r]=red_acc[r][tid];                                          \
+    }                                                                       \
+  }                                                                         \
+                                                                            \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=(acctyp)0;                                                       \
+      ap1+=inum;                                                            \
+      *ap1=e_coul*(acctyp)0.5;                                              \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i]*(acctyp)0.5;                                         \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#else
+
+// store_answers_lq (ARCH >= 300 variant)
+// Combines the partial force (f.x,f.y,f.z), Coulomb energy (e_coul) and, when
+// vflag>0, the six virial terms of the t_per_atom threads that share atom ii
+// by exchanging values through shfl_xor with strides t_per_atom/2 ... 1.
+// The thread with offset==0 then writes the result:
+//   - eflag>0: engv[ii] is set to 0 and engv[ii+inum] to 0.5*e_coul
+//   - vflag>0: the next six inum-strided engv slots get 0.5*virial[i]
+//   - ans[ii] receives f
+// Do not place comments inside the macro body: every line is continued
+// with a trailing backslash.
+#define store_answers_lq(f, e_coul, virial, ii, inum, tid,                  \
+                         t_per_atom, offset, eflag, vflag, ans, engv)       \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    __global acctyp *ap1=engv+ii;                                           \
+    if (eflag>0) {                                                          \
+      *ap1=(acctyp)0;                                                       \
+      ap1+=inum;                                                            \
+      *ap1=e_coul*(acctyp)0.5;                                              \
+      ap1+=inum;                                                            \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        *ap1=virial[i]*(acctyp)0.5;                                         \
+        ap1+=inum;                                                          \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+  }
+
+#endif
+
+// coul/long/cs pair kernel, general version: the per-type-pair scale factors
+// are read from the global array `scale` with a row stride of lj_types.
+// t_per_atom threads work on the neighbor list of each atom ii; their partial
+// force, energy and virial sums are combined and written by store_answers_lq.
+//   x_, q_        positions (type carried in .w) and charges
+//   sp_cl_in      four special Coulomb factors
+//   dev_nbor, dev_packed, nbor_pitch   neighbor data consumed by nbor_info
+//   ans, engv     output force and energy/virial accumulators
+//   eflag, vflag  >0 to accumulate energy / virial
+//   cut_coulsq    squared cutoff; pairs with rsq >= cut_coulsq are skipped
+//   qqrd2e, g_ewald  constants used in the prefactor and the Ewald argument
+__kernel void k_coul_long_cs(const __global numtyp4 *restrict x_,
+                          const __global numtyp *restrict scale,
+                          const int lj_types,
+                          const __global numtyp *restrict sp_cl_in,
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed,
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch,
+                          const __global numtyp *restrict q_,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  // Every thread writes the same four special factors into local memory
+  __local numtyp sp_cl[4];
+  sp_cl[0]=sp_cl_in[0];
+  sp_cl[1]=sp_cl_in[1];
+  sp_cl[2]=sp_cl_in[2];
+  sp_cl[3]=sp_cl_in[3];
+
+  // Per-thread accumulators
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    int itype=ix.w;
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      // sbmask(j) indexes the special factor table; masking with NEIGHMASK
+      // leaves the value that is used as the atom index below
+      numtyp factor_coul;
+      factor_coul = sp_cl[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq < cut_coulsq) {
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
+
+        numtyp force,prefactor,_erfc;
+        numtyp r2inv = ucl_recip(rsq);
+        // r is recovered from r2inv (ucl_rsqrt is presumably 1/sqrt - confirm)
+        numtyp r = ucl_rsqrt(r2inv);
+        fetch(prefactor,j,q_tex);
+        prefactor *= qqrd2e * scale[mtype] * qtmp;
+        if (factor_coul<(numtyp)1.0) {
+          // Scaled (special) pair: the distance is shifted by EPS_EWALD in
+          // the Ewald argument and in the prefactor, and the excluded part
+          // (1-factor_coul) is subtracted from the force
+          numtyp grij = g_ewald * (r+EPS_EWALD);
+          numtyp expm2 = ucl_exp(-grij*grij);
+          acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+          numtyp u = (numtyp)1.0 - t;
+          // Polynomial in u with coefficients B0..B5, times t and expm2
+          _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+          prefactor /= (r+EPS_EWALD);
+          force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+          // Additionally r2inv needs to be accordingly modified since the later
+          // scaling of the overall force shall be consistent
+          r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
+          force *= r2inv;
+        } else {
+          // Unscaled pair: same approximation without the EPS_EWALD shift
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+          numtyp u = (numtyp)1.0 - t;
+          _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+          prefactor /= r;
+          force = prefactor*(_erfc + EWALD_F*grij*expm2);
+          force *= r2inv;
+        }
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          // Uses the prefactor already divided by r (or r+EPS_EWALD) above
+          numtyp e = prefactor*_erfc;
+          if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+          e_coul += e;
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
+  } // if ii
+}
+
+// coul/long/cs pair kernel, shared-type version: the scale table is staged in
+// local memory as a MAX_SHARED_TYPES x MAX_SHARED_TYPES array, so no lj_types
+// argument is needed. The pair arithmetic is the same as in k_coul_long_cs;
+// here the final multiplication by r2inv is done once after the branch.
+//   x_, q_        positions (type carried in .w) and charges
+//   scale_in      global scale table, copied into local memory
+//   sp_cl_in      four special Coulomb factors
+//   dev_nbor, dev_packed, nbor_pitch   neighbor data consumed by nbor_info
+//   ans, engv     output force and energy/virial accumulators
+//   eflag, vflag  >0 to accumulate energy / virial
+//   cut_coulsq    squared cutoff; pairs with rsq >= cut_coulsq are skipped
+//   qqrd2e, g_ewald  constants used in the prefactor and the Ewald argument
+__kernel void k_coul_long_cs_fast(const __global numtyp4 *restrict x_,
+                               const __global numtyp *restrict scale_in,
+                               const __global numtyp *restrict sp_cl_in,
+                               const __global int *dev_nbor,
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch,
+                               const __global numtyp *restrict q_,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  // Threads with tid below the table sizes each copy one entry into local
+  // memory; the __syncthreads() below orders these writes before any read
+  __local numtyp scale[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_cl[4];
+  if (tid<4)
+    sp_cl[tid]=sp_cl_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES)
+    scale[tid]=scale_in[tid];
+
+  // Per-thread accumulators
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  __syncthreads();
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int iw=ix.w;
+    // Row offset of type i in the local scale table
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      // sbmask(j) indexes the special factor table; masking with NEIGHMASK
+      // leaves the value that is used as the atom index below
+      numtyp factor_coul;
+      factor_coul = sp_cl[sbmask(j)];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq < cut_coulsq) {
+        rsq += EPSILON; // Add Epsilon for case: r = 0; Interaction must be removed by special bond;
+
+        numtyp force,prefactor,_erfc;
+        numtyp r2inv = ucl_recip(rsq);
+        // r is recovered from r2inv (ucl_rsqrt is presumably 1/sqrt - confirm)
+        numtyp r = ucl_rsqrt(r2inv);
+        fetch(prefactor,j,q_tex);
+        prefactor *= qqrd2e * scale[mtype] * qtmp;
+        if (factor_coul<(numtyp)1.0) {
+          // Scaled (special) pair: the distance is shifted by EPS_EWALD in
+          // the Ewald argument and in the prefactor, and the excluded part
+          // (1-factor_coul) is subtracted from the force
+          numtyp grij = g_ewald * (r+EPS_EWALD);
+          numtyp expm2 = ucl_exp(-grij*grij);
+          acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+          numtyp u = (numtyp)1.0 - t;
+          // Polynomial in u with coefficients B0..B5, times t and expm2
+          _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+          prefactor /= (r+EPS_EWALD);
+          force = prefactor * (_erfc + EWALD_F*grij*expm2 - ((numtyp)1.0-factor_coul));
+          // Additionally r2inv needs to be accordingly modified since the later
+          // scaling of the overall force shall be consistent
+          r2inv = ucl_recip(rsq + EPS_EWALD_SQR);
+        } else {
+          // Unscaled pair: same approximation without the EPS_EWALD shift
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          acctyp t = ucl_recip((numtyp)1.0 + CS_EWALD_P*grij);
+          numtyp u = (numtyp)1.0 - t;
+          _erfc = t * ((numtyp)1.0 + u*(B0+u*(B1+u*(B2+u*(B3+u*(B4+u*B5)))))) * expm2;
+          prefactor /= r;
+          force = prefactor * (_erfc + EWALD_F*grij*expm2);
+        }
+
+        // Applied once for both branches, using whichever r2inv was set above
+        force *= r2inv;
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          // Uses the prefactor already divided by r (or r+EPS_EWALD) above
+          numtyp e = prefactor*_erfc;
+          if (factor_coul<(numtyp)1.0) e -= ((numtyp)1.0-factor_coul)*prefactor;
+          e_coul += e;
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_lq(f,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                     vflag,ans,engv);
+  } // if ii
+}
+
--- a/lib/gpu/lal_coul_long_cs.h
+++ b/lib/gpu/lal_coul_long_cs.h
@ -0,0 +1,50 @@
+/***************************************************************************
+                               coul_long_cs.h
+                             -------------------
+                           Trung Nguyen (Northwestern)
+
+  Class for acceleration of the coul/long/cs pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : June 2018
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifndef LAL_COUL_LONG_CS_H
+#define LAL_COUL_LONG_CS_H
+
+#include "lal_coul_long.h"
+
+namespace LAMMPS_AL {
+
+/// Accelerator class for the coul/long/cs pair style.
+/** Derives from CoulLong and declares its own init(); no other members are
+  * declared here. **/
+template <class numtyp, class acctyp>
+class CoulLongCS : public CoulLong<numtyp, acctyp> {
+ public:
+  CoulLongCS() {}
+  ~CoulLongCS() {}
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param ntypes number of types covered by the scale table
+    * \param scale per-type-pair scale factors (host, indexed [i][j])
+    * \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    * \param host_cut_coulsq squared Coulomb cutoff
+    * \param host_special_coul special Coulomb factors (4 entries are read)
+    * \param qqrd2e energy conversion constant passed on to the kernels
+    * \param g_ewald Ewald splitting parameter passed on to the kernels
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **scale,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_coul_long_cs_ext.cpp
+++ b/lib/gpu/lal_coul_long_cs_ext.cpp
@ -0,0 +1,145 @@
+/***************************************************************************
+                             coul_long_cs_ext.cpp
+                             -------------------
+                           Trung Nguyen (Northwestern)
+
+  Functions for LAMMPS access to coul/long/cs acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                : June 2018
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_coul_long_cs.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static CoulLongCS<PRECISION,ACC_PRECISION> CLCSMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device
+// ---------------------------------------------------------------------------
+int clcs_gpu_init(const int ntypes, double **host_scale,
+                const int inum, const int nall, const int max_nbors,
+                const int maxspecial, const double cell_size, int &gpu_mode,
+                FILE *screen, double host_cut_coulsq, double *host_special_coul,
+                const double qqrd2e, const double g_ewald) {
+  // Start from a clean object, then query the device configuration
+  CLCSMF.clear();
+  gpu_mode=CLCSMF.device->gpu_mode();
+  const double split=CLCSMF.device->particle_split();
+  int dev_first=CLCSMF.device->first_device();
+  int dev_last=CLCSMF.device->last_device();
+  const int me=CLCSMF.device->world_me();
+  const int my_gpu_rank=CLCSMF.device->gpu_rank();
+  const int ranks_per_gpu=CLCSMF.device->procs_per_gpu();
+
+  CLCSMF.device->init_message(screen,"coul/long/cs",dev_first,dev_last);
+
+  // Progress output only from replica rank 0, and only with a valid screen
+  const bool verbose=(CLCSMF.device->replica_me()==0 && screen);
+
+  if (verbose) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  // World rank 0 initializes first while all other ranks wait at the barrier.
+  // Note: the max_nbors argument is not forwarded; 300 is passed instead.
+  int status=0;
+  if (me==0)
+    status=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial,
+                       cell_size, split, screen, host_cut_coulsq,
+                       host_special_coul, qqrd2e, g_ewald);
+
+  CLCSMF.device->world_barrier();
+  if (verbose)
+    fprintf(screen,"Done.\n");
+
+  // Remaining ranks initialize one gpu_rank value per iteration
+  for (int turn=0; turn<ranks_per_gpu; ++turn) {
+    if (verbose) {
+      if (dev_last-dev_first==0)
+        fprintf(screen,"Initializing Device %d on core %d...",dev_first,turn);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",dev_first,
+                dev_last,turn);
+      fflush(screen);
+    }
+    if (me!=0 && my_gpu_rank==turn)
+      status=CLCSMF.init(ntypes, host_scale, inum, nall, 300, maxspecial,
+                         cell_size, split, screen, host_cut_coulsq,
+                         host_special_coul, qqrd2e, g_ewald);
+
+    CLCSMF.device->gpu_barrier();
+    if (verbose)
+      fprintf(screen,"Done.\n");
+  }
+  if (verbose)
+    fprintf(screen,"\n");
+
+  // Only estimate the overhead when this rank's init reported success
+  if (status==0)
+    CLCSMF.estimate_gpu_overhead();
+  return status;
+}
+
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device
+// ---------------------------------------------------------------------------
+void clcs_gpu_reinit(const int ntypes, double **host_scale) {
+  const int me=CLCSMF.device->world_me();
+  const int my_gpu_rank=CLCSMF.device->gpu_rank();
+  const int ranks_per_gpu=CLCSMF.device->procs_per_gpu();
+
+  // World rank 0 goes first; every rank then meets at the world barrier
+  if (me==0)
+    CLCSMF.reinit(ntypes, host_scale);
+  CLCSMF.device->world_barrier();
+
+  // The other ranks follow, one gpu_rank value per iteration
+  for (int turn=0; turn<ranks_per_gpu; ++turn) {
+    if (me!=0 && my_gpu_rank==turn)
+      CLCSMF.reinit(ntypes, host_scale);
+    CLCSMF.device->gpu_barrier();
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Forward to clear() on the file-static coul/long/cs accelerator object
+// ---------------------------------------------------------------------------
+void clcs_gpu_clear() {
+  CLCSMF.clear();
+}
+
+// ---------------------------------------------------------------------------
+// Forward all arguments unchanged to the compute() overload that takes
+// sublo/subhi, tag and special-neighbor data, and return its int** result.
+// NOTE(review): presumably the variant where the neighbor list is built by
+// the library - confirm against the base class.
+// ---------------------------------------------------------------------------
+int** clcs_gpu_compute_n(const int ago, const int inum_full,
+                       const int nall, double **host_x, int *host_type,
+                       double *sublo, double *subhi, tagint *tag, int **nspecial,
+                       tagint **special, const bool eflag, const bool vflag,
+                       const bool eatom, const bool vatom, int &host_start,
+                       int **ilist, int **jnum,  const double cpu_time,
+                       bool &success, double *host_q, double *boxlo,
+                       double *prd) {
+  return CLCSMF.compute(ago, inum_full, nall, host_x, host_type, sublo,
+                      subhi, tag, nspecial, special, eflag, vflag, eatom,
+                      vatom, host_start, ilist, jnum, cpu_time, success,
+                      host_q, boxlo, prd);
+}
+
+// ---------------------------------------------------------------------------
+// Forward all arguments unchanged to the compute() overload that receives a
+// host neighbor list (ilist, numj, firstneigh); nothing is returned.
+// ---------------------------------------------------------------------------
+void clcs_gpu_compute(const int ago, const int inum_full, const int nall,
+                    double **host_x, int *host_type, int *ilist, int *numj,
+                    int **firstneigh, const bool eflag, const bool vflag,
+                    const bool eatom, const bool vatom, int &host_start,
+                    const double cpu_time, bool &success, double *host_q,
+                    const int nlocal, double *boxlo, double *prd) {
+  CLCSMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,
+               firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+               host_q,nlocal,boxlo,prd);
+}
+
+// ---------------------------------------------------------------------------
+// Return host_memory_usage() of the file-static coul/long/cs accelerator
+// ---------------------------------------------------------------------------
+double clcs_gpu_bytes() {
+  return CLCSMF.host_memory_usage();
+}
+
+
--- a/lib/gpu/lal_device.cpp
+++ b/lib/gpu/lal_device.cpp
@ -80,7 +80,7 @@ int DeviceT::init_device(MPI_Comm world, MPI_Comm replica, const int first_gpu,
  char node_name[MPI_MAX_PROCESSOR_NAME];
  char *node_names = new char[MPI_MAX_PROCESSOR_NAME*_world_size];
  MPI_Get_processor_name(node_name,&name_length);
-  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names,
+  MPI_Allgather(&node_name,MPI_MAX_PROCESSOR_NAME,MPI_CHAR,&node_names[0],
                MPI_MAX_PROCESSOR_NAME,MPI_CHAR,_comm_world);
  std::string node_string=std::string(node_name);

--- a/lib/gpu/lal_dipole_long_lj.cpp
+++ b/lib/gpu/lal_dipole_long_lj.cpp
@ -0,0 +1,173 @@
+/***************************************************************************
+                             dipole_long_lj.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Class for acceleration of the dipole/long pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#ifdef USE_OPENCL
+#include "dipole_long_lj_cl.h"
+#elif defined(USE_CUDART)
+const char *dipole_long_lj=0;
+#else
+#include "dipole_long_lj_cubin.h"
+#endif
+
+#include "lal_dipole_long_lj.h"
+#include <cassert>
+using namespace LAMMPS_AL;
+#define DipoleLongLJT DipoleLongLJ<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+// Construct with no device data allocated; init() sets _allocated to true
+template <class numtyp, class acctyp>
+DipoleLongLJT::DipoleLongLJ() : BaseDipole<numtyp,acctyp>(),
+                                    _allocated(false) {
+}
+
+// Release device data via clear(), which returns early if nothing is allocated
+template <class numtyp, class acctyp>
+DipoleLongLJT::~DipoleLongLJ() {
+  clear();
+}
+
+// Per-atom byte count for the given neighbor limit; delegates entirely to
+// bytes_per_atom_atomic()
+template <class numtyp, class acctyp>
+int DipoleLongLJT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);
+}
+
+// ---------------------------------------------------------------------------
+// Set up the accelerator: run init_atomic() for the "k_dipole_long_lj"
+// program, upload the packed per-type-pair coefficient tables and the special
+// LJ/Coulomb factors, and cache the cutoff/Ewald constants.
+// Returns 0 on success, otherwise the non-zero code from init_atomic().
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+int DipoleLongLJT::init(const int ntypes,
+                    double **host_cutsq, double **host_lj1,
+                    double **host_lj2, double **host_lj3,
+                    double **host_lj4, double **host_offset,
+                    double *host_special_lj, const int nlocal,
+                    const int nall, const int max_nbors,
+                    const int maxspecial, const double cell_size,
+                    const double gpu_split, FILE *_screen,
+                    double **host_cut_ljsq, const double host_cut_coulsq,
+                    double *host_special_coul, const double qqrd2e,
+                    const double g_ewald) {
+  const int atomic_status=this->init_atomic(nlocal,nall,max_nbors,maxspecial,
+                                            cell_size,gpu_split,_screen,
+                                            dipole_long_lj,"k_dipole_long_lj");
+  if (atomic_status!=0)
+    return atomic_status;
+
+  // If atom type constants fit in shared memory use fast kernel
+  shared_types=false;
+  const int shared_limit=this->device->max_shared_types();
+  int table_dim=ntypes;
+  if (ntypes<=shared_limit && this->_block_size>=shared_limit) {
+    table_dim=shared_limit;
+    shared_types=true;
+  }
+  _lj_types=table_dim;
+  const int table_len=table_dim*table_dim;
+
+  // Host staging buffer used to fill the device arrays below
+  UCL_H_Vec<numtyp> host_write(table_len*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+  for (int k=0; k<table_len; ++k)
+    host_write[k]=0.0;
+
+  // lj1 packs host_lj1, host_lj2 and host_cut_ljsq per type pair
+  lj1.alloc(table_len,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,table_dim,lj1,host_write,host_lj1,host_lj2,
+                         host_cut_ljsq);
+
+  // lj3 packs host_lj3, host_lj4 and host_offset per type pair
+  lj3.alloc(table_len,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,table_dim,lj3,host_write,host_lj3,host_lj4,
+                         host_offset);
+
+  cutsq.alloc(table_len,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack1(ntypes,table_dim,cutsq,host_write,host_cutsq);
+
+  // sp_lj holds the four special LJ factors followed by the four special
+  // Coulomb factors
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int k=0; k<4; ++k) {
+    host_write[k]=host_special_lj[k];
+    host_write[k+4]=host_special_coul[k];
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_coulsq=host_cut_coulsq;
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+
+  _allocated=true;
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+cutsq.row_bytes()+
+                   sp_lj.row_bytes();
+  return 0;
+}
+
+// Free the coefficient arrays allocated by init() and then call
+// clear_atomic(). Does nothing when _allocated is false, so repeated calls
+// are harmless.
+template <class numtyp, class acctyp>
+void DipoleLongLJT::clear() {
+  if (!_allocated)
+    return;
+  _allocated=false;
+
+  lj1.clear();
+  lj3.clear();
+  cutsq.clear();
+  sp_lj.clear();
+  this->clear_atomic();
+}
+
+// Host memory estimate: host_memory_usage_atomic() plus the size of this
+// object itself
+template <class numtyp, class acctyp>
+double DipoleLongLJT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(DipoleLongLJ<numtyp,acctyp>);
+}
+
+// ---------------------------------------------------------------------------
+// Calculate energies, forces, and torques
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void DipoleLongLJT::loop(const bool _eflag, const bool _vflag) {
+  // The kernels take the energy/virial switches as integer arguments
+  int eflag=_eflag ? 1 : 0;
+  int vflag=_vflag ? 1 : 0;
+
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+
+  this->time_pair.start();
+  if (!shared_types) {
+    // General kernel: receives the table stride _lj_types explicitly
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q,
+                     &this->atom->quat, &cutsq, &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+  } else {
+    // Fast kernel: same arguments without the _lj_types stride
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,
+                          &this->nbor->dev_nbor,
+                          &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag, &vflag,
+                          &ainum, &nbor_pitch, &this->atom->q,
+                          &this->atom->quat, &cutsq, &_cut_coulsq,
+                          &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class DipoleLongLJ<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_dipole_long_lj.cu
+++ b/lib/gpu/lal_dipole_long_lj.cu
@ -0,0 +1,640 @@
+// **************************************************************************
+//                             dipole_long_lj.cu
+//                             -------------------
+//                           Trung Dac Nguyen (ORNL)
+//
+//  Device code for acceleration of the lj/cut/dipole/long pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : nguyentd@ornl.gov
+// ***************************************************************************/
+
+// With NV_KERNEL defined, positions, charges and dipoles are exposed through
+// texture references; otherwise the texture names are aliased to the kernel
+// arguments x_, q_ and mu_ so fetch/fetch4 receive those arrays directly.
+#ifdef NV_KERNEL
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+texture<float4> mu_tex;
+#else
+// NOTE(review): integer texel types here presumably carry the bits of
+// double-precision values -- confirm against the fetch/fetch4 definitions.
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+texture<int4,1> mu_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#define mu_tex mu_
+#endif
+
+#if (ARCH < 300)
+
+// Combine the per-thread partial sums of the t_per_atom threads that share an
+// atom and store the totals.  Partial sums are staged in the __local buffer
+// red_acc (rows 0-5: force/torque, or virial; row 6: energy; row 7: e_coul).
+// The thread with offset==0 then writes:
+//   engv[ii], engv[ii+inum]      energy and e_coul (when eflag>0)
+//   next six inum-strided slots  virial[0..5]      (when vflag>0)
+//   ans[ii], ans[ii+inum]        force and torque
+// Energy and virial values are stored multiplied by 0.5.
+// Every name used in the body is a macro parameter, so the macro does not
+// depend on how the caller names its Coulomb-energy accumulator.
+// NOTE(review): the reduction loops have no barrier between steps; presumably
+// this relies on the cooperating threads executing in lockstep -- confirm.
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid,     \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    __local acctyp red_acc[8][BLOCK_PAIR];                                  \
+    red_acc[0][tid]=f.x;                                                    \
+    red_acc[1][tid]=f.y;                                                    \
+    red_acc[2][tid]=f.z;                                                    \
+    red_acc[3][tid]=tor.x;                                                  \
+    red_acc[4][tid]=tor.y;                                                  \
+    red_acc[5][tid]=tor.z;                                                  \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+      if (offset < s) {                                                     \
+        for (int r=0; r<6; r++)                                             \
+          red_acc[r][tid] += red_acc[r][tid+s];                             \
+      }                                                                     \
+    }                                                                       \
+    f.x=red_acc[0][tid];                                                    \
+    f.y=red_acc[1][tid];                                                    \
+    f.z=red_acc[2][tid];                                                    \
+    tor.x=red_acc[3][tid];                                                  \
+    tor.y=red_acc[4][tid];                                                  \
+    tor.z=red_acc[5][tid];                                                  \
+    if (eflag>0 || vflag>0) {                                               \
+      for (int r=0; r<6; r++)                                               \
+        red_acc[r][tid]=virial[r];                                          \
+      red_acc[6][tid]=energy;                                               \
+      red_acc[7][tid]=e_coul;                                               \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+        if (offset < s) {                                                   \
+          for (int r=0; r<8; r++)                                           \
+            red_acc[r][tid] += red_acc[r][tid+s];                           \
+        }                                                                   \
+      }                                                                     \
+      for (int r=0; r<6; r++)                                               \
+        virial[r]=red_acc[r][tid];                                          \
+      energy=red_acc[6][tid];                                               \
+      e_coul=red_acc[7][tid];                                               \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    int ei=ii;                                                              \
+    if (eflag>0) {                                                          \
+      engv[ei]=energy*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
+      engv[ei]=e_coul*(acctyp)0.5;                                          \
+      ei+=inum;                                                             \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        engv[ei]=virial[i]*(acctyp)0.5;                                     \
+        ei+=inum;                                                           \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
+#else
+
+// Combine the per-thread partial sums of the t_per_atom threads that share an
+// atom using shfl_xor exchanges (no __local staging buffer), then let the
+// thread with offset==0 store the totals:
+//   engv[ii], engv[ii+inum]      energy and e_coul (when eflag>0)
+//   next six inum-strided slots  virial[0..5]      (when vflag>0)
+//   ans[ii], ans[ii+inum]        force and torque
+// Energy and virial values are stored multiplied by 0.5.  Force, torque and
+// both energies are always reduced; the virial only when vflag>0.
+#define store_answers_tq(f, tor, energy, e_coul, virial, ii, inum, tid,     \
+                        t_per_atom, offset, eflag, vflag, ans, engv)        \
+  if (t_per_atom>1) {                                                       \
+    for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
+        f.x += shfl_xor(f.x, s, t_per_atom);                                \
+        f.y += shfl_xor(f.y, s, t_per_atom);                                \
+        f.z += shfl_xor(f.z, s, t_per_atom);                                \
+        tor.x += shfl_xor(tor.x, s, t_per_atom);                            \
+        tor.y += shfl_xor(tor.y, s, t_per_atom);                            \
+        tor.z += shfl_xor(tor.z, s, t_per_atom);                            \
+        energy += shfl_xor(energy, s, t_per_atom);                          \
+        e_coul += shfl_xor(e_coul, s, t_per_atom);                          \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                       \
+          for (int r=0; r<6; r++)                                           \
+            virial[r] += shfl_xor(virial[r], s, t_per_atom);                \
+      }                                                                     \
+    }                                                                       \
+  }                                                                         \
+  if (offset==0) {                                                          \
+    int ei=ii;                                                              \
+    if (eflag>0) {                                                          \
+      engv[ei]=energy*(acctyp)0.5;                                             \
+      ei+=inum;                                                           \
+      engv[ei]=e_coul*(acctyp)0.5;                                             \
+      ei+=inum;                                                           \
+    }                                                                       \
+    if (vflag>0) {                                                          \
+      for (int i=0; i<6; i++) {                                             \
+        engv[ei]=virial[i]*(acctyp)0.5;                                        \
+        ei+=inum;                                                         \
+      }                                                                     \
+    }                                                                       \
+    ans[ii]=f;                                                              \
+    ans[ii+inum]=tor;                                                       \
+  }
+
+#endif
+
+// sqrt(pi); divisor of the pre1/pre2/pre3 prefactors computed in the kernels
+#define MY_PIS (acctyp)1.77245385090551602729
+
+// Pair kernel for the lj/cut/dipole/long style: for every atom ii<inum, sum
+// the Lennard-Jones force and the erfc-damped charge/dipole force over the
+// atom's neighbors, together with the torque on the atom's dipole and, on
+// request, the energies and the virial.  Per-type-pair coefficients are read
+// from the __global tables on every access.
+//   x_          positions; .w holds the atom type
+//   lj1         .x/.y LJ force coefficients, .z LJ cutoff squared
+//   lj3         .x/.y LJ energy coefficients, .z energy offset
+//   lj_types    row stride of the type tables (index = itype*lj_types+jtype)
+//   sp_lj_in    special-bond factors: [0-3] LJ, [4-7] Coulomb
+//   dev_nbor, dev_packed, nbor_pitch
+//               neighbor-list storage, decoded by nbor_info
+//   ans         output: force at [ii], torque at [ii+inum]
+//   engv        output: energies and virial, strided by inum
+//   eflag/vflag >0 to accumulate energies / virial
+//   inum        number of atoms to process
+//   q_, mu_     per-atom charges and dipole vectors
+//   cutsq       overall cutoff squared per type pair
+//   cut_coulsq  Coulomb cutoff squared
+//   qqrd2e      prefactor applied to Coulomb forces, torques and energies
+//   g_ewald     damping parameter used in erfc(g_ewald*r)
+//   t_per_atom  number of threads cooperating on one atom
+__kernel void k_dipole_long_lj(const __global numtyp4 *restrict x_,
+                          const __global numtyp4 *restrict lj1,
+                          const __global numtyp4 *restrict lj3,
+                          const int lj_types,
+                          const __global numtyp *restrict sp_lj_in,
+                          const __global int *dev_nbor,
+                          const __global int *dev_packed,
+                          __global acctyp4 *restrict ans,
+                          __global acctyp *restrict engv,
+                          const int eflag, const int vflag, const int inum,
+                          const int nbor_pitch,
+                          const __global numtyp *restrict q_,
+                          const __global numtyp4 *restrict mu_,
+                          const __global numtyp *restrict cutsq,
+                          const numtyp cut_coulsq, const numtyp qqrd2e,
+                          const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  // Local copy of the special-bond factors (every thread writes the same
+  // values)
+  __local numtyp sp_lj[8];
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  // Per-thread accumulators
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp4 tor;
+  tor.x=(acctyp)0;
+  tor.y=(acctyp)0;
+  tor.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  // Prefactors of the exp(-(g_ewald*r)^2) terms in b1..b3 / d1..d3
+  numtyp pre1 = (numtyp)2.0 * g_ewald / MY_PIS;
+  numtyp pre2 = (numtyp)4.0 * (g_ewald*g_ewald*g_ewald) / MY_PIS;
+  numtyp pre3 = (numtyp)8.0 * (g_ewald*g_ewald*g_ewald*g_ewald*g_ewald) / MY_PIS;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
+    int itype=ix.w;
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      // The special-bond class is encoded in the neighbor index; read the
+      // factors before masking it off
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      numtyp qj; fetch(qj,j,q_tex);
+      numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;
+      if (rsq<cutsq[mtype]) {
+        numtyp r2inv=ucl_recip(rsq);
+        numtyp force_lj,rinv,r6inv;
+        numtyp pdotp, pidotr, pjdotr, _erfc;
+        numtyp g0,g1,g2,b0,b1,b2,b3,d0,d1,d2,d3;
+        numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
+        numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
+        numtyp fdx,fdy,fdz,fax,fay,faz;
+        acctyp4 forcecoul, ticoul;
+        acctyp4 force;
+
+        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
+        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
+
+        // Lennard-Jones part (lj1.z is the LJ cutoff squared)
+        if (rsq < lj1[mtype].z) {
+          r6inv = r2inv*r2inv*r2inv;
+          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
+        } else force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          rinv = ucl_rsqrt(rsq);
+          numtyp r = ucl_rsqrt(r2inv);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          // Polynomial approximation of erfc(grij)
+          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+          pdotp  = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
+          pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
+          pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
+
+          g0 = qtmp*qj;
+          g1 = qtmp*pjdotr - qj*pidotr + pdotp;
+          g2 = -pidotr*pjdotr;
+
+          // erfc-damped terms, weighted by factor_coul
+          if (factor_coul > (numtyp)0.0) {
+            b0 = _erfc * rinv;
+            b1 = (b0 + pre1*expm2) * r2inv;
+            b2 = ((numtyp)3.0*b1 + pre2*expm2) * r2inv;
+            b3 = ((numtyp)5.0*b2 + pre3*expm2) * r2inv;
+
+            g0b1_g1b2_g2b3 = g0*b1 + g1*b2 + g2*b3;
+            fdx = delx * g0b1_g1b2_g2b3 -
+              b1 * (qtmp*muj.x - qj*mui.x) +
+              b2 * (pjdotr*mui.x + pidotr*muj.x);
+            fdy = dely * g0b1_g1b2_g2b3 -
+              b1 * (qtmp*muj.y - qj*mui.y) +
+              b2 * (pjdotr*mui.y + pidotr*muj.y);
+            fdz = delz * g0b1_g1b2_g2b3 -
+              b1 * (qtmp*muj.z - qj*mui.z) +
+              b2 * (pjdotr*mui.z + pidotr*muj.z);
+
+            zdix = delx * (qj*b1 + b2*pjdotr) - b1*muj.x;
+            zdiy = dely * (qj*b1 + b2*pjdotr) - b1*muj.y;
+            zdiz = delz * (qj*b1 + b2*pjdotr) - b1*muj.z;
+            zdjx = delx * (-qtmp*b1 + b2*pidotr) - b1*mui.x;
+            zdjy = dely * (-qtmp*b1 + b2*pidotr) - b1*mui.y;
+            zdjz = delz * (-qtmp*b1 + b2*pidotr) - b1*mui.z;
+
+            if (factor_coul < (numtyp)1.0) {
+              fdx *= factor_coul;
+              fdy *= factor_coul;
+              fdz *= factor_coul;
+              zdix *= factor_coul;
+              zdiy *= factor_coul;
+              zdiz *= factor_coul;
+              zdjx *= factor_coul;
+              zdjy *= factor_coul;
+              zdjz *= factor_coul;
+            }
+          } else {
+            fdx = fdy = fdz = (numtyp)0.0;
+            zdix = zdiy = zdiz = (numtyp)0.0;
+            zdjx = zdjy = zdjz = (numtyp)0.0;
+          }
+
+          // (erfc-1) terms, weighted by (1-factor_coul)
+          if (factor_coul < (numtyp)1.0) {
+            d0 = (_erfc - (numtyp)1.0) * rinv;
+            d1 = (d0 + pre1*expm2) * r2inv;
+            d2 = ((numtyp)3.0*d1 + pre2*expm2) * r2inv;
+            d3 = ((numtyp)5.0*d2 + pre3*expm2) * r2inv;
+
+            g0d1_g1d2_g2d3 = g0*d1 + g1*d2 + g2*d3;
+            fax = delx * g0d1_g1d2_g2d3 -
+              d1 * (qtmp*muj.x - qj*mui.x) +
+              d2 * (pjdotr*mui.x + pidotr*muj.x);
+            fay = dely * g0d1_g1d2_g2d3 -
+              d1 * (qtmp*muj.y - qj*mui.y) +
+              d2 * (pjdotr*mui.y + pidotr*muj.y);
+            faz = delz * g0d1_g1d2_g2d3 -
+              d1 * (qtmp*muj.z - qj*mui.z) +
+              d2 * (pjdotr*mui.z + pidotr*muj.z);
+
+            zaix = delx * (qj*d1 + d2*pjdotr) - d1*muj.x;
+            zaiy = dely * (qj*d1 + d2*pjdotr) - d1*muj.y;
+            zaiz = delz * (qj*d1 + d2*pjdotr) - d1*muj.z;
+            zajx = delx * (-qtmp*d1 + d2*pidotr) - d1*mui.x;
+            zajy = dely * (-qtmp*d1 + d2*pidotr) - d1*mui.y;
+            zajz = delz * (-qtmp*d1 + d2*pidotr) - d1*mui.z;
+
+            if (factor_coul > (numtyp)0.0) {
+              facm1 = (numtyp)1.0 - factor_coul;
+              fax *= facm1;
+              fay *= facm1;
+              faz *= facm1;
+              zaix *= facm1;
+              zaiy *= facm1;
+              zaiz *= facm1;
+              zajx *= facm1;
+              zajy *= facm1;
+              zajz *= facm1;
+            }
+          } else {
+            fax = fay = faz = (numtyp)0.0;
+            zaix = zaiy = zaiz = (numtyp)0.0;
+            zajx = zajy = zajz = (numtyp)0.0;
+          }
+
+          forcecoul.x = fdx + fax;
+          forcecoul.y = fdy + fay;
+          forcecoul.z = fdz + faz;
+
+          // Torque on atom i: mu_i x (zdi + zai)
+          ticoul.x = mui.y*(zdiz + zaiz) - mui.z*(zdiy + zaiy);
+          ticoul.y = mui.z*(zdix + zaix) - mui.x*(zdiz + zaiz);
+          ticoul.z = mui.x*(zdiy + zaiy) - mui.y*(zdix + zaix);
+
+        } else {
+          forcecoul.x = forcecoul.y = forcecoul.z = (numtyp)0.0;
+          ticoul.x = ticoul.y = ticoul.z = (numtyp)0.0;
+        }
+
+        force.x = qqrd2e*forcecoul.x + delx*force_lj;
+        force.y = qqrd2e*forcecoul.y + dely*force_lj;
+        force.z = qqrd2e*forcecoul.z + delz*force_lj;
+        f.x+=force.x;
+        f.y+=force.y;
+        f.z+=force.z;
+        tor.x+=qqrd2e*ticoul.x;
+        tor.y+=qqrd2e*ticoul.y;
+        tor.z+=qqrd2e*ticoul.z;
+
+        if (eflag>0) {
+          acctyp e = (acctyp)0.0;
+          if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) {
+            e = qqrd2e*(b0*g0 + b1*g1 + b2*g2);
+            if (factor_coul < (numtyp)1.0) {
+              // Apply the special-bond weighting to this pair's energy only;
+              // the running total e_coul must not be rescaled
+              e *= factor_coul;
+              e += ((numtyp)1.0-factor_coul) * qqrd2e * (d0*g0 + d1*g1 + d2*g2);
+            }
+          } else e = (acctyp)0.0;
+          e_coul += e;
+
+          if (rsq < lj1[mtype].z) {
+            e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+            energy+=factor_lj*(e-lj3[mtype].z);
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*force.x;
+          virial[1] += dely*force.y;
+          virial[2] += delz*force.z;
+          virial[3] += delx*force.y;
+          virial[4] += delx*force.z;
+          virial[5] += dely*force.z;
+        }
+      }
+
+    } // for nbor
+    store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                    vflag,ans,engv);
+  } // if ii
+}
+
+// Variant of k_dipole_long_lj that first copies the per-type-pair tables
+// (lj1, cutsq and, when eflag>0, lj3) and the special-bond factors into
+// __local arrays, and indexes them with a fixed stride of MAX_SHARED_TYPES.
+// The physics is the same: for every atom ii<inum, sum the Lennard-Jones and
+// erfc-damped charge/dipole forces, the torque on the atom's dipole and, on
+// request, the energies and the virial over its neighbors.
+//   x_          positions; .w holds the atom type
+//   lj1_in      .x/.y LJ force coefficients, .z LJ cutoff squared
+//   lj3_in      .x/.y LJ energy coefficients, .z energy offset
+//   sp_lj_in    special-bond factors: [0-3] LJ, [4-7] Coulomb
+//   dev_nbor, dev_packed, nbor_pitch
+//               neighbor-list storage, decoded by nbor_info
+//   ans         output: force at [ii], torque at [ii+inum]
+//   engv        output: energies and virial, strided by inum
+//   eflag/vflag >0 to accumulate energies / virial
+//   inum        number of atoms to process
+//   q_, mu_     per-atom charges and dipole vectors
+//   _cutsq      overall cutoff squared per type pair
+//   cut_coulsq  Coulomb cutoff squared
+//   qqrd2e      prefactor applied to Coulomb forces, torques and energies
+//   g_ewald     damping parameter used in erfc(g_ewald*r)
+//   t_per_atom  number of threads cooperating on one atom
+__kernel void k_dipole_long_lj_fast(const __global numtyp4 *restrict x_,
+                               const __global numtyp4 *restrict lj1_in,
+                               const __global numtyp4 *restrict lj3_in,
+                               const __global numtyp *restrict sp_lj_in,
+                               const __global int *dev_nbor,
+                               const __global int *dev_packed,
+                               __global acctyp4 *restrict ans,
+                               __global acctyp *restrict engv,
+                               const int eflag, const int vflag, const int inum,
+                               const int nbor_pitch,
+                               const __global numtyp *restrict q_,
+                               const __global numtyp4 *restrict mu_,
+                               const __global numtyp *restrict _cutsq,
+                               const numtyp cut_coulsq, const numtyp qqrd2e,
+                               const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);
+
+  // Stage the coefficient tables in __local memory, one entry per thread;
+  // lj3 is only needed for energies
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp cutsq[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[8];
+  if (tid<8)
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {
+    lj1[tid]=lj1_in[tid];
+    cutsq[tid]=_cutsq[tid];
+    if (eflag>0)
+      lj3[tid]=lj3_in[tid];
+  }
+
+  // Per-thread accumulators
+  acctyp energy=(acctyp)0;
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp4 tor;
+  tor.x=(acctyp)0;
+  tor.y=(acctyp)0;
+  tor.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  // The staged tables must be complete before any thread reads them
+  __syncthreads();
+
+  // Prefactors of the exp(-(g_ewald*r)^2) terms in b1..b3 / d1..d3
+  numtyp pre1 = (numtyp)2.0 * g_ewald / MY_PIS;
+  numtyp pre2 = (numtyp)4.0 * (g_ewald*g_ewald*g_ewald) / MY_PIS;
+  numtyp pre3 = (numtyp)8.0 * (g_ewald*g_ewald*g_ewald*g_ewald*g_ewald) / MY_PIS;
+
+  if (ii<inum) {
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    numtyp4 mui; fetch4(mui,i,mu_tex); //mu_[i];
+    int iw=ix.w;
+    // Row offset into the staged tables: type(i)*MAX_SHARED_TYPES
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      // The special-bond class is encoded in the neighbor index; read the
+      // factors before masking it off
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = sp_lj[sbmask(j)+4];
+      j &= NEIGHMASK;
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      numtyp qj; fetch(qj,j,q_tex);
+      numtyp4 muj; fetch4(muj,j,mu_tex); //mu_[j];
+      int mtype=itype+jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<cutsq[mtype]) {
+        numtyp r2inv=ucl_recip(rsq);
+        numtyp force_lj,rinv,r6inv;
+        numtyp pdotp, pidotr, pjdotr, _erfc;
+        numtyp g0,g1,g2,b0,b1,b2,b3,d0,d1,d2,d3;
+        numtyp zdix,zdiy,zdiz,zdjx,zdjy,zdjz,zaix,zaiy,zaiz,zajx,zajy,zajz;
+        numtyp g0b1_g1b2_g2b3,g0d1_g1d2_g2d3,facm1;
+        numtyp fdx,fdy,fdz,fax,fay,faz;
+        acctyp4 forcecoul, ticoul;
+        acctyp4 force;
+
+        forcecoul.x = forcecoul.y = forcecoul.z = (acctyp)0;
+        ticoul.x = ticoul.y = ticoul.z = (acctyp)0;
+
+        // Lennard-Jones part (lj1.z is the LJ cutoff squared)
+        if (rsq < lj1[mtype].z) {
+          r6inv = r2inv*r2inv*r2inv;
+          force_lj = factor_lj*r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y)*r2inv;
+        } else force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          rinv = ucl_rsqrt(rsq);
+          numtyp r = ucl_rsqrt(r2inv);
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          // Polynomial approximation of erfc(grij)
+          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;
+
+          pdotp  = mui.x*muj.x + mui.y*muj.y + mui.z*muj.z;
+          pidotr = mui.x*delx + mui.y*dely + mui.z*delz;
+          pjdotr = muj.x*delx + muj.y*dely + muj.z*delz;
+
+          g0 = qtmp*qj;
+          g1 = qtmp*pjdotr - qj*pidotr + pdotp;
+          g2 = -pidotr*pjdotr;
+
+          // erfc-damped terms, weighted by factor_coul
+          if (factor_coul > (numtyp)0.0) {
+            b0 = _erfc * rinv;
+            b1 = (b0 + pre1*expm2) * r2inv;
+            b2 = ((numtyp)3.0*b1 + pre2*expm2) * r2inv;
+            b3 = ((numtyp)5.0*b2 + pre3*expm2) * r2inv;
+
+            g0b1_g1b2_g2b3 = g0*b1 + g1*b2 + g2*b3;
+            fdx = delx * g0b1_g1b2_g2b3 -
+              b1 * (qtmp*muj.x - qj*mui.x) +
+              b2 * (pjdotr*mui.x + pidotr*muj.x);
+            fdy = dely * g0b1_g1b2_g2b3 -
+              b1 * (qtmp*muj.y - qj*mui.y) +
+              b2 * (pjdotr*mui.y + pidotr*muj.y);
+            fdz = delz * g0b1_g1b2_g2b3 -
+              b1 * (qtmp*muj.z - qj*mui.z) +
+              b2 * (pjdotr*mui.z + pidotr*muj.z);
+
+            zdix = delx * (qj*b1 + b2*pjdotr) - b1*muj.x;
+            zdiy = dely * (qj*b1 + b2*pjdotr) - b1*muj.y;
+            zdiz = delz * (qj*b1 + b2*pjdotr) - b1*muj.z;
+            zdjx = delx * (-qtmp*b1 + b2*pidotr) - b1*mui.x;
+            zdjy = dely * (-qtmp*b1 + b2*pidotr) - b1*mui.y;
+            zdjz = delz * (-qtmp*b1 + b2*pidotr) - b1*mui.z;
+
+            if (factor_coul < (numtyp)1.0) {
+              fdx *= factor_coul;
+              fdy *= factor_coul;
+              fdz *= factor_coul;
+              zdix *= factor_coul;
+              zdiy *= factor_coul;
+              zdiz *= factor_coul;
+              zdjx *= factor_coul;
+              zdjy *= factor_coul;
+              zdjz *= factor_coul;
+            }
+          } else {
+            fdx = fdy = fdz = (numtyp)0.0;
+            zdix = zdiy = zdiz = (numtyp)0.0;
+            zdjx = zdjy = zdjz = (numtyp)0.0;
+          }
+
+          // (erfc-1) terms, weighted by (1-factor_coul)
+          if (factor_coul < (numtyp)1.0) {
+            d0 = (_erfc - (numtyp)1.0) * rinv;
+            d1 = (d0 + pre1*expm2) * r2inv;
+            d2 = ((numtyp)3.0*d1 + pre2*expm2) * r2inv;
+            d3 = ((numtyp)5.0*d2 + pre3*expm2) * r2inv;
+
+            g0d1_g1d2_g2d3 = g0*d1 + g1*d2 + g2*d3;
+            fax = delx * g0d1_g1d2_g2d3 -
+              d1 * (qtmp*muj.x - qj*mui.x) +
+              d2 * (pjdotr*mui.x + pidotr*muj.x);
+            fay = dely * g0d1_g1d2_g2d3 -
+              d1 * (qtmp*muj.y - qj*mui.y) +
+              d2 * (pjdotr*mui.y + pidotr*muj.y);
+            faz = delz * g0d1_g1d2_g2d3 -
+              d1 * (qtmp*muj.z - qj*mui.z) +
+              d2 * (pjdotr*mui.z + pidotr*muj.z);
+
+            zaix = delx * (qj*d1 + d2*pjdotr) - d1*muj.x;
+            zaiy = dely * (qj*d1 + d2*pjdotr) - d1*muj.y;
+            zaiz = delz * (qj*d1 + d2*pjdotr) - d1*muj.z;
+            zajx = delx * (-qtmp*d1 + d2*pidotr) - d1*mui.x;
+            zajy = dely * (-qtmp*d1 + d2*pidotr) - d1*mui.y;
+            zajz = delz * (-qtmp*d1 + d2*pidotr) - d1*mui.z;
+
+            if (factor_coul > (numtyp)0.0) {
+              facm1 = (numtyp)1.0 - factor_coul;
+              fax *= facm1;
+              fay *= facm1;
+              faz *= facm1;
+              zaix *= facm1;
+              zaiy *= facm1;
+              zaiz *= facm1;
+              zajx *= facm1;
+              zajy *= facm1;
+              zajz *= facm1;
+            }
+          } else {
+            fax = fay = faz = (numtyp)0.0;
+            zaix = zaiy = zaiz = (numtyp)0.0;
+            zajx = zajy = zajz = (numtyp)0.0;
+          }
+
+          forcecoul.x = fdx + fax;
+          forcecoul.y = fdy + fay;
+          forcecoul.z = fdz + faz;
+
+          // Torque on atom i: mu_i x (zdi + zai)
+          ticoul.x = mui.y*(zdiz + zaiz) - mui.z*(zdiy + zaiy);
+          ticoul.y = mui.z*(zdix + zaix) - mui.x*(zdiz + zaiz);
+          ticoul.z = mui.x*(zdiy + zaiy) - mui.y*(zdix + zaix);
+
+        } else {
+          forcecoul.x = forcecoul.y = forcecoul.z = (numtyp)0.0;
+          ticoul.x = ticoul.y = ticoul.z = (numtyp)0.0;
+        }
+
+        force.x = qqrd2e*forcecoul.x + delx*force_lj;
+        force.y = qqrd2e*forcecoul.y + dely*force_lj;
+        force.z = qqrd2e*forcecoul.z + delz*force_lj;
+        f.x+=force.x;
+        f.y+=force.y;
+        f.z+=force.z;
+        tor.x+=qqrd2e*ticoul.x;
+        tor.y+=qqrd2e*ticoul.y;
+        tor.z+=qqrd2e*ticoul.z;
+
+        if (eflag>0) {
+          acctyp e = (acctyp)0.0;
+          if (rsq < cut_coulsq && factor_coul > (numtyp)0.0) {
+            e = qqrd2e*(b0*g0 + b1*g1 + b2*g2);
+            if (factor_coul < (numtyp)1.0) {
+              // Apply the special-bond weighting to this pair's energy only;
+              // the running total e_coul must not be rescaled
+              e *= factor_coul;
+              e += ((numtyp)1.0-factor_coul) * qqrd2e * (d0*g0 + d1*g1 + d2*g2);
+            }
+          } else e = (acctyp)0.0;
+          e_coul += e;
+
+          if (rsq < lj1[mtype].z) {
+            e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+            energy+=factor_lj*(e-lj3[mtype].z);
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*force.x;
+          virial[1] += dely*force.y;
+          virial[2] += delz*force.z;
+          virial[3] += delx*force.y;
+          virial[4] += delx*force.z;
+          virial[5] += dely*force.z;
+        }
+      }
+
+    } // for nbor
+    store_answers_tq(f,tor,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,
+                    vflag,ans,engv);
+  } // if ii
+}
+
--- a/lib/gpu/lal_dipole_long_lj.h
+++ b/lib/gpu/lal_dipole_long_lj.h
@ -0,0 +1,85 @@
+/***************************************************************************
+                               dipole_long_lj.h
+                             -------------------
+                            Trung Dac Nguyen (Northwestern)
+
+  Class for acceleration of the lj/cut/dipole/long pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifndef LAL_DIPOLE_LONG_LJ_H
+#define LAL_DIPOLE_LONG_LJ_H
+
+#include "lal_base_dipole.h"
+
+namespace LAMMPS_AL {
+
+/// Accelerator-side state for the lj/cut/dipole/long pair style.
+/** Holds the per-type coefficient tables and scalar Coulomb parameters that
+  * loop() passes to the pair kernels.
+  * \tparam numtyp type used for coefficients and scalar parameters
+  * \tparam acctyp second type parameter forwarded to BaseDipole
+  *         (presumably the accumulation type -- confirm in BaseDipole) **/
+template <class numtyp, class acctyp>
+class DipoleLongLJ : public BaseDipole<numtyp, acctyp> {
+ public:
+  DipoleLongLJ();
+  ~DipoleLongLJ();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq, double **host_lj1,
+           double **host_lj2, double **host_lj3, double **host_lj4,
+           double **host_offset, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+
+  /// Clear all host and device data
+  /** \note This is called at the beginning of the init() routine **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq_vdw, lj1.w = cutsq_coul
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset
+  UCL_D_Vec<numtyp4> lj3;
+  /// cutsq
+  UCL_D_Vec<numtyp> cutsq;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Number of atom types
+  int _lj_types;
+
+  /// Scalar kernel arguments: Coulomb cutoff squared, Coulomb energy
+  /// prefactor, and the g_ewald damping parameter
+  numtyp _cut_coulsq, _qqrd2e, _g_ewald;
+
+ private:
+  // NOTE(review): presumably records whether the tables above are currently
+  // allocated -- confirm in the .cpp
+  bool _allocated;
+  /// Launch the pair kernel (fast or general variant) for the current step
+  void loop(const bool _eflag, const bool _vflag);
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_dipole_long_lj_ext.cpp
+++ b/lib/gpu/lal_dipole_long_lj_ext.cpp
@ -0,0 +1,129 @@
+/***************************************************************************
+                               dipole_long_lj_ext.cpp
+                             -------------------
+                            Trung Dac Nguyen (ORNL)
+
+  Functions for LAMMPS access to lj/cut/dipole/long acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : nguyentd@ornl.gov
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_dipole_long_lj.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static DipoleLongLJ<PRECISION,ACC_PRECISION> DPLJMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device; returns the status of DPLJMF.init() (0 if successful)
+// ---------------------------------------------------------------------------
+int dplj_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                 double **host_lj2, double **host_lj3, double **host_lj4,
+                 double **offset, double *special_lj, const int inum,
+                 const int nall, const int max_nbors, const int maxspecial,
+                 const double cell_size, int &gpu_mode, FILE *screen,
+                 double **host_cut_ljsq, const double host_cut_coulsq,
+                 double *host_special_coul, const double qqrd2e,
+                 const double g_ewald) {
+  DPLJMF.clear();  // drop host/device data left from an earlier init
+  gpu_mode=DPLJMF.device->gpu_mode();  // handed back to the caller through the reference argument
+  double gpu_split=DPLJMF.device->particle_split();
+  int first_gpu=DPLJMF.device->first_device();
+  int last_gpu=DPLJMF.device->last_device();
+  int world_me=DPLJMF.device->world_me();
+  int gpu_rank=DPLJMF.device->gpu_rank();
+  int procs_per_gpu=DPLJMF.device->procs_per_gpu();
+
+  DPLJMF.device->init_message(screen,"lj/cut/dipole/long",first_gpu,last_gpu);
+
+  bool message=false;  // progress text is printed only by replica 0 and only when a screen stream exists
+  if (DPLJMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)  // world rank 0 initializes before every other rank
+    init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3,
+                       host_lj4, offset, special_lj, inum, nall, 300,  // NOTE(review): literal 300 is passed as max_nbors; the max_nbors argument is unused
+                       maxspecial, cell_size, gpu_split, screen, host_cut_ljsq,
+                       host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
+  DPLJMF.device->world_barrier();  // presumably holds all ranks until rank 0 is done - confirm in Device
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {  // pass i: ranks with gpu_rank==i (other than world rank 0) initialize
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=DPLJMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                         offset, special_lj, inum, nall, 300, maxspecial,  // NOTE(review): same hard-coded 300 as above
+                         cell_size, gpu_split, screen, host_cut_ljsq,
+                         host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
+    DPLJMF.device->gpu_barrier();  // called by every rank on every pass, whether or not it initialized
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)  // overhead estimate is skipped when init reported an error
+    DPLJMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+void dplj_gpu_clear() {
+  DPLJMF.clear();  // forwards to the file-static DipoleLongLJ instance
+}
+
+int** dplj_gpu_compute_n(const int ago, const int inum_full,
+                        const int nall, double **host_x, int *host_type,
+                        double *sublo, double *subhi, tagint *tag, int **nspecial,
+                        tagint **special, const bool eflag, const bool vflag,
+                        const bool eatom, const bool vatom, int &host_start,
+                        int **ilist, int **jnum, const double cpu_time,
+                        bool &success, double *host_q, double **host_mu,
+                        double *boxlo, double *prd) {
+  return DPLJMF.compute(ago, inum_full, nall, host_x, host_type, sublo,  // thin wrapper: all arguments forwarded in order, int** result returned as is
+                       subhi, tag, nspecial, special, eflag, vflag, eatom,
+                       vatom, host_start, ilist, jnum, cpu_time, success,
+                       host_q, host_mu, boxlo, prd);
+}
+
+void dplj_gpu_compute(const int ago, const int inum_full, const int nall,
+                     double **host_x, int *host_type, int *ilist, int *numj,
+                     int **firstneigh, const bool eflag, const bool vflag,
+                     const bool eatom, const bool vatom, int &host_start,
+                     const double cpu_time, bool &success, double *host_q,
+                     double **host_mu, const int nlocal, double *boxlo, double *prd) {
+  DPLJMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,firstneigh,eflag,  // thin wrapper: overload taking caller-supplied ilist/numj/firstneigh
+                vflag,eatom,vatom,host_start,cpu_time,success,host_q,host_mu,
+                nlocal,boxlo,prd);
+}
+
+double dplj_gpu_bytes() {
+  return DPLJMF.host_memory_usage();  // host memory used by the library for this pair style
+}
+
+
--- a/lib/gpu/lal_lj_expand_coul_long.cpp
+++ b/lib/gpu/lal_lj_expand_coul_long.cpp
@ -0,0 +1,183 @@
+/***************************************************************************
+                               lj_expand_coul_long.cpp
+                             --------------------------
+                             Trung Nguyen (Northwestern)
+
+  Class for acceleration of the lj/expand/coul/long pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#if defined(USE_OPENCL)
+#include "lj_expand_coul_long_cl.h"
+#elif defined(USE_CUDART)
+const char *lj_expand_coul_long=0;
+#else
+#include "lj_expand_coul_long_cubin.h"
+#endif
+
+#include "lal_lj_expand_coul_long.h"
+#include <cassert>
+using namespace LAMMPS_AL;
+#define LJExpandCoulLongT LJExpandCoulLong<numtyp, acctyp>
+
+extern Device<PRECISION,ACC_PRECISION> device;
+
+template <class numtyp, class acctyp>
+LJExpandCoulLongT::LJExpandCoulLong() : BaseCharge<numtyp,acctyp>(),
+                                    _allocated(false) {  // flag stays false until init() completes
+}
+
+template <class numtyp, class acctyp>
+LJExpandCoulLongT::~LJExpandCoulLong() {
+  clear();  // returns immediately when _allocated is false
+}
+
+template <class numtyp, class acctyp>
+int LJExpandCoulLongT::bytes_per_atom(const int max_nbors) const {
+  return this->bytes_per_atom_atomic(max_nbors);  // per-atom estimate is delegated to the base class
+}
+
+template <class numtyp, class acctyp>
+int LJExpandCoulLongT::init(const int ntypes,
+                           double **host_cutsq, double **host_lj1,
+                           double **host_lj2, double **host_lj3,
+                           double **host_lj4, double **host_offset, double **host_shift,
+                           double *host_special_lj, const int nlocal,
+                           const int nall, const int max_nbors,
+                           const int maxspecial, const double cell_size,
+                           const double gpu_split, FILE *_screen,
+                           double **host_cut_ljsq, const double host_cut_coulsq,
+                           double *host_special_coul, const double qqrd2e,
+                           const double g_ewald) {
+  int success;
+  success=this->init_atomic(nlocal,nall,max_nbors,maxspecial,cell_size,gpu_split,  // base-class setup; receives the kernel source symbol and kernel name
+                            _screen,lj_expand_coul_long,"k_lj_expand_coul_long");
+  if (success!=0)  // base-class error code is returned unchanged
+    return success;
+
+  // If atom type constants fit in shared memory use fast kernel
+  int lj_types=ntypes;
+  shared_types=false;
+  int max_shared_types=this->device->max_shared_types();
+  if (lj_types<=max_shared_types && this->_block_size>=max_shared_types) {
+    lj_types=max_shared_types;  // tables are padded to max_shared_types per dimension in this case
+    shared_types=true;
+  }
+  _lj_types=lj_types;  // remembered for reinit() and for the lj_types kernel argument
+
+  // Allocate a host write buffer for data initialization
+  UCL_H_Vec<numtyp> host_write(lj_types*lj_types*32,*(this->ucl_device),  // NOTE(review): 32 values per type pair are allocated, only the first lj_types*lj_types are zeroed below
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<lj_types*lj_types; i++)
+    host_write[i]=0.0;
+
+  lj1.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj1,host_write,host_lj1,host_lj2,  // lj1 <- (lj1, lj2, cutsq, cut_ljsq)
+           host_cutsq, host_cut_ljsq);
+
+  lj3.alloc(lj_types*lj_types,*(this->ucl_device),UCL_READ_ONLY);
+  this->atom->type_pack4(ntypes,lj_types,lj3,host_write,host_lj3,host_lj4,  // lj3 <- (lj3, lj4, offset, shift)
+                         host_offset,host_shift);
+
+  sp_lj.alloc(8,*(this->ucl_device),UCL_READ_ONLY);
+  for (int i=0; i<4; i++) {
+    host_write[i]=host_special_lj[i];  // slots 0-3: special LJ factors
+    host_write[i+4]=host_special_coul[i];  // slots 4-7: special Coulomb factors
+  }
+  ucl_copy(sp_lj,host_write,8,false);
+
+  _cut_coulsq=host_cut_coulsq;  // scalar constants passed to the kernel on every launch
+  _qqrd2e=qqrd2e;
+  _g_ewald=g_ewald;
+
+  _allocated=true;  // from here on clear() has something to release
+  this->_max_bytes=lj1.row_bytes()+lj3.row_bytes()+sp_lj.row_bytes();
+  return 0;
+}
+
+template <class numtyp, class acctyp>
+void LJExpandCoulLongT::reinit(const int ntypes, double **host_cutsq, double **host_lj1,
+                         double **host_lj2, double **host_lj3, double **host_lj4,
+                         double **host_offset, double **host_shift, double **host_cut_ljsq) {
+  // Allocate a host write buffer for data initialization (same size as in init(), using the stored _lj_types)
+  UCL_H_Vec<numtyp> host_write(_lj_types*_lj_types*32,*(this->ucl_device),
+                               UCL_WRITE_ONLY);
+
+  for (int i=0; i<_lj_types*_lj_types; i++)
+    host_write[i]=0.0;
+
+  this->atom->type_pack4(ntypes,_lj_types,lj1,host_write,host_lj1,host_lj2,  // repack into the already allocated lj1; no alloc here
+                         host_cutsq, host_cut_ljsq);
+  this->atom->type_pack4(ntypes,_lj_types,lj3,host_write,host_lj3,host_lj4,  // only lj1 and lj3 are refreshed; sp_lj and the Coulomb scalars keep their init() values
+                         host_offset,host_shift);
+}
+
+template <class numtyp, class acctyp>
+void LJExpandCoulLongT::clear() {
+  if (!_allocated)  // nothing to release before a successful init(); makes repeated calls harmless
+    return;
+  _allocated=false;
+
+  lj1.clear();  // the three containers allocated in init()
+  lj3.clear();
+  sp_lj.clear();
+  this->clear_atomic();  // base-class cleanup (implementation not in this file)
+}
+
+template <class numtyp, class acctyp>
+double LJExpandCoulLongT::host_memory_usage() const {
+  return this->host_memory_usage_atomic()+sizeof(LJExpandCoulLong<numtyp,acctyp>);  // base-class figure plus the size of this object
+}
+
+// ---------------------------------------------------------------------------
+// Launch the pair kernel that accumulates forces, energies, and virials
+// ---------------------------------------------------------------------------
+template <class numtyp, class acctyp>
+void LJExpandCoulLongT::loop(const bool _eflag, const bool _vflag) {
+  // Compute the block size and grid size to keep all cores busy
+  const int BX=this->block_size();
+  int eflag, vflag;  // int copies of the bool flags; their addresses are handed to the kernel below
+  if (_eflag)
+    eflag=1;
+  else
+    eflag=0;
+
+  if (_vflag)
+    vflag=1;
+  else
+    vflag=0;
+
+  int GX=static_cast<int>(ceil(static_cast<double>(this->ans->inum())/  // blocks needed: inum atoms at BX/_threads_per_atom atoms per block, rounded up
+                               (BX/this->_threads_per_atom)));
+
+  int ainum=this->ans->inum();
+  int nbor_pitch=this->nbor->nbor_pitch();
+  this->time_pair.start();  // timer brackets the kernel launch
+  if (shared_types) {  // set by init() when the type tables fit the shared-memory limit
+    this->k_pair_fast.set_size(GX,BX);
+    this->k_pair_fast.run(&this->atom->x, &lj1, &lj3, &sp_lj,  // fast variant takes no lj_types argument
+                          &this->nbor->dev_nbor, &this->_nbor_data->begin(),
+                          &this->ans->force, &this->ans->engv, &eflag,
+                          &vflag, &ainum, &nbor_pitch, &this->atom->q,
+                          &_cut_coulsq, &_qqrd2e, &_g_ewald,
+                          &this->_threads_per_atom);
+  } else {
+    this->k_pair.set_size(GX,BX);
+    this->k_pair.run(&this->atom->x, &lj1, &lj3,
+                     &_lj_types, &sp_lj, &this->nbor->dev_nbor,  // general variant needs the table row length
+                     &this->_nbor_data->begin(), &this->ans->force,
+                     &this->ans->engv, &eflag, &vflag, &ainum,
+                     &nbor_pitch, &this->atom->q, &_cut_coulsq,
+                     &_qqrd2e, &_g_ewald, &this->_threads_per_atom);
+  }
+  this->time_pair.stop();
+}
+
+template class LJExpandCoulLong<PRECISION,ACC_PRECISION>;
--- a/lib/gpu/lal_lj_expand_coul_long.cu
+++ b/lib/gpu/lal_lj_expand_coul_long.cu
@ -0,0 +1,272 @@
+// **************************************************************************
+//                            lj_expand_coul_long.cu
+//                             -------------------
+//                           Trung Nguyen (Northwestern)
+//
+//  Device code for acceleration of the lj/expand/coul/long pair style
+//
+// __________________________________________________________________________
+//    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+// __________________________________________________________________________
+//
+//    begin                :
+//    email                : ndactrung@gmail.com
+// ***************************************************************************/
+
+#ifdef NV_KERNEL
+
+#include "lal_aux_fun1.h"
+#ifndef _DOUBLE_DOUBLE
+texture<float4> pos_tex;
+texture<float> q_tex;
+#else
+texture<int4,1> pos_tex;
+texture<int2> q_tex;
+#endif
+
+#else
+#define pos_tex x_
+#define q_tex q_
+#endif
+
+__kernel void k_lj_expand_coul_long(const __global numtyp4 *restrict x_,  // general kernel: coefficient tables are read from global memory with row length lj_types
+                             const __global numtyp4 *restrict lj1,
+                             const __global numtyp4 *restrict lj3,
+                             const int lj_types,
+                             const __global numtyp *restrict sp_lj_in,
+                             const __global int *dev_nbor,
+                             const __global int *dev_packed,
+                             __global acctyp4 *restrict ans,
+                             __global acctyp *restrict engv,
+                             const int eflag, const int vflag, const int inum,
+                             const int nbor_pitch,
+                             const __global numtyp *restrict q_,
+                             const numtyp cut_coulsq, const numtyp qqrd2e,
+                             const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);  // sets ii, tid and offset (defined outside this file)
+
+  __local numtyp sp_lj[8];  // local copy of the special factors; there is no tid guard, so every work-item writes the same values
+  sp_lj[0]=sp_lj_in[0];
+  sp_lj[1]=sp_lj_in[1];
+  sp_lj[2]=sp_lj_in[2];
+  sp_lj[3]=sp_lj_in[3];
+  sp_lj[4]=sp_lj_in[4];
+  sp_lj[5]=sp_lj_in[5];
+  sp_lj[6]=sp_lj_in[6];
+  sp_lj[7]=sp_lj_in[7];
+
+  acctyp energy=(acctyp)0;  // per-work-item accumulators
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  if (ii<inum) {  // work-items beyond the last atom do nothing
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,  // sets i, numj, n_stride, nbor_end and nbor (defined outside this file)
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int itype=ix.w;  // atom type travels in the w component of the position
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];  // kept as 1 - special_coul; subtracted from the erfc terms below
+      j &= NEIGHMASK;  // after the sbmask lookups j is reduced to a plain atom index
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int jtype=jx.w;
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      int mtype=itype*lj_types+jtype;  // flat index into the lj_types x lj_types tables
+      if (rsq<lj1[mtype].z) {  // lj1.z: overall cutoff squared for this type pair
+        numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
+        numtyp r2inv=ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+
+        if (rsq < lj1[mtype].w) {  // lj1.w: LJ cutoff squared
+          numtyp rshift = r - lj3[mtype].w;  // lj3.w: shift of the expanded LJ potential
+          numtyp rshiftsq = rshift*rshift;
+          numtyp rshift2inv = ucl_recip(rshiftsq);
+          r6inv = rshift2inv*rshift2inv*rshift2inv;
+          force_lj = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+          force_lj *= factor_lj/rshift/r;  // ends up as a multiplier for delx/dely/delz
+        } else
+          force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;  // polynomial in t times exp(-grij*grij); presumably the usual erfc approximation - constants are defined elsewhere
+          fetch(prefactor,j,q_tex);  // prefactor starts out as the charge of atom j
+          prefactor *= qqrd2e * qtmp/r;
+          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
+        } else
+          forcecoul = (numtyp)0.0;
+
+        force = force_lj + forcecoul*r2inv;  // Coulomb part still needs the 1/rsq factor, the LJ part already has its 1/r
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq)  // same guard as above, so prefactor and _erfc are set here
+            e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < lj1[mtype].w) {  // same guard as above, so r6inv is set here
+            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+            energy+=factor_lj*(e-lj3[mtype].z);  // lj3.z: energy offset
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,  // hands the accumulators to ans/engv (helper defined outside this file)
+                    vflag,ans,engv);
+  } // if ii
+}
+
+__kernel void k_lj_expand_coul_long_fast(const __global numtyp4 *restrict x_,  // fast kernel: same pair math as k_lj_expand_coul_long, with lj1/lj3 staged in __local arrays
+                                  const __global numtyp4 *restrict lj1_in,
+                                  const __global numtyp4 *restrict lj3_in,
+                                  const __global numtyp *restrict sp_lj_in,
+                                  const __global int *dev_nbor,
+                                  const __global int *dev_packed,
+                                  __global acctyp4 *restrict ans,
+                                  __global acctyp *restrict engv,
+                                  const int eflag, const int vflag,
+                                  const int inum,  const int nbor_pitch,
+                                  const __global numtyp *restrict q_,
+                                  const numtyp cut_coulsq, const numtyp qqrd2e,
+                                  const numtyp g_ewald, const int t_per_atom) {
+  int tid, ii, offset;
+  atom_info(t_per_atom,ii,tid,offset);  // sets ii, tid and offset (defined outside this file)
+
+  __local numtyp4 lj1[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp4 lj3[MAX_SHARED_TYPES*MAX_SHARED_TYPES];
+  __local numtyp sp_lj[8];
+  if (tid<8)  // work-items 0-7 each copy one special factor
+    sp_lj[tid]=sp_lj_in[tid];
+  if (tid<MAX_SHARED_TYPES*MAX_SHARED_TYPES) {  // NOTE(review): fills the whole table only if the work-group has at least MAX_SHARED_TYPES*MAX_SHARED_TYPES work-items - confirm against the host-side check
+    lj1[tid]=lj1_in[tid];
+    lj3[tid]=lj3_in[tid];
+  }
+
+  acctyp energy=(acctyp)0;  // per-work-item accumulators
+  acctyp e_coul=(acctyp)0;
+  acctyp4 f;
+  f.x=(acctyp)0; f.y=(acctyp)0; f.z=(acctyp)0;
+  acctyp virial[6];
+  for (int i=0; i<6; i++)
+    virial[i]=(acctyp)0;
+
+  __syncthreads();  // local tables must be fully written before any work-item reads them
+
+  if (ii<inum) {  // work-items beyond the last atom do nothing
+    int nbor, nbor_end;
+    int i, numj;
+    __local int n_stride;
+    nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,  // sets i, numj, n_stride, nbor_end and nbor (defined outside this file)
+              n_stride,nbor_end,nbor);
+
+    numtyp4 ix; fetch4(ix,i,pos_tex); //x_[i];
+    numtyp qtmp; fetch(qtmp,i,q_tex);
+    int iw=ix.w;  // atom type travels in the w component of the position
+    int itype=fast_mul((int)MAX_SHARED_TYPES,iw);  // row offset: the local tables have a fixed row length of MAX_SHARED_TYPES
+
+    for ( ; nbor<nbor_end; nbor+=n_stride) {
+      int j=dev_packed[nbor];
+
+      numtyp factor_lj, factor_coul;
+      factor_lj = sp_lj[sbmask(j)];
+      factor_coul = (numtyp)1.0-sp_lj[sbmask(j)+4];  // kept as 1 - special_coul; subtracted from the erfc terms below
+      j &= NEIGHMASK;  // after the sbmask lookups j is reduced to a plain atom index
+
+      numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
+      int mtype=itype+jx.w;  // row offset plus the type of atom j
+
+      // Compute r12
+      numtyp delx = ix.x-jx.x;
+      numtyp dely = ix.y-jx.y;
+      numtyp delz = ix.z-jx.z;
+      numtyp rsq = delx*delx+dely*dely+delz*delz;
+
+      if (rsq<lj1[mtype].z) {  // lj1.z: overall cutoff squared for this type pair
+        numtyp forcecoul, force_lj, force, r6inv, prefactor, _erfc;
+        numtyp r2inv=ucl_recip(rsq);
+        numtyp r = ucl_sqrt(rsq);
+
+        if (rsq < lj1[mtype].w) {  // lj1.w: LJ cutoff squared
+          numtyp rshift = r - lj3[mtype].w;  // lj3.w: shift of the expanded LJ potential
+          numtyp rshiftsq = rshift*rshift;
+          numtyp rshift2inv = ucl_recip(rshiftsq);
+          r6inv = rshift2inv*rshift2inv*rshift2inv;
+          force_lj = r6inv*(lj1[mtype].x*r6inv-lj1[mtype].y);
+          force_lj *= factor_lj/rshift/r;  // ends up as a multiplier for delx/dely/delz
+        } else
+          force_lj = (numtyp)0.0;
+
+        if (rsq < cut_coulsq) {
+          numtyp grij = g_ewald * r;
+          numtyp expm2 = ucl_exp(-grij*grij);
+          numtyp t = ucl_recip((numtyp)1.0 + EWALD_P*grij);
+          _erfc = t * (A1+t*(A2+t*(A3+t*(A4+t*A5)))) * expm2;  // polynomial in t times exp(-grij*grij); presumably the usual erfc approximation - constants are defined elsewhere
+          fetch(prefactor,j,q_tex);  // prefactor starts out as the charge of atom j
+          prefactor *= qqrd2e * qtmp/r;
+          forcecoul = prefactor * (_erfc + EWALD_F*grij*expm2-factor_coul);
+        } else
+          forcecoul = (numtyp)0.0;
+
+        force = force_lj + forcecoul*r2inv;  // Coulomb part still needs the 1/rsq factor, the LJ part already has its 1/r
+
+        f.x+=delx*force;
+        f.y+=dely*force;
+        f.z+=delz*force;
+
+        if (eflag>0) {
+          if (rsq < cut_coulsq)  // same guard as above, so prefactor and _erfc are set here
+            e_coul += prefactor*(_erfc-factor_coul);
+          if (rsq < lj1[mtype].w) {  // same guard as above, so r6inv is set here
+            numtyp e=r6inv*(lj3[mtype].x*r6inv-lj3[mtype].y);
+            energy+=factor_lj*(e-lj3[mtype].z);  // lj3.z: energy offset
+          }
+        }
+        if (vflag>0) {
+          virial[0] += delx*delx*force;
+          virial[1] += dely*dely*force;
+          virial[2] += delz*delz*force;
+          virial[3] += delx*dely*force;
+          virial[4] += delx*delz*force;
+          virial[5] += dely*delz*force;
+        }
+      }
+
+    } // for nbor
+    store_answers_q(f,energy,e_coul,virial,ii,inum,tid,t_per_atom,offset,eflag,  // hands the accumulators to ans/engv (helper defined outside this file)
+                    vflag,ans,engv);
+  } // if ii
+}
+
--- a/lib/gpu/lal_lj_expand_coul_long.h
+++ b/lib/gpu/lal_lj_expand_coul_long.h
@ -0,0 +1,88 @@
+/***************************************************************************
+                             lj_expand_coul_long.h
+                             -------------------
+                            Trung Nguyen (Northwestern)
+
+  Class for acceleration of the lj/expand/coul/long pair style.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#ifndef LAL_LJ_EXPAND_COUL_LONG_H
+#define LAL_LJ_EXPAND_COUL_LONG_H
+
+#include "lal_base_charge.h"
+
+namespace LAMMPS_AL {
+
+template <class numtyp, class acctyp>
+class LJExpandCoulLong : public BaseCharge<numtyp, acctyp> {
+ public:
+  LJExpandCoulLong();
+  ~LJExpandCoulLong();
+
+  /// Clear any previous data and set up for a new LAMMPS run
+  /** \param max_nbors initial number of rows in the neighbor matrix
+    * \param cell_size cutoff + skin
+    * \param gpu_split fraction of particles handled by device
+    *
+    * Returns:
+    * -  0 if successful
+    * - -1 if fix gpu not found
+    * - -3 if there is an out of memory error
+    * - -4 if the GPU library was not compiled for GPU
+    * - -5 Double precision is not supported on card **/
+  int init(const int ntypes, double **host_cutsq,
+           double **host_lj1, double **host_lj2, double **host_lj3,
+           double **host_lj4, double **host_offset, double **host_shift, double *host_special_lj,
+           const int nlocal, const int nall, const int max_nbors,
+           const int maxspecial, const double cell_size,
+           const double gpu_split, FILE *screen, double **host_cut_ljsq,
+           const double host_cut_coulsq, double *host_special_coul,
+           const double qqrd2e, const double g_ewald);
+
+  /// Send updated coeffs from host to device (to be compatible with fix adapt); repacks lj1 and lj3 only
+  void reinit(const int ntypes, double **host_cutsq,
+              double **host_lj1, double **host_lj2, double **host_lj3,
+              double **host_lj4, double **host_offset, double **host_shift, double **host_cut_ljsq);
+
+  /// Clear all host and device data
+  /** \note Does nothing unless a previous init() completed **/
+  void clear();
+
+  /// Returns memory usage on device per atom
+  int bytes_per_atom(const int max_nbors) const;
+
+  /// Total host memory used by library for pair style
+  double host_memory_usage() const;
+
+  // --------------------------- TYPE DATA --------------------------
+
+  /// lj1.x = lj1, lj1.y = lj2, lj1.z = cutsq (overall), lj1.w = cut_ljsq (LJ only)
+  UCL_D_Vec<numtyp4> lj1;
+  /// lj3.x = lj3, lj3.y = lj4, lj3.z = offset, lj3.w = shift
+  UCL_D_Vec<numtyp4> lj3;
+  /// Special LJ values [0-3] and Special Coul values [4-7]
+  UCL_D_Vec<numtyp> sp_lj;
+
+  /// If atom type constants fit in shared memory, use fast kernels
+  bool shared_types;
+
+  /// Row length of the lj1/lj3 tables: ntypes, or max_shared_types when shared_types is set
+  int _lj_types;
+
+  numtyp _cut_coulsq, _qqrd2e, _g_ewald;  // scalar kernel arguments, stored by init()
+
+ private:
+  bool _allocated;  // true between a successful init() and the next clear()
+  void loop(const bool _eflag, const bool _vflag);  // picks and launches the pair kernel
+};
+
+}
+
+#endif
--- a/lib/gpu/lal_lj_expand_coul_long_ext.cpp
+++ b/lib/gpu/lal_lj_expand_coul_long_ext.cpp
@ -0,0 +1,152 @@
+/***************************************************************************
+                           lj_expand_coul_long_ext.cpp
+                            ------------------------
+                            Trung Nguyen (Northwestern)
+
+  Functions for LAMMPS access to lj/expand/coul/long acceleration routines.
+
+ __________________________________________________________________________
+    This file is part of the LAMMPS Accelerator Library (LAMMPS_AL)
+ __________________________________________________________________________
+
+    begin                :
+    email                : ndactrung@gmail.com
+ ***************************************************************************/
+
+#include <iostream>
+#include <cassert>
+#include <cmath>
+
+#include "lal_lj_expand_coul_long.h"
+
+using namespace std;
+using namespace LAMMPS_AL;
+
+static LJExpandCoulLong<PRECISION,ACC_PRECISION> LJECLMF;
+
+// ---------------------------------------------------------------------------
+// Allocate memory on host and device and copy constants to device; returns the status of LJECLMF.init() (0 if successful)
+// ---------------------------------------------------------------------------
+int ljecl_gpu_init(const int ntypes, double **cutsq, double **host_lj1,
+                  double **host_lj2, double **host_lj3, double **host_lj4,
+                  double **offset, double **shift, double *special_lj, const int inum,
+                  const int nall, const int max_nbors, const int maxspecial,
+                  const double cell_size, int &gpu_mode, FILE *screen,
+                  double **host_cut_ljsq, double host_cut_coulsq,
+                  double *host_special_coul, const double qqrd2e,
+                  const double g_ewald) {
+  LJECLMF.clear();  // drop host/device data left from an earlier init
+  gpu_mode=LJECLMF.device->gpu_mode();  // handed back to the caller through the reference argument
+  double gpu_split=LJECLMF.device->particle_split();
+  int first_gpu=LJECLMF.device->first_device();
+  int last_gpu=LJECLMF.device->last_device();
+  int world_me=LJECLMF.device->world_me();
+  int gpu_rank=LJECLMF.device->gpu_rank();
+  int procs_per_gpu=LJECLMF.device->procs_per_gpu();
+
+  LJECLMF.device->init_message(screen,"lj/expand/coul/long",first_gpu,last_gpu);
+
+  bool message=false;  // progress text is printed only by replica 0 and only when a screen stream exists
+  if (LJECLMF.device->replica_me()==0 && screen)
+    message=true;
+
+  if (message) {
+    fprintf(screen,"Initializing Device and compiling on process 0...");
+    fflush(screen);
+  }
+
+  int init_ok=0;
+  if (world_me==0)  // world rank 0 initializes before every other rank
+    init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                        offset, shift, special_lj, inum, nall, 300, maxspecial,  // NOTE(review): literal 300 is passed as max_nbors; the max_nbors argument is unused
+                        cell_size, gpu_split, screen, host_cut_ljsq,
+                        host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
+  LJECLMF.device->world_barrier();  // presumably holds all ranks until rank 0 is done - confirm in Device
+  if (message)
+    fprintf(screen,"Done.\n");
+
+  for (int i=0; i<procs_per_gpu; i++) {  // pass i: ranks with gpu_rank==i (other than world rank 0) initialize
+    if (message) {
+      if (last_gpu-first_gpu==0)
+        fprintf(screen,"Initializing Device %d on core %d...",first_gpu,i);
+      else
+        fprintf(screen,"Initializing Devices %d-%d on core %d...",first_gpu,
+                last_gpu,i);
+      fflush(screen);
+    }
+    if (gpu_rank==i && world_me!=0)
+      init_ok=LJECLMF.init(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                          offset, shift, special_lj, inum, nall, 300, maxspecial,  // NOTE(review): same hard-coded 300 as above
+                          cell_size, gpu_split, screen, host_cut_ljsq,
+                          host_cut_coulsq, host_special_coul, qqrd2e, g_ewald);
+
+    LJECLMF.device->gpu_barrier();  // called by every rank on every pass, whether or not it initialized
+    if (message)
+      fprintf(screen,"Done.\n");
+  }
+  if (message)
+    fprintf(screen,"\n");
+
+  if (init_ok==0)  // overhead estimate is skipped when init reported an error
+    LJECLMF.estimate_gpu_overhead();
+  return init_ok;
+}
+
+// ---------------------------------------------------------------------------
+// Copy updated coeffs from host to device: world rank 0 first, then the other ranks one gpu_rank at a time
+// ---------------------------------------------------------------------------
+void ljecl_gpu_reinit(const int ntypes, double **cutsq, double **host_lj1,
+                    double **host_lj2, double **host_lj3, double **host_lj4,
+                    double **offset, double **shift, double **host_cut_ljsq) {
+  int world_me=LJECLMF.device->world_me();
+  int gpu_rank=LJECLMF.device->gpu_rank();
+  int procs_per_gpu=LJECLMF.device->procs_per_gpu();
+
+  if (world_me==0)
+    LJECLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                  offset, shift, host_cut_ljsq);
+  LJECLMF.device->world_barrier();  // presumably holds all ranks until rank 0 is done - confirm in Device
+
+  for (int i=0; i<procs_per_gpu; i++) {
+    if (gpu_rank==i && world_me!=0)  // world rank 0 already ran reinit above
+      LJECLMF.reinit(ntypes, cutsq, host_lj1, host_lj2, host_lj3, host_lj4,
+                    offset, shift, host_cut_ljsq);
+    LJECLMF.device->gpu_barrier();  // called by every rank on every pass
+  }
+}
+
+void ljecl_gpu_clear() {
+  LJECLMF.clear();  // forwards to the file-static LJExpandCoulLong instance
+}
+
+int** ljecl_gpu_compute_n(const int ago, const int inum_full,
+                         const int nall, double **host_x, int *host_type,
+                         double *sublo, double *subhi, tagint *tag, int **nspecial,
+                         tagint **special, const bool eflag, const bool vflag,
+                         const bool eatom, const bool vatom, int &host_start,
+                         int **ilist, int **jnum,  const double cpu_time,
+                         bool &success, double *host_q, double *boxlo,
+                         double *prd) {
+  return LJECLMF.compute(ago, inum_full, nall, host_x, host_type, sublo,  // thin wrapper: all arguments forwarded in order, int** result returned as is
+                        subhi, tag, nspecial, special, eflag, vflag, eatom,
+                        vatom, host_start, ilist, jnum, cpu_time, success,
+                        host_q, boxlo, prd);
+}
+
+void ljecl_gpu_compute(const int ago, const int inum_full, const int nall,
+                      double **host_x, int *host_type, int *ilist, int *numj,
+                      int **firstneigh, const bool eflag, const bool vflag,
+                      const bool eatom, const bool vatom, int &host_start,
+                      const double cpu_time, bool &success, double *host_q,
+                      const int nlocal, double *boxlo, double *prd) {
+  LJECLMF.compute(ago,inum_full,nall,host_x,host_type,ilist,numj,  // thin wrapper: overload taking caller-supplied ilist/numj/firstneigh
+                firstneigh,eflag,vflag,eatom,vatom,host_start,cpu_time,success,
+                host_q,nlocal,boxlo,prd);
+}
+
+double ljecl_gpu_bytes() {
+  return LJECLMF.host_memory_usage();  // base-class host usage plus sizeof the LJExpandCoulLong object
+}
+
+
--- a/lib/gpu/lal_neighbor_gpu.cu
+++ b/lib/gpu/lal_neighbor_gpu.cu
@ -118,24 +118,24 @@ __kernel void transpose(__global tagint *restrict out,
                         const __global tagint *restrict in,
                         int columns_in, int rows_in)
 {
-        __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];
+  __local tagint block[BLOCK_CELL_2D][BLOCK_CELL_2D+1];  // __local staging tile; each row carries one extra column of padding

-        unsigned ti=THREAD_ID_X;
-        unsigned tj=THREAD_ID_Y;
-        unsigned bi=BLOCK_ID_X;
-        unsigned bj=BLOCK_ID_Y;
+  unsigned ti=THREAD_ID_X;
+  unsigned tj=THREAD_ID_Y;
+  unsigned bi=BLOCK_ID_X;
+  unsigned bj=BLOCK_ID_Y;

-        unsigned i=bi*BLOCK_CELL_2D+ti;
-        unsigned j=bj*BLOCK_CELL_2D+tj;
-        if ((i<columns_in) && (j<rows_in))
-                block[tj][ti]=in[j*columns_in+i];
+  unsigned i=bi*BLOCK_CELL_2D+ti;  // column index into the input
+  unsigned j=bj*BLOCK_CELL_2D+tj;  // row index into the input
+  if ((i<columns_in) && (j<rows_in))  // threads outside the input matrix load nothing
+    block[tj][ti]=in[j*columns_in+i];

-        __syncthreads();
+  __syncthreads();  // the tile must be fully written before it is read with swapped indices

-        i=bj*BLOCK_CELL_2D+ti;
-        j=bi*BLOCK_CELL_2D+tj;
-        if ((i<rows_in) && (j<columns_in))
-                out[j*rows_in+i] = block[ti][tj];
+  i=bj*BLOCK_CELL_2D+ti;  // block indices swapped: i,j now address the output
+  j=bi*BLOCK_CELL_2D+tj;
+  if ((i<rows_in) && (j<columns_in))  // output has rows_in columns and columns_in rows
+    out[j*rows_in+i] = block[ti][tj];  // tile is read transposed relative to how it was written
 }

 __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
@ -191,7 +191,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
      nbor_list[pid_i]=pid_i;
    } else {
      stride=0;
-            neigh_counts=host_numj+pid_i-inum;
+      neigh_counts=host_numj+pid_i-inum;
      neigh_list=host_nbor_list+(pid_i-inum)*neigh_bin_size;
    }

@ -232,7 +232,7 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                diff.z = atom_i.z - pos_sh[j].z;

                r2 = diff.x*diff.x + diff.y*diff.y + diff.z*diff.z;
-                if (r2 < cell_size*cell_size && r2 > 1e-5) {
+                if (r2 < cell_size*cell_size && pid_j != pid_i) { //  && r2 > 1e-5
                  cnt++;
                  if (cnt <= neigh_bin_size) {
                    *neigh_list = pid_j;
@ -243,8 +243,8 @@ __kernel void calc_neigh_list_cell(const __global numtyp4 *restrict x_,
                }
              }
            }
-                  __syncthreads();
-                } // for (k)
+            __syncthreads();
+          } // for (k)
        }
      }
    }
--- a/lib/kim/Install.py
+++ b/lib/kim/Install.py
@ -21,7 +21,7 @@ Syntax from lib dir: python Install.py -b -v version  -a kim-name
 specify one or more options, order does not matter

  -v = version of KIM API library to use
-       default = kim-api-v1.9.4 (current as of Apr 2018)
+       default = kim-api-v1.9.5 (current as of May 2018)
  -b = download and build base KIM API library with example Models
       this will delete any previous installation in the current folder
  -n = do NOT download and build base KIM API library.
@ -109,7 +109,7 @@ nargs = len(args)
 if nargs == 0: error()

 thisdir = os.environ['PWD']
-version = "kim-api-v1.9.4"
+version = "kim-api-v1.9.5"

 buildflag = False
 everythingflag = False
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,5 +1,58 @@
 # Change Log

+## [2.7.00](https://github.com/kokkos/kokkos/tree/2.7.00) (2018-05-24)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.6.00...2.7.00)
+
+**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.7**
+
+**Implemented enhancements:**
+
+- Deprecate team\_size auto adjusting to maximal value possible [\#1618](https://github.com/kokkos/kokkos/issues/1618)
+- DynamicView - remove restrictions to std::is\_trivial types and value\_type is power of two [\#1586](https://github.com/kokkos/kokkos/issues/1586)
+- Kokkos::StaticCrsGraph does not propagate memory traits \(e.g., Unmanaged\) [\#1581](https://github.com/kokkos/kokkos/issues/1581)
+- Adding ETI for DeepCopy / ViewFill etc. [\#1578](https://github.com/kokkos/kokkos/issues/1578)
+- Deprecate all the left over KOKKOS\_HAVE\_ Macros and Kokkos\_OldMacros.hpp [\#1572](https://github.com/kokkos/kokkos/issues/1572)
+- Error if Kokkos\_ARCH set in CMake [\#1555](https://github.com/kokkos/kokkos/issues/1555)
+- Deprecate ExecSpace::initialize / ExecSpace::finalize [\#1532](https://github.com/kokkos/kokkos/issues/1532)
+- New API for TeamPolicy property setting [\#1531](https://github.com/kokkos/kokkos/issues/1531)
+- clang 6.0 + cuda debug out-of-memory test failure [\#1521](https://github.com/kokkos/kokkos/issues/1521)
+- Cuda UniqueToken interface not consistent with other backends [\#1505](https://github.com/kokkos/kokkos/issues/1505)
+- Move Reducers out of Experimental namespace [\#1494](https://github.com/kokkos/kokkos/issues/1494)
+- Provide scope guard for initialize/finalize [\#1479](https://github.com/kokkos/kokkos/issues/1479)
+- Check Kokkos::is\_initialized in SharedAllocationRecord dtor [\#1465](https://github.com/kokkos/kokkos/issues/1465)
+- Remove static list of allocations [\#1464](https://github.com/kokkos/kokkos/issues/1464)
+- Makefiles: Support single compile/link line use case [\#1402](https://github.com/kokkos/kokkos/issues/1402)
+- ThreadVectorRange with a range [\#1400](https://github.com/kokkos/kokkos/issues/1400)
+- Exclusive scan + last value API [\#1358](https://github.com/kokkos/kokkos/issues/1358)
+- Install kokkos\_generated\_settings.cmake [\#1348](https://github.com/kokkos/kokkos/issues/1348)
+- Kokkos arrays \(not views!\) don't do bounds checking in debug mode [\#1342](https://github.com/kokkos/kokkos/issues/1342)
+- Expose round-robin GPU assignment outside of initialize\(int, char\*\*\) [\#1318](https://github.com/kokkos/kokkos/issues/1318)
+- DynamicView misses use\_count and label function [\#1298](https://github.com/kokkos/kokkos/issues/1298)
+- View constructor should check arguments [\#1286](https://github.com/kokkos/kokkos/issues/1286)
+- False Positive on Oversubscription Warning [\#1207](https://github.com/kokkos/kokkos/issues/1207)
+- Allow \(require\) execution space for 1st arg of VerifyExecutionCanAccessMemorySpace [\#1192](https://github.com/kokkos/kokkos/issues/1192)
+- ROCm: Add ROCmHostPinnedSpace [\#958](https://github.com/kokkos/kokkos/issues/958)
+- power of two functions [\#656](https://github.com/kokkos/kokkos/issues/656)
+- CUDA 8 has 64bit \_\_shfl [\#361](https://github.com/kokkos/kokkos/issues/361)
+- Add TriBITS/CMake configure information about node types [\#243](https://github.com/kokkos/kokkos/issues/243)
+
+**Fixed bugs:**
+
+- CUDA atomic\_fetch\_sub for doubles is hitting CAS instead of intrinsic [\#1624](https://github.com/kokkos/kokkos/issues/1624)
+- Bug: use of ballot on Volta [\#1612](https://github.com/kokkos/kokkos/issues/1612)
+- Kokkos::deep\_copy memory access failures [\#1583](https://github.com/kokkos/kokkos/issues/1583)
+- g++ -std option doubly set for cmake project [\#1548](https://github.com/kokkos/kokkos/issues/1548)
+- ViewFill for 1D Views of larger 32bit entries fails [\#1541](https://github.com/kokkos/kokkos/issues/1541)
+- CUDA Volta another warpsync bug [\#1520](https://github.com/kokkos/kokkos/issues/1520)
+- triple\_nested\_parallelism fails with KOKKOS\_DEBUG and CUDA [\#1513](https://github.com/kokkos/kokkos/issues/1513)
+- Jenkins errors in Kokkos\_SharedAlloc.cpp with debug build [\#1511](https://github.com/kokkos/kokkos/issues/1511)
+- Kokkos::Sort out-of-bounds with empty bins [\#1504](https://github.com/kokkos/kokkos/issues/1504)
+- Get rid of deprecated functions inside Kokkos [\#1484](https://github.com/kokkos/kokkos/issues/1484)
+- get\_work\_partition casts int64\_t to int, causing a seg fault [\#1481](https://github.com/kokkos/kokkos/issues/1481)
+- NVCC bug with \_\_device\_\_ on defaulted function [\#1470](https://github.com/kokkos/kokkos/issues/1470)
+- CMake example broken with CUDA backend [\#1468](https://github.com/kokkos/kokkos/issues/1468)
+
+
 ## [2.6.00](https://github.com/kokkos/kokkos/tree/2.6.00) (2018-03-07)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.5.00...2.6.00)

--- a/lib/kokkos/CMakeLists.txt
+++ b/lib/kokkos/CMakeLists.txt
@ -44,6 +44,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
        "${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings")
  endif()
  include(${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake)
+  install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION lib/cmake/Kokkos)
  string(REPLACE " " ";" KOKKOS_TPL_INCLUDE_DIRS "${KOKKOS_GMAKE_TPL_INCLUDE_DIRS}")
  string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_DIRS "${KOKKOS_GMAKE_TPL_LIBRARY_DIRS}")
  string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_NAMES "${KOKKOS_GMAKE_TPL_LIBRARY_NAMES}")
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -1,7 +1,9 @@
 # Default settings common options.

 #LAMMPS specific settings:
-KOKKOS_PATH=../../lib/kokkos
+ifndef KOKKOS_PATH
+  KOKKOS_PATH=../../lib/kokkos
+endif
 CXXFLAGS=$(CCFLAGS)

 # Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
@ -21,8 +23,10 @@ KOKKOS_DEBUG ?= "no"
 KOKKOS_USE_TPLS ?= ""
 # Options: c++11,c++1z
 KOKKOS_CXX_STANDARD ?= "c++11"
-# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code
+# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
 KOKKOS_OPTIONS ?= ""
+# Option for setting ETI path
+KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti

 # Default settings specific options.
 # Options: force_uvm,use_ldg,rdc,enable_lambda
@ -51,10 +55,12 @@ KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),
 KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_deprecated_code)
 KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check)
 KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print)
+KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_large_mem_tests)
 KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),use_ldg)
 KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm)
 KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc)
 KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda)
+KOKKOS_INTERNAL_ENABLE_ETI := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_eti)


 # Check for Kokkos Host Execution Spaces one of which must be on.
@ -78,7 +84,12 @@ KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),O

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
-  CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+  ifeq ($(origin CUDA_PATH), undefined)
+    CUDA_PATH = $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+  endif
+  ifeq ($(CUDA_PATH),)
+    CUDA_PATH = $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=)
+  endif
  KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
 endif

@ -116,7 +127,7 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-  KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
+  KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell $(CXX) --version | grep version | cut -d ' ' -f3 | tr -d '.')

  ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
@ -323,12 +334,13 @@ endif

 # Generating the list of Flags.

-KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
+# KOKKOS_CPPFLAGS is intentionally left empty; include paths now go into KOKKOS_CXXFLAGS
+KOKKOS_CPPFLAGS =
+KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
 KOKKOS_TPL_INCLUDE_DIRS =
 KOKKOS_TPL_LIBRARY_DIRS =
 KOKKOS_TPL_LIBRARY_NAMES =

-KOKKOS_CXXFLAGS =
 ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS)
 endif
@ -336,6 +348,8 @@ endif
 KOKKOS_LIBS = -ldl
 KOKKOS_TPL_LIBRARY_NAMES += dl
 KOKKOS_LDFLAGS = -L$(shell pwd)
+# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
+KOKKOS_CXXLDFLAGS = -L$(shell pwd)
 KOKKOS_LINK_FLAGS = 
 KOKKOS_SRC =
 KOKKOS_HEADERS =
@ -362,7 +376,7 @@ tmp := $(call kokkos_append_header,'\#endif')
 tmp := $(call kokkos_append_header,"/* Execution Spaces */")

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CUDA")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
@ -374,19 +388,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
-  tmp := $(call kokkos_append_header,'\#define KOKKOS_HAVE_OPENMP')
+  tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMP')
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_PTHREAD")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_THREADS")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_QTHREADS")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_QTHREADS")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_SERIAL")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_SERIAL")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_TM), 1)
@ -422,13 +436,13 @@ endif
 tmp := $(call kokkos_append_header,"/* General Settings */")
 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX11")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
 endif

 ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
  KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX11")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX1Z")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX1Z")
 endif

 ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
@ -437,9 +451,9 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
  endif

  KOKKOS_CXXFLAGS += -g
-  KOKKOS_LDFLAGS += -g -ldl
+  KOKKOS_LDFLAGS += -g
  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK")
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_DEBUG")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG")
  ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0)
    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK")
  endif
@ -451,14 +465,15 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
  ifneq ($(HWLOC_PATH),)
-    KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
+    KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
    KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
+    KOKKOS_CXXLDFLAGS += -L$(HWLOC_PATH)/lib
    KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include
    KOKKOS_TPL_LIBRARY_DIRS += $(HWLOC_PATH)/lib
  endif
  KOKKOS_LIBS += -lhwloc
  KOKKOS_TPL_LIBRARY_NAMES += hwloc
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_HWLOC")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HWLOC")
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
@ -469,14 +484,15 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  ifneq ($(MEMKIND_PATH),)
-    KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
+    KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
    KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
+    KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib
    KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
    KOKKOS_TPL_LIBRARY_DIRS += $(MEMKIND_PATH)/lib
  endif
  KOKKOS_LIBS += -lmemkind -lnuma
  KOKKOS_TPL_LIBRARY_NAMES += memkind numa
-  tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_HBWSPACE")
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_HBWSPACE")
 endif

 ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0)
@ -486,6 +502,13 @@ endif
 ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 0)
  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEPRECATED_CODE")
 endif
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_ETI")
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_LARGE_MEM_TESTS), 1)
+  tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LARGE_MEM_TESTS")
+endif

 tmp := $(call kokkos_append_header,"/* Optimization Settings */")

@ -497,27 +520,35 @@ tmp := $(call kokkos_append_header,"/* Cuda Settings */")

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LDG_INTRINSIC")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
  else
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LDG_INTRINSIC")
+      tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC")
    endif
  endif

  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_UVM")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_UVM")
  endif

  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE")
    KOKKOS_CXXFLAGS += --relocatable-device-code=true
    KOKKOS_LDFLAGS += --relocatable-device-code=true
  endif

+  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+    ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -ge 90; echo $$?),0)
+      # This diagnostic is just plain wrong in CUDA 9
+      # See https://github.com/kokkos/kokkos/issues/1470
+      KOKKOS_CXXFLAGS += -Xcudafe --diag_suppress=esa_on_defaulted_function_ignored
+    endif
+  endif
+
  ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
    ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
      ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0)
-        tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LAMBDA")
+        tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LAMBDA")
        KOKKOS_CXXFLAGS += -expt-extended-lambda
      else
        $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
@ -525,12 +556,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
    endif

    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-      tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LAMBDA")
+      tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA_LAMBDA")
    endif
  endif

  ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
-    tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_CLANG_WORKAROUND")
+    tmp := $(call kokkos_append_header,"\#define KOKKOS_IMPL_CUDA_CLANG_WORKAROUND")
  endif
 endif

@ -907,10 +938,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)

  KOKKOS_CXXFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --cxxflags) 
  KOKKOS_LDFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --ldflags) -lhc_am -lm 
+  KOKKOS_CXXLDFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --ldflags) -lhc_am -lm
  KOKKOS_TPL_LIBRARY_NAMES += hc_am m
  KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_ROCM_ARCH_FLAG)

  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.cpp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/ROCm/*.cpp)
+endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.hpp)
 endif

@ -937,10 +972,14 @@ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)

 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Cuda/*.cpp)
+endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
  ifneq ($(CUDA_PATH),)
-    KOKKOS_CPPFLAGS += -I$(CUDA_PATH)/include
+    KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
    KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
+    KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
    KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
    KOKKOS_TPL_LIBRARY_DIRS += $(CUDA_PATH)/lib64
    ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
@ -964,6 +1003,9 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/OpenMP/*.cpp)
+endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)

  ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
@ -978,6 +1020,9 @@ endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Threads/*.cpp)
+endif
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
  KOKKOS_LIBS += -lpthread
  KOKKOS_TPL_LIBRARY_NAMES += pthread
@ -987,8 +1032,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
  KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
  KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
  ifneq ($(QTHREADS_PATH),)
-    KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
+    KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
    KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
+    KOKKOS_CXXLDFLAGS += -L$(QTHREADS_PATH)/lib
    KOKKOS_TPL_INCLUDE_DIRS += $(QTHREADS_PATH)/include
    KOKKOS_TPL_LIBRARY_DIRS += $(QTHREADS_PATH)/lib64
  endif
@ -1011,6 +1057,11 @@ endif

 # Don't include Kokkos_Serial.cpp or Kokkos_Serial_Task.cpp if not using Serial
 # device to avoid a link warning.
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  KOKKOS_SRC += $(wildcard $(KOKKOS_ETI_PATH)/Serial/*.cpp)
+endif
+endif
 ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp,$(KOKKOS_SRC))
  KOKKOS_SRC := $(filter-out $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp,$(KOKKOS_SRC))
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@ -31,6 +31,12 @@ Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
 Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp

+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  include $(KOKKOS_ETI_PATH)/Serial/Makefile.eti_Serial
+endif
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@ -40,6 +46,9 @@ Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cu
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  include $(KOKKOS_ETI_PATH)/Cuda/Makefile.eti_Cuda
+endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
@ -51,6 +60,9 @@ Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_RO
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
 Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  include $(KOKKOS_ETI_PATH)/ROCm/Makefile.eti_ROCm
+endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
@ -58,6 +70,9 @@ Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
 Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  include $(KOKKOS_ETI_PATH)/Threads/Makefile.eti_Threads
+endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
@ -72,6 +87,9 @@ Kokkos_OpenMP_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokko
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp
 Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
+  include $(KOKKOS_ETI_PATH)/OpenMP/Makefile.eti_OpenMP
+endif
 endif

 ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@ -702,7 +702,11 @@ namespace Kokkos {
    }
    Random_XorShift64_Pool(uint64_t seed) {
      num_states_ = 0;
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
      init(seed,DeviceType::max_hardware_threads());
+#else
+      init(seed,DeviceType::impl_max_hardware_threads());
+#endif
    }

    Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
@ -751,7 +755,11 @@ namespace Kokkos {

    KOKKOS_INLINE_FUNCTION
    Random_XorShift64<DeviceType> get_state() const {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
      const int i = DeviceType::hardware_thread_id();;
+#else
+      const int i = DeviceType::impl_hardware_thread_id();;
+#endif
      return Random_XorShift64<DeviceType>(state_(i),i);
    }

@ -957,7 +965,11 @@ namespace Kokkos {
    inline
    Random_XorShift1024_Pool(uint64_t seed){
      num_states_ = 0;
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
      init(seed,DeviceType::max_hardware_threads());
+#else
+      init(seed,DeviceType::impl_max_hardware_threads());
+#endif
    }

    Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
@ -1012,7 +1024,11 @@ namespace Kokkos {

    KOKKOS_INLINE_FUNCTION
    Random_XorShift1024<DeviceType> get_state() const {
+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
      const int i = DeviceType::hardware_thread_id();
+#else
+      const int i = DeviceType::impl_hardware_thread_id();
+#endif
      return Random_XorShift1024<DeviceType>(state_,p_(i),i);
    };

--- a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@ -288,6 +288,7 @@ public:
      Kokkos::abort("BinSort::sort: values range length != permutation vector length");
    }

+#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
    scratch_view_type
      sorted_values("Scratch",
                    len,
@ -298,6 +299,18 @@ public:
                    values.extent(5),
                    values.extent(6),
                    values.extent(7));
+#else
+    scratch_view_type
+      sorted_values("Scratch",
+                  values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                  values.rank_dynamic > 1 ? values.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
+                  values.rank_dynamic > 2 ? values.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                  values.rank_dynamic > 3 ? values.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                  values.rank_dynamic > 4 ? values.extent(4) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                  values.rank_dynamic > 5 ? values.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                  values.rank_dynamic > 6 ? values.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
+                  values.rank_dynamic > 7 ? values.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG);
+#endif

    {
      copy_permute_functor< scratch_view_type /* DstViewType */
@ -362,8 +375,10 @@ public:

  KOKKOS_INLINE_FUNCTION
  void operator() (const bin_sort_bins_tag& tag, const int&i )  const {
+    auto bin_size = bin_count_const(i);
+    if (bin_size <= 1) return;
+    int upper_bound = bin_offsets(i)+bin_size;
    bool sorted = false;
-    int upper_bound = bin_offsets(i)+bin_count_const(i);
    while(!sorted) {
      sorted = true;
      int old_idx = sort_order(bin_offsets(i));
@ -501,7 +516,7 @@ bool try_std_sort(ViewType view) {

 template<class ViewType>
 struct min_max_functor {
-  typedef Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> minmax_scalar;
+  typedef Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> minmax_scalar;

  ViewType view;
  min_max_functor(const ViewType& view_):view(view_) {}
@ -523,8 +538,8 @@ void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
  }
  typedef BinOp1D<ViewType> CompType;

-  Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
-  Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
+  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);
  parallel_reduce("Kokkos::Sort::FindExtent",Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
                  Impl::min_max_functor<ViewType>(view),reducer);
  if(result.min_val == result.max_val) return;
@ -542,8 +557,8 @@ void sort( ViewType view
  typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
  typedef BinOp1D<ViewType> CompType;

-  Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
-  Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
+  Kokkos::MinMaxScalar<typename ViewType::non_const_value_type> result;
+  Kokkos::MinMax<typename ViewType::non_const_value_type> reducer(result);

  parallel_reduce("Kokkos::Sort::FindExtent", range_policy( begin , end )
                 , Impl::min_max_functor<ViewType>(view),reducer );
--- a/lib/kokkos/cmake/kokkos_build.cmake
+++ b/lib/kokkos/cmake/kokkos_build.cmake
@ -76,7 +76,11 @@ IF(KOKKOS_SEPARATE_LIBS)
  )

   foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-    find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
+    if ("${lib}" STREQUAL "cuda")  # "cuda" is special-cased: no find_library() lookup is attempted for it
+      set(LIB_cuda "-lcuda")  # a raw linker flag is used instead of a resolved library path
+    else()
+      find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})  # result is stored in LIB_<name>; KOKKOS_TPL_LIBRARY_DIRS are extra search paths
+    endif()
     target_link_libraries(kokkoscore PUBLIC ${LIB_${lib}})
   endforeach()

@ -154,7 +158,11 @@ ELSE()
  )

   foreach(lib IN LISTS KOKKOS_TPL_LIBRARY_NAMES)
-    find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})
+    if ("${lib}" STREQUAL "cuda")  # "cuda" is special-cased: no find_library() lookup is attempted for it
+      set(LIB_cuda "-lcuda")  # a raw linker flag is used instead of a resolved library path
+    else()
+      find_library(LIB_${lib} ${lib} PATHS ${KOKKOS_TPL_LIBRARY_DIRS})  # result is stored in LIB_<name>; KOKKOS_TPL_LIBRARY_DIRS are extra search paths
+    endif()
     target_link_libraries(kokkos PUBLIC ${LIB_${lib}})
   endforeach()

--- a/lib/kokkos/cmake/kokkos_options.cmake
+++ b/lib/kokkos/cmake/kokkos_options.cmake
@ -31,6 +31,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
     Profiling_Load_Print
     Aggressive_Vectorization
     Deprecated_Code
+     Explicit_Instantiation
     )

 #-------------------------------------------------------------------------------
@ -40,6 +41,7 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
 foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
  string(TOUPPER ${opt} OPT )
  IF(DEFINED Kokkos_ENABLE_${opt})
+    MESSAGE("Kokkos_ENABLE_${opt} is defined!")
    IF(DEFINED KOKKOS_ENABLE_${OPT})
      IF(NOT ("${KOKKOS_ENABLE_${OPT}}" STREQUAL "${Kokkos_ENABLE_${opt}}"))
        IF(DEFINED KOKKOS_ENABLE_${OPT}_INTERNAL)
@ -57,18 +59,16 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
      ENDIF()
    ELSE()
      SET(KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT ${Kokkos_ENABLE_${opt}})
+      MESSAGE("set KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT!")
    ENDIF()
  ENDIF()
 endforeach()

+IF(DEFINED Kokkos_ARCH)
+  MESSAGE(FATAL_ERROR "Defined Kokkos_ARCH, use KOKKOS_ARCH instead!")
+ENDIF()
 IF(DEFINED Kokkos_Arch)
-  IF(DEFINED KOKKOS_ARCH)
-    IF(NOT (${KOKKOS_ARCH} STREQUAL "${Kokkos_Arch}"))
-      MESSAGE(FATAL_ERROR "Defined both Kokkos_Arch and KOKKOS_ARCH and they differ!")
-    ENDIF()
-  ELSE()
-    SET(KOKKOS_ARCH ${Kokkos_Arch})
-  ENDIF()
+  MESSAGE(FATAL_ERROR "Defined Kokkos_Arch, use KOKKOS_ARCH instead!")
 ENDIF()
  
 #-------------------------------------------------------------------------------
@ -103,6 +103,8 @@ list(APPEND KOKKOS_ARCH_LIST
     Maxwell53       # (GPU) NVIDIA Maxwell generation CC 5.3
     Pascal60        # (GPU) NVIDIA Pascal generation CC 6.0
     Pascal61        # (GPU) NVIDIA Pascal generation CC 6.1
+     Volta70         # (GPU) NVIDIA Volta generation CC 7.0
+     Volta72         # (GPU) NVIDIA Volta generation CC 7.2
    )

 # List of possible device architectures.
@ -267,6 +269,8 @@ set(KOKKOS_ENABLE_PROFILING_LOAD_PRINT ${KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_P
 set_kokkos_default_default(DEPRECATED_CODE ON)
 set(KOKKOS_ENABLE_DEPRECATED_CODE ${KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE_DEFAULT} CACHE BOOL "Enable deprecated code.")

+set_kokkos_default_default(EXPLICIT_INSTANTIATION ON)
+set(KOKKOS_ENABLE_EXPLICIT_INSTANTIATION ${KOKKOS_INTERNAL_ENABLE_EXPLICIT_INSTANTIATION_DEFAULT} CACHE BOOL "Enable explicit template instantiation.")

 #-------------------------------------------------------------------------------
 #------------------------------- KOKKOS_USE_TPLS -------------------------------
--- a/lib/kokkos/cmake/kokkos_settings.cmake
+++ b/lib/kokkos/cmake/kokkos_settings.cmake
@ -74,6 +74,9 @@ endif()
 if(${KOKKOS_ENABLE_PROFILING_LOAD_PRINT})
      list(APPEND KOKKOS_OPTIONSl enable_profile_load_print)
 endif()
+if(${KOKKOS_ENABLE_EXPLICIT_INSTANTIATION})
+      list(APPEND KOKKOS_OPTIONSl enable_eti)
+endif()
 # List needs to be comma-delimitted
 string(REPLACE ";" "," KOKKOS_GMAKE_OPTIONS "${KOKKOS_OPTIONSl}")

@ -158,6 +161,19 @@ if (NOT "${KOKKOS_INTERNAL_ADDTOPATH}" STREQUAL "")
  set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} "PATH=\"${KOKKOS_INTERNAL_ADDTOPATH}:$ENV{PATH}\"")
 endif()

+if (CMAKE_CXX_STANDARD)
+  if (CMAKE_CXX_STANDARD STREQUAL "98")
+    message(FATAL_ERROR "Kokkos requires C++11 or newer!")
+  endif()
+  set(KOKKOS_CXX_STANDARD "c++${CMAKE_CXX_STANDARD}")
+  if (CMAKE_CXX_EXTENSIONS)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+      set(KOKKOS_CXX_STANDARD "gnu++${CMAKE_CXX_STANDARD}")
+    endif()
+  endif()
+  set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} "KOKKOS_CXX_STANDARD=\"${KOKKOS_CXX_STANDARD}\"")
+endif()
+
 # Final form that gets passed to make
 set(KOKKOS_SETTINGS env ${KOKKOS_SETTINGS})

--- a/lib/kokkos/cmake/tribits.cmake
+++ b/lib/kokkos/cmake/tribits.cmake
@ -300,7 +300,9 @@ FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
  ENDIF()
 ENDFUNCTION()

-ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
+IF(NOT TARGET check)
+  ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
+ENDIF()

 FUNCTION(TRIBITS_ADD_TEST)
 ENDFUNCTION()
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@ -22,30 +22,38 @@ if [[ "$HOSTNAME" =~ .*bowman.* ]]; then
  module load git
 fi

-if [[ "$HOSTNAME" =~ n.* ]]; then # Warning: very generic name
+if [[ "$HOSTNAME" == n* ]]; then # Warning: very generic name
  if [[ "$PROCESSOR" = "aarch64" ]]; then
    MACHINE=sullivan
    module load git
  fi
 fi

-if [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
+if [[ "$HOSTNAME" == node* ]]; then # Warning: very generic name
  if [[ "$MACHINE" = "" ]]; then
    MACHINE=shepard
    module load git
  fi
 fi

-if [[ "$HOSTNAME" =~ apollo ]]; then
+if [[ "$HOSTNAME" == apollo\.* ]]; then
  MACHINE=apollo
  module load git
 fi

-if [[ "$HOSTNAME" =~ sullivan ]]; then
+if [[ "$HOSTNAME" == sullivan ]]; then
  MACHINE=sullivan
  module load git
 fi

+if [[ "$HOSTNAME" == mayer\.* ]]; then
+  MACHINE=mayer
+#  module load git
+fi
+if [[ "$HOSTNAME" == cn* ]]; then # Warning: very generic name
+  MACHINE=mayer
+fi
+
 if [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
  if [[ "$MACHINE" = "" ]]; then
    MACHINE=sems
@ -83,7 +91,7 @@ CUSTOM_BUILD_LIST=""
 QTHREADS_PATH=""
 DRYRUN=False
 BUILD_ONLY=False
-declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
+declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=1
 TEST_SCRIPT=False
 SKIP_HWLOC=False
 SPOT_CHECK=False
@ -142,6 +150,9 @@ do
    --with-cuda-options*)
      KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
      ;;
+    --with-options*)
+      KOKKOS_OPTIONS="--with-options=enable_large_mem_tests,${key#*=}"
+      ;;
    --cxxflags-extra*)
      CXX_FLAGS_EXTRA="${key#*=}"
      ;;
@ -247,7 +258,7 @@ elif [ "$MACHINE" = "white" ]; then
    ARCH_FLAG="--arch=Power8,Kepler37"
  fi

-  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=1

 elif [ "$MACHINE" = "bowman" ]; then
  source /etc/profile.d/modules.sh
@ -268,7 +279,7 @@ elif [ "$MACHINE" = "bowman" ]; then
    ARCH_FLAG="--arch=KNL"
  fi

-  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=1

 elif [ "$MACHINE" = "sullivan" ]; then
  source /etc/profile.d/modules.sh
@ -284,7 +295,24 @@ elif [ "$MACHINE" = "sullivan" ]; then
    ARCH_FLAG="--arch=ARMv8-ThunderX"
  fi

-  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=1
+
+elif [ "$MACHINE" = "mayer" ]; then
+  SKIP_HWLOC=True
+  export SLURM_TASKS_PER_NODE=96
+
+  BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+  ARM_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
+
+  # Format: (compiler module-list build-list exe-name warning-flag)
+  COMPILERS=("gcc/7.2.0 $BASE_MODULE_LIST $ARM_GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
+             "arm/1.4.0 $ARM_MODULE_LIST $ARM_GCC_BUILD_LIST armclang++ $CLANG_WARNING_FLAGS")
+
+  if [ -z "$ARCH_FLAG" ]; then
+    ARCH_FLAG="--arch=ARMv8-TX2"
+  fi
+
+  NUM_JOBS_TO_RUN_IN_PARALLEL=1

 elif [ "$MACHINE" = "shepard" ]; then
  source /etc/profile.d/modules.sh
@ -303,7 +331,7 @@ elif [ "$MACHINE" = "shepard" ]; then
  if [ -z "$ARCH_FLAG" ]; then
    ARCH_FLAG="--arch=HSW"
  fi
-  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=1

 elif [ "$MACHINE" = "apollo" ]; then
  source /projects/sems/modulefiles/utils/sems-modules-init.sh
@ -331,7 +359,7 @@ elif [ "$MACHINE" = "apollo" ]; then
  if [ "$SPOT_CHECK" = "True" ]; then
    # Format: (compiler module-list build-list exe-name warning-flag)
    COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS"
-               "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
+               "gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
               "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
               "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
               "clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
@ -358,7 +386,7 @@ elif [ "$MACHINE" = "apollo" ]; then
    ARCH_FLAG="--arch=SNB,Volta70"
  fi

-  NUM_JOBS_TO_RUN_IN_PARALLEL=2
+  NUM_JOBS_TO_RUN_IN_PARALLEL=1

 else
  echo "Unhandled machine $MACHINE" >&2
@ -627,6 +655,11 @@ single_build_and_test() {
  if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
    local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
  fi
+  if [[ "$KOKKOS_OPTIONS" != "" ]]; then
+    local extra_args="$extra_args $KOKKOS_OPTIONS"
+  else
+    local extra_args="$extra_args --with-options=enable_large_mem_tests"
+  fi    

  echo "  Starting job $desc"

@ -642,7 +675,7 @@ single_build_and_test() {
  else
    run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
    local -i build_start_time=$(date +%s)
-    run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
+    run_cmd make -j 48 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
    local -i build_end_time=$(date +%s)
    comment="build_time=$(($build_end_time-$build_start_time))"

@ -682,6 +715,9 @@ run_in_background() {
    if [[ "$compiler" == cuda* ]]; then
      num_jobs=1
    fi
+    if [[ "$compiler" == clang ]]; then 
+      num_jobs=1
+    fi
  # fi
  wait_for_jobs $num_jobs

--- a/lib/kokkos/containers/performance_tests/TestCuda.cpp
+++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp
@ -70,13 +70,12 @@ protected:
  static void SetUpTestCase()
  {
    std::cout << std::setprecision(5) << std::scientific;
-    Kokkos::HostSpace::execution_space::initialize();
-    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+    Kokkos::InitArguments args(-1, -1, 0);
+    Kokkos::initialize(args);
  }
  static void TearDownTestCase()
  {
-    Kokkos::Cuda::finalize();
-    Kokkos::HostSpace::execution_space::finalize();
+    Kokkos::finalize();
  }
 };

--- a/Show More
+++ b/Show More