Compare commits

...

203 Commits

Author SHA1 Message Date
bb11344424 Merge pull request #1522 from akohlmey/next_version
Step version string for next patch release
2019-06-18 22:01:42 -04:00
e754fb80a4 Merge pull request #1518 from akohlmey/collected-small-changes
Collected small changes and fixes for the next patch release
2019-06-18 16:21:41 -04:00
98fbaef406 workaround for ICE issue with gcc 4.8.x 2019-06-18 15:00:27 -04:00
d98c105d34 step version string to 18 June 2019 2019-06-18 11:52:09 -04:00
961dcfc261 mention alternate build environment generators for cmake 2019-06-18 11:50:06 -04:00
81cdce9b04 flag and document that USER-TALLY computes are not compatible with dynamic groups 2019-06-18 00:14:48 -04:00
995b6b31a2 flag a couple more packages that contain sub-builds of downloaded code 2019-06-17 16:05:54 -04:00
e9666f585f update log files for kolmogorov/crespi/full potential 2019-06-17 15:46:54 -04:00
f4f4a7c850 adapt for the Ninja build tool, which cannot handle sub-builds and Fortran,
so builds of downloaded libraries are not supported right now
2019-06-17 14:24:51 -04:00
5fc3081a55 make building tools (msi2lmp, chain.x) an option, which is off by default 2019-06-17 14:23:30 -04:00
a39a35af20 Merge pull request #1517 from jrgissing/bond/react-clarify-template-check-messages
Bond/react: clarify reaction-template warning messages
2019-06-17 14:14:53 -04:00
1452d3d25b Merge pull request #1516 from gtribello/master
Using PLUMED to compute committor with LAMMPS
2019-06-17 14:14:37 -04:00
2dea4eb0e9 Merge pull request #1513 from akohlmey/user-tally-vs-hybrid
Make USER-TALLY compute styles compatible with hybrid pair styles
2019-06-17 14:14:06 -04:00
5d73b0790f Merge pull request #1510 from evoyiatzis/respa_class_2
Implementation of inner/middle/outer compute methods for lj/class2
2019-06-17 14:13:47 -04:00
93fd33aad9 Merge pull request #1507 from athomps/yarray
Back-porting of Zombie SNAP improvements
2019-06-17 14:13:28 -04:00
6536399fc3 Merge pull request #1476 from uihsnv/log_spacing
A function logfreq3(), for logarithmic spacing
2019-06-17 14:12:59 -04:00
2f29bd29f4 Merge pull request #1430 from ndtrung81/gpu-neigh-hybrid
Enable neighbor build on the device for pair hybrid substyles
2019-06-17 14:12:40 -04:00
b1458ceebf fix typo 2019-06-17 07:42:18 -04:00
92615bda3a update kolmogorov/crespi/full example input for recent change in REBO 2019-06-17 07:38:11 -04:00
4c0cd0a019 remove memory leak by freeing custom MPI data types 2019-06-17 07:36:03 -04:00
fd93c27dcb make valgrind happy by fully initializing line buffer for thermo output 2019-06-17 07:35:30 -04:00
4178c5167b clarify template error/warning message 2019-06-16 14:47:07 -06:00
fd20a0606a Merge pull request #49 from lammps/master
rebase
2019-06-16 14:19:50 -06:00
75e2981bda Using force_timeout call instead of throwing error for plumed stop 2019-06-16 17:52:20 +01:00
98d9a9a4d2 Added call to PLUMED with setStopFlag so that PLUMED can stop LAMMPS if it needs to. This would be needed if you were computing committors, for example 2019-06-16 16:09:32 +01:00
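Editor's note: a minimal sketch of the stop-flag handshake described in the two commits above, simplified and not the actual LAMMPS fix_plumed code. The helper name couple_stop_flag is hypothetical; PLMD::Plumed::cmd() and the "setStopFlag" key are part of the public PLUMED interface.

// Hedged sketch: the MD code hands PLUMED a pointer to an int;
// PLUMED sets it nonzero when, e.g., a committor criterion fires.
#include "Plumed.h"   // PLUMED C++ wrapper header (assumed available)

void couple_stop_flag(PLMD::Plumed &p, int &plumed_stop)
{
  plumed_stop = 0;
  p.cmd("setStopFlag", &plumed_stop);   // PLUMED will write here
}

// ... later, inside the MD loop, after p.cmd("calc"):
//   if (plumed_stop) trigger_timeout();   // e.g. LAMMPS's force_timeout path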
e73b34a5b1 do not run tally callback setup multiple times per time step 2019-06-14 15:46:28 -04:00
ac57b41b4d fix line endings and missing EOL at end 2019-06-14 07:13:06 -04:00
43e3c1520b cleaning pair_lj_class2.h of windows line endings
removing DOS/Windows style CR/LF line endings
2019-06-14 11:50:56 +02:00
e7116c8680 Update pair_class2.txt
Addition of a few lines in the documentation file
2019-06-14 09:04:03 +02:00
4d7d3a5d53 Switched algorithm for compute_yi to one based on zlist ordering 2019-06-13 15:56:18 -06:00
c1550ba29b implementation of inner/middle/outer for lj/class2
Implementation of inner/middle/outer functions in style lj/class2 to enable integration with respa
2019-06-13 22:23:01 +02:00
5fb505ca8c Fixed typo 2019-06-13 10:24:18 -06:00
67a1a63f5f Removed old text 2019-06-13 10:10:37 -06:00
65b87fa278 Updated SNAP in KOKKOS package so it compiles and runs 2019-06-13 09:54:56 -06:00
c5c03230cb Not part of this pull request 2019-06-12 17:05:47 -06:00
be5d3d6a19 Not part of this pull request 2019-06-12 17:04:22 -06:00
f8e257d219 Not part of this pull request 2019-06-12 17:04:05 -06:00
3f523ea906 Not part of this pull request 2019-06-12 17:02:59 -06:00
a973700295 Completed back-porting of Zombie SNAP improvements, particularly noteworthy is reduction in memory footprint, elimination of most multidimensional arrays, elimination of diagonal_style, elimination of Z array in force calculation. 2019-06-12 16:42:28 -06:00
daa53e3008 Merge pull request #1469 from julient31/pppm_spin
Adding PPPM and Ewald solvers for electric dipoles and magnetic spins
2019-06-12 14:50:18 -04:00
7a33d1e328 Code cleanup 2019-06-12 11:36:42 -06:00
0559e155f2 Implemented lists instead of multidim arrays 2019-06-11 18:24:02 -06:00
0b25d2feef Merge pull request #1504 from akohlmey/collected-small-fixes
Collected small bugfixes and changes
2019-06-11 20:20:19 -04:00
1c1491eebf Merge pull request #1506 from sniblett402/patch-1
Bugfix for bond_style table with MPI
2019-06-11 19:10:41 -04:00
400751f30f remove redundant call to broadcast r0 2019-06-11 19:09:23 -04:00
4fbf96a353 Bugfix for bond_style table with MPI
Summary

Very small bug fix - an incorrect MPI datatype was causing undefined behaviour for tabulated bond potentials (bond_style table).

Author(s)

Sam Niblett, LBNL

Licensing

By submitting this pull request, I agree, that my contribution will be included in LAMMPS and redistributed under either the GNU General Public License version 2 (GPL v2) or the GNU Lesser General Public License version 2.1 (LGPL v2.1).

Backward Compatibility

No impact

Implementation Notes

The equilibrium bond length of the tabulated potential (tb->r0) was incorrectly specified as an MPI_INT during a broadcast. Therefore, all non-root processes received a truncated value of this parameter. This simple fix produced the expected behaviour for me.
2019-06-11 13:48:01 -07:00
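Editor's note: a minimal sketch (not the LAMMPS source) of the bug class fixed in this pull request — broadcasting a double while declaring it as MPI_INT transfers the wrong number of bytes, so non-root ranks receive a truncated value:

// Hypothetical minimal example of the mismatched-datatype bug and its fix.
#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
  MPI_Init(&argc, &argv);
  int rank;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  double r0 = (rank == 0) ? 1.54 : 0.0;   // equilibrium bond length, set on root only

  // Buggy: datatype does not match the buffer type, non-root ranks get garbage
  // MPI_Bcast(&r0, 1, MPI_INT, 0, MPI_COMM_WORLD);

  // Fixed: declare the buffer as what it actually is
  MPI_Bcast(&r0, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD);

  printf("rank %d: r0 = %g\n", rank, r0);
  MPI_Finalize();
  return 0;
}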
1dc8bb163d Merge pull request #1485 from Adrian-Diaz/memcpy2memmove
replace some calls to `memcpy()` with calls to `memmove()`
2019-06-11 14:07:46 -04:00
bb0240d8eb Merge pull request #1501 from uihsnv/vim_files
Simplify and update vim syntax highlighting files
2019-06-11 14:06:58 -04:00
fd592d510f Merge branch 'collected-small-fixes' of github.com:akohlmey/lammps into collected-small-fixes
# Conflicts:
#	cmake/CMakeLists.txt
2019-06-11 14:03:27 -04:00
dbafb92dd5 cmake minor cleanup and removal of redundant code and empty lines 2019-06-11 14:02:41 -04:00
83060f0902 Merge branch 'master' into collected-small-fixes 2019-06-11 13:35:42 -04:00
fd55d7d367 update list of known LAMMPS keywords 2019-06-11 11:15:04 -04:00
3d5db63381 minor cleanup 2019-06-11 10:36:04 -04:00
42b0cb5e3e Merge pull request #1502 from rbberger/cmake_cleanup
Refactor CMake configuration
2019-06-11 10:19:07 -04:00
4cee333c07 need to use C++ compiler when checking for includes
this is the same bugfix as in PR #1504
2019-06-11 07:12:39 -04:00
7f4c611e21 must use C++ compiler to check for include files 2019-06-11 06:51:03 -04:00
fe29572737 Merge pull request #1496 from akkamesh/enh-ext-reaxc
reaxc/qeq optimization - using kokkos hierarchical parallelism
2019-06-10 21:37:12 -04:00
9421466f57 Merge branch 'master' into enh-ext-reaxc
Resolved Merge Conflict in src/KOKKOS/kokkos.cpp
2019-06-10 20:14:40 -04:00
8d985e53f4 Merge pull request #1494 from mkanski/extep_hybrid
Fix for extep when NULL or only some elements from potential file are used.
2019-06-10 20:04:24 -04:00
e72ac92a7f Merge pull request #1422 from stanmoore1/team_opt
Optimize KOKKOS package for small systems
2019-06-10 20:03:53 -04:00
3e2f3a8058 avoid a case of mixing malloc()/free() with new/delete 2019-06-10 18:22:04 -04:00
72b295d7f4 add support for internal style variables to info command 2019-06-10 18:21:17 -04:00
fa76472135 Add Kamesh as contributing author to fix_qeq_reax_kokkos 2019-06-10 15:48:53 -06:00
b368b11d60 Move ShowHelp test into Testing.cmake 2019-06-09 22:31:56 -04:00
591e6836fd Move CMake configuration for USER-H5MD into its own file 2019-06-09 22:24:59 -04:00
4ac100fe69 Define additional LAMMPS_*_DIRS to remove relative paths later 2019-06-09 22:05:53 -04:00
4ecd81f81b Move CMake configuration of PYTHON package into its own file 2019-06-09 21:48:05 -04:00
203c6d122b Move KSPACE CMake configuration into its own file 2019-06-09 21:37:34 -04:00
e788d32678 Move autogen check into utility function 2019-06-09 04:02:08 -04:00
ae59ffe83e Move more CMake configurations to own files 2019-06-09 03:54:52 -04:00
99a8d3c2ed Create CMake files for several packages
This removes the CMake configuration of several larger packages and places
it into separate files in the Modules/Packages folder.
 - COMPRESS
 - KIM
 - LATTE
 - MESSAGE
 - MSCG
 - USER-MOLFILE
 - USER-NETCDF
 - USER-PLUMED
 - USER-QMMM
 - USER-QUIP
 - USER-SCAFACOS
 - USER-SMD
 - USER-VTK
2019-06-09 03:35:25 -04:00
ca1445788e Move CMake configuration of VORONOI package into its own file 2019-06-09 03:23:23 -04:00
0908bd7aaf Move code coverage and testing into their own files 2019-06-09 03:18:20 -04:00
239dfe163c Simplify vim-files
Resolves #1500
2019-06-08 17:11:20 +05:30
cea1bd9cd9 Merging to stay up-to-date after patch 5Jun2019 2019-06-08 16:47:02 +05:30
31dc5dbb51 Fix variable type 2019-06-07 17:00:43 -06:00
9e3dc26599 Fix name in pair_exp6_rx_kokkos 2019-06-07 16:41:41 -06:00
8da75c2040 Merge branch 'master' of github.com:lammps/lammps into enh-ext-reaxc 2019-06-07 16:33:44 -06:00
85999fc4a7 Restore original compute_h in fix_qeq_reax_kokkos 2019-06-07 16:31:31 -06:00
73fa8d4055 Rename Kokkos variables 2019-06-07 16:30:02 -06:00
bd237a05b8 Tweak scalar view allocation 2019-06-07 15:46:04 -06:00
4941f11ad4 Remove unused tag 2019-06-07 15:37:49 -06:00
afab5ef303 Move CMake part on coverage into its own file 2019-06-07 12:24:35 -04:00
a7c02e699e Move CMake utilities into LAMMPSUtils.cmake 2019-06-07 12:18:49 -04:00
8fed39d726 Move CMake documentation build section into its own file 2019-06-07 12:16:01 -04:00
b9e10d55e2 Merge pull request #1492 from akohlmey/collected-small-changes
Collected small changes for the next patch
2019-06-07 11:07:10 -04:00
1f1a0f95b2 Merge pull request #1497 from junghans/check_for_immintrin
cmake: check for immintrin.h
2019-06-07 10:57:56 -04:00
61e9dc4c8d more accurate checking for styles using utils::strmatch() instead of strcmp() or strncmp() 2019-06-07 07:14:57 -04:00
b53df3dd63 disable optimization on functions building factories for many entries
this will speed up compilation and also avoid spurious warnings with gcc 4.4 and later
2019-06-06 20:37:17 -04:00
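Editor's note: a generic sketch of the technique named in the commit above; the exact mechanism used in LAMMPS may differ. A function that only fills a large factory map gains nothing from optimization, but costs a lot of compile time and can trigger spurious variable-tracking warnings on gcc 4.4 and later.

#include <map>
#include <string>
#include <functional>

struct Pair { virtual ~Pair() = default; };
struct PairLJ : Pair {};
struct PairCoul : Pair {};

using Factory = std::map<std::string, std::function<Pair *()>>;

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("O0")))   // skip optimizing this one function (GCC >= 4.4)
#endif
Factory build_pair_factory()
{
  Factory f;
  f["lj/cut"]   = [] { return static_cast<Pair *>(new PairLJ); };
  f["coul/cut"] = [] { return static_cast<Pair *>(new PairCoul); };
  // ... hundreds more entries in the real code ...
  return f;
}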
56e3b1d1f4 remove dead code 2019-06-06 20:22:08 -04:00
fde7e2de3c switch to use alternate flag 2019-06-06 15:08:26 -04:00
e2391edce6 turn off only variable tracking and make people wait again 2019-06-06 14:44:28 -04:00
f7026491f1 Code reformat 2019-06-06 09:59:41 -06:00
40a2f275c2 Fix a few more issues resulting from "nelements != atom->types"
this also detects more safely whether there is data in the second part of the input file that is still formatted for the first part.
2019-06-05 17:11:53 -04:00
e549f911f7 turn off variable tracking through turning off optimization for GCC 4.4 and later
This will avoid a difficult to interpret warning and in
addition speed up compilation of this one file by avoiding
to try to optimize something, that needs no optimization.
2019-06-05 14:36:08 -04:00
34dca6dc79 advance warning message about collecting styles and packages info to an earlier slot in the process 2019-06-05 14:32:02 -04:00
2ebc40deb3 cmake: check for immintrin.h 2019-06-05 11:13:48 -06:00
e9b4ab7363 Fixed potential file parsing when NULL or only some elements are used. 2019-06-05 16:12:24 +02:00
8e43a45925 Detect and error out on invalid kspace mesh settings. make coulomb and dispersion settings consistent 2019-06-05 01:10:44 -04:00
92f078cfff nicer typesetting of "none, zero, hybrid" potential styles in commands lists 2019-06-04 22:29:25 -04:00
708052dc81 reaxc/qeq optimization - using kokkos hierarchical parallelism 2019-06-04 14:22:43 -07:00
960a975e2a Added compact arrays, removed unused openmp stuff 2019-06-03 19:50:40 -06:00
3b7c15a8b2 Merging to stay up-to-date
Merge remote-tracking branch 'lammps_upstream/master' into log_spacing
2019-06-01 10:33:27 +05:30
76900b8325 avoids the possibility of undefined behavior from memcpy() being called with the same source and destination arrays; this does not cover every instance of this issue in the source code 2019-05-31 11:53:58 -06:00
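Editor's note: a minimal sketch of the issue behind this commit — memcpy() has undefined behavior when source and destination overlap (including being identical), while memmove() is defined for overlap. The helper compact() below is hypothetical, not LAMMPS code.

#include <cstring>

void compact(double *data, int first, int n)
{
  // Undefined behavior when first == 0 (src == dst) or when the ranges overlap:
  // memcpy(data, data + first, n * sizeof(double));

  // Safe for any overlap, with essentially the same cost otherwise:
  memmove(data, data + first, n * sizeof(double));
}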
b88158fc3b Fix issue in npair_kokkos 2019-05-29 11:16:38 -06:00
3b60686827 Small tweak to verlet_kokkos 2019-05-29 09:43:50 -06:00
35be1724e3 Commit JT 052819
- corrected examples in examples/SPIN/dipole_spin
- modified warning message in src/SPIN/pair_spin_dipole_*.cpp
2019-05-28 15:31:03 -06:00
d717101e94 Merge branch 'master' of https://github.com/lammps/lammps into team_opt 2019-05-28 14:02:22 -06:00
e44c877738 Add missing tag sync in fix_qeq_reax_kokkos 2019-05-28 10:21:29 -06:00
c2a200fe85 A function logfreq3(), for logarithmic spacing
Unlike logfreq(), this also allows for fractional ratios
Resolves #1471
2019-05-26 14:00:21 +05:30
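Editor's note: a hedged sketch of what logarithmic spacing with a fractional ratio means; logspace_steps is a hypothetical helper, not the LAMMPS implementation of logfreq3(). With npoints values (npoints >= 2) from lo to hi, the successive ratio r = (hi/lo)^(1/(npoints-1)) need not be an integer, unlike a fixed integer factor.

#include <cmath>
#include <cstdio>
#include <vector>

std::vector<long> logspace_steps(double lo, double hi, int npoints)
{
  std::vector<long> steps;
  const double ratio = std::pow(hi / lo, 1.0 / (npoints - 1));
  double v = lo;
  for (int i = 0; i < npoints; ++i) {
    steps.push_back(std::lround(v));   // round to usable timesteps
    v *= ratio;                        // fractional ratios allowed
  }
  return steps;
}

int main()
{
  for (long s : logspace_steps(10, 1000, 7)) printf("%ld\n", s);
  // 10 22 46 100 215 464 1000  (approximately)
}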
eea67bf3bf Add sync/modify for growing dvector 2019-05-22 08:52:57 -06:00
439e7da03f Need auto-sync on for initialization 2019-05-21 11:47:55 -06:00
ed7c09ac81 Add missing character 2019-05-21 10:30:33 -06:00
0ee1daa46d Add Lenz to false-positive list 2019-05-21 10:24:24 -06:00
2fbc4f504d Per-atom virial is not yet supported with pppm/dipole 2019-05-21 10:16:13 -06:00
95ab056576 Add PPPM dipole reference 2019-05-21 10:07:41 -06:00
a0bc619550 Need to call atomKK version of sync/modified in Kokkos atom_vec styles 2019-05-21 09:21:55 -06:00
82b50706bd Commit2 JT 052019
- some corrections in the examples
- deleted an old doc files (now redundant)
2019-05-20 22:09:59 -06:00
e90eed9120 Commit JT 052019
- deleted old doc
- renamed new doc files
2019-05-20 21:48:05 -06:00
5f2f7d1575 Merge branch 'pppm_spin' of github.com:julient31/lammps into pppm_spin 2019-05-20 21:35:52 -06:00
3e60ec9be1 Merge branch 'clean-master2' of github.com:julient31/lammps into pppm_spin 2019-05-20 21:35:16 -06:00
dc8b43a95f update docs and sources for changes in this branch 2019-05-18 15:16:10 -04:00
601746b565 restore lost changes to fix gpu from upstream 2019-05-18 13:09:25 -04:00
fbb78e7b78 Commit JT 051719
- removed qsymp pair style
- cleaned doc (pair/spin/dipole and kspace_style)
- cleaned kspace .cpp/h files
2019-05-17 15:04:14 -06:00
803e0631c5 Added bispectrum compute 2019-05-16 22:11:06 -06:00
51a6bfd579 Added bispectrum compute 2019-05-16 22:01:45 -06:00
5b71b3fc57 Added bispectrum compute 2019-05-16 21:51:24 -06:00
e2ed9330b7 Merge branch 'master' into team_opt 2019-05-16 12:31:26 -06:00
41872e37e6 Merge branch 'pppm_spin' of github.com:julient31/lammps into pppm_spin
Conflicts:
	doc/src/pair_spin_dipole.txt
2019-05-16 10:34:33 -06:00
0c0b106924 Commit2 JT 051519
- started doc pair_spin_dipole.txt
- renamed all pair/spin/dipole
- created and tested example pair/spin/dipole/cut
2019-05-15 20:49:05 -06:00
98d9c45ad9 compute_bispectrum 2019-05-15 17:18:24 -06:00
084bb3c35b Commit JT 051519
- start doc pair_spin_dipole
2019-05-15 08:34:12 -06:00
31789ad03b Commit JT 051419
- added beginning doc
- removed a remaining dipolar
2019-05-14 17:44:35 -06:00
fd168068a1 Merge branch 'clean-master2' of github.com:julient31/lammps into pppm_spin
Conflicts:
	src/SPIN/fix_nve_spin.h
2019-05-14 17:41:58 -06:00
a0cc6b5b59 Forgot to change pair style to nn/snap 2019-05-11 14:04:21 -06:00
f2d881470d Added placeholder for neural network SNAP potential 2019-05-11 12:55:11 -06:00
e13c661f77 Added placeholder for neural network SNAP potential 2019-05-11 12:54:18 -06:00
a1f421cd54 Moved compute_beta outside of main force loop 2019-05-11 12:41:54 -06:00
6d84bd6138 Added compute_beta() 2019-05-10 10:34:01 -06:00
cb6b498127 Commit JT 042219
- change ntot -> nlocal
2019-04-22 14:43:01 -06:00
a243be2dc9 Added bare-bones yarray algorithm, 2x speedup 2019-04-21 22:10:03 -06:00
a696b26301 Merge branch 'clean-master2' of github.com:julient31/lammps into pppm_spin
Conflicts:
	src/SPIN/pair_spin_exchange.cpp
	src/atom.cpp
	src/pair.cpp
2019-04-19 15:02:25 -06:00
4a4297591e Did some more cleanups 2019-04-17 12:04:31 -05:00
1f43efc111 Cleaned up the changes in Device and the base class of the pair styles 2019-04-17 00:09:49 -05:00
c55009a0ac Enabled neighbor list build on the device with pair_style hybrid and hybrid/overlay 2019-04-16 23:30:25 -05:00
e4e2249b63 Fix issue in comm_kokkos 2019-04-10 10:17:14 -06:00
cf35ebe5fa Revert optimization that is causing regression tests to fail 2019-04-09 16:17:37 -06:00
073f003470 Doc tweak 2019-04-09 15:17:40 -06:00
618547b72e Reduce DELTA and revert subview change 2019-04-09 14:40:39 -06:00
a01bce46bb Reduce GPU/CPU data transfer 2019-04-09 10:23:37 -06:00
82be3ee32c Only use team with full neigh list 2019-04-09 09:17:07 -06:00
16b17f812c Update docs 2019-04-09 08:51:24 -06:00
22b7e12baf Merge branch 'kk_binsize' into team_opt 2019-04-08 16:40:20 -06:00
b8d3c9e01b Rename team option to neigh/thread 2019-04-08 16:02:18 -06:00
149a57f3ae Add threshold for using Kokkos teams 2019-04-08 15:21:42 -06:00
7f2c81657b Merge from upstream 2019-04-08 14:14:52 -06:00
f2ef02b6d9 Comm exchange is a no-op for 1 MPI rank 2019-03-21 09:27:18 -06:00
c3adfcbc15 Add missing sync in comm_kokkos 2019-03-21 08:56:12 -06:00
744a8215dd Fix compile error in comm_kokkos and indent in atom_vec_kokkos 2019-03-20 15:08:08 -06:00
1f44dc2498 Remove unused array in comm_kokkos 2019-03-20 15:01:47 -06:00
d1e751d717 Fix thread safety issue in fused forward comm 2019-03-20 14:32:03 -06:00
08273c40d7 Fix compile issue in comm_kokkos 2019-03-19 14:29:45 -06:00
e2d28f5160 Only copy pbc info in comm setup 2019-03-18 15:27:35 -06:00
b50ef59a19 Optimize Kokkos comm for small systems 2019-03-18 13:17:32 -06:00
36836598b1 Reduce data transfer in exchange 2019-03-18 10:45:14 -06:00
8c4baac3f1 Only copy force on ghost atoms if newton on 2019-03-15 14:25:24 -06:00
179026dd44 Reduce GPU data movement in npair_kokkos 2019-03-14 17:13:12 -06:00
71a6227240 Optimize KOKKOS package for small system sizes 2019-03-14 15:43:50 -06:00
aecef752e8 Remove unnecessary data movement in fix_nve_kokkos 2019-03-11 13:41:20 -06:00
e422e886de Add error check for team on and full neighborlist 2019-03-08 11:33:29 -07:00
069853fd10 Merge branch 'master' into team 2019-03-08 11:32:31 -07:00
84773f9d1d Merge branch 'master' of github.com:stanmoore1/lammps into team 2019-03-08 08:31:48 -07:00
ff7276e494 Clean up the fused comm 2019-03-07 08:56:13 -07:00
553714b0bb Merge branch 'master' into team 2019-03-07 08:42:24 -07:00
5d8e3c6cb4 Optimize reneighbor for small systems 2019-02-28 10:14:35 -07:00
29073f7d0e Merge branch 'team' into comm_squash 2019-02-25 09:20:13 -07:00
38c6c1907b Merge branch 'master' into team 2019-02-25 09:19:54 -07:00
85a14ebcb8 Fix issue with comm squash 2019-02-25 09:17:34 -07:00
0a02097e20 Add squashed comm forward for Kokkos 2019-02-25 08:39:54 -07:00
a72f3cfb58 Merge branch 'master' into team 2019-02-11 14:45:34 -07:00
58905525bf Add team-based calcs to some KOKKOS package pair_styles 2019-02-06 14:42:37 -07:00
7a2d326103 Commit JT 010819
- commit before co
2019-01-08 09:19:49 -07:00
ddd5e61254 Commit JT 111418
- removed muk table (size kmax3d, mem fault)
2018-11-14 09:46:16 -07:00
d66a1ac054 Commit JT 111318
- corrections pair/spin/dipolar/long
2018-11-13 17:03:32 -07:00
9727fdc473 Commit JT 110818
- correct bug (match ewald/disp results for vir)
- started correct mag. part
2018-11-08 16:17:43 -07:00
d5fe8857cc Commit JT 100518
- correction torque ewald_dipole
- idem ewald_dipole_spin to check
2018-10-05 14:01:29 -06:00
a745a0aed0 Commit JT 100318
- correction forces ewald_dipole
- correction mag. dipolar energy
2018-10-03 10:23:58 -06:00
19aaf294e5 Commit JT 092718
- renamed pair/spin/long functions
- started to work on debugging ewald_dipole (force errors)
2018-09-27 10:46:52 -06:00
6b4303c405 Commit2 JT 092418
- initialized g_ewald before Newton solver
- mu2 is now adim in ewald_dipole_spin
2018-09-24 16:40:59 -06:00
53a779067e Commit JT 092418
- ewald_dipole_spin added
- accuracy problem (with eval of gewald and Newton solver)
2018-09-24 10:59:17 -06:00
cce9fe4a34 Commit2 JT 092118
- created pair_spin_dipolar_cut
- real-space short-range calc of the spin dipolar interaction
- run and check valgrind ok
2018-09-21 09:55:41 -06:00
407392f6bf Commit JT 092118
- ewald_dipole with virial, torque and slabcorr
- run and valgrind test ok

Merge branch 'pppm_spin' of github.com:julient31/lammps into pppm_spin

Conflicts:
	src/KSPACE/ewald_dipole.cpp
2018-09-21 08:33:02 -06:00
cdde878d60 Commit JT 091618
- merge with modifs Stan 1
- energy correction

Merge branch 'pppm_spin' of github.com:julient31/lammps into pppm_spin

Conflicts:
	src/KSPACE/ewald_dipole.cpp
2018-09-16 09:42:27 -06:00
b9e33e631f Fix bug in ewald_dipole forces 2018-09-15 13:34:24 -06:00
82a5346ab1 Commit JT 091418
- created pair_spin_long_qsymp
- modified ewald_dipole
2018-09-14 15:09:59 -06:00
a76457ef22 Fix bug in ewald_dipole structure factor 2018-09-14 13:05:48 -06:00
e6b5112ddc Fix issues in ewald_dipole 2018-09-13 14:36:54 -06:00
16911adcea Commit1 JT 083018
- started to work on ewald_dipole (not yet triclinic)
- compiles and runs (no memory issue)
- check the energy accuracy
2018-08-30 07:33:25 -06:00
cf1d421e10 Commit JT 082318
- corrected memory errors in pppm_dipole and pppm_dipole_spin
- created fm_long in atom_vec_spin
- fm_long added to fm in initial_integrate (in ComputeInteractionsSpin)
2018-08-23 15:18:30 -06:00
8d79db03d3 Commit1 JT 082118
- created pppm_dipole_spin.h/cpp (child-class of pppm_dipole)
- improved pair_spin_long.h/cpp
- created documentation for pair_spin_long
- new 3xN fm_long vector in atom_vec_spin (with associated comm)
2018-08-21 13:47:38 -06:00
5e287033f7 Commit1 JT 081618
- converted pppm_spin for long range spin-spin interactions
- modified kspace, pair,and pair_hybrid to add spinflag
2018-08-16 10:13:18 -06:00
e1ab38439b Commit2 JT 081418
- converted pppm_dipole toward spin quantities
- need to check if can handle ferrimagnets
2018-08-14 17:09:44 -06:00
062c1a04fc Commit JT 081418
- initial commit pppm_spin branch
- copied short_range spin files (src/SPIN)
- copied/renamed Stan's file (from pppm_dipole branch)
2018-08-14 14:42:01 -06:00
197 changed files with 12220 additions and 4739 deletions

File diff suppressed because it is too large.

@@ -0,0 +1,28 @@
###############################################################################
# Coverage
#
# Requires latest gcovr (for GCC 8.1 support):
# pip install git+https://github.com/gcovr/gcovr.git
###############################################################################
if(ENABLE_COVERAGE)
find_program(GCOVR_BINARY gcovr)
find_package_handle_standard_args(GCOVR DEFAULT_MSG GCOVR_BINARY)
if(GCOVR_FOUND)
get_filename_component(ABSOLUTE_LAMMPS_SOURCE_DIR ${LAMMPS_SOURCE_DIR} ABSOLUTE)
add_custom_target(
gen_coverage_xml
COMMAND ${GCOVR_BINARY} -s -x -r ${ABSOLUTE_LAMMPS_SOURCE_DIR} --object-directory=${CMAKE_BINARY_DIR} -o coverage.xml
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Generating XML Coverage Report..."
)
add_custom_target(
gen_coverage_html
COMMAND ${GCOVR_BINARY} -s --html --html-details -r ${ABSOLUTE_LAMMPS_SOURCE_DIR} --object-directory=${CMAKE_BINARY_DIR} -o coverage.html
WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
COMMENT "Generating HTML Coverage Report..."
)
endif()
endif()

@@ -0,0 +1,59 @@
###############################################################################
# Build documentation
###############################################################################
option(BUILD_DOC "Build LAMMPS documentation" OFF)
if(BUILD_DOC)
include(ProcessorCount)
ProcessorCount(NPROCS)
find_package(PythonInterp 3 REQUIRED)
set(VIRTUALENV ${PYTHON_EXECUTABLE} -m virtualenv)
file(GLOB DOC_SOURCES ${LAMMPS_DOC_DIR}/src/[^.]*.txt)
file(GLOB PDF_EXTRA_SOURCES ${LAMMPS_DOC_DIR}/src/lammps_commands*.txt ${LAMMPS_DOC_DIR}/src/lammps_support.txt ${LAMMPS_DOC_DIR}/src/lammps_tutorials.txt)
list(REMOVE_ITEM DOC_SOURCES ${PDF_EXTRA_SOURCES})
add_custom_command(
OUTPUT docenv
COMMAND ${VIRTUALENV} docenv
)
set(DOCENV_BINARY_DIR ${CMAKE_BINARY_DIR}/docenv/bin)
add_custom_command(
OUTPUT requirements.txt
DEPENDS docenv
COMMAND ${CMAKE_COMMAND} -E copy ${LAMMPS_DOC_DIR}/utils/requirements.txt requirements.txt
COMMAND ${DOCENV_BINARY_DIR}/pip install -r requirements.txt --upgrade
COMMAND ${DOCENV_BINARY_DIR}/pip install --upgrade ${LAMMPS_DOC_DIR}/utils/converters
)
set(RST_FILES "")
set(RST_DIR ${CMAKE_BINARY_DIR}/rst)
file(MAKE_DIRECTORY ${RST_DIR})
foreach(TXT_FILE ${DOC_SOURCES})
get_filename_component(FILENAME ${TXT_FILE} NAME_WE)
set(RST_FILE ${RST_DIR}/${FILENAME}.rst)
list(APPEND RST_FILES ${RST_FILE})
add_custom_command(
OUTPUT ${RST_FILE}
DEPENDS requirements.txt docenv ${TXT_FILE}
COMMAND ${DOCENV_BINARY_DIR}/txt2rst -o ${RST_DIR} ${TXT_FILE}
)
endforeach()
add_custom_command(
OUTPUT html
DEPENDS ${RST_FILES}
COMMAND ${CMAKE_COMMAND} -E copy_directory ${LAMMPS_DOC_DIR}/src ${RST_DIR}
COMMAND ${DOCENV_BINARY_DIR}/sphinx-build -j ${NPROCS} -b html -c ${LAMMPS_DOC_DIR}/utils/sphinx-config -d ${CMAKE_BINARY_DIR}/doctrees ${RST_DIR} html
)
add_custom_target(
doc ALL
DEPENDS html
SOURCES ${LAMMPS_DOC_DIR}/utils/requirements.txt ${DOC_SOURCES}
)
install(DIRECTORY ${CMAKE_BINARY_DIR}/html DESTINATION ${CMAKE_INSTALL_DOCDIR})
endif()

@@ -0,0 +1,71 @@
# Utility functions
function(list_to_bulletpoints result)
list(REMOVE_AT ARGV 0)
set(temp "")
foreach(item ${ARGV})
set(temp "${temp}* ${item}\n")
endforeach()
set(${result} "${temp}" PARENT_SCOPE)
endfunction(list_to_bulletpoints)
function(validate_option name values)
string(TOLOWER ${${name}} needle_lower)
string(TOUPPER ${${name}} needle_upper)
list(FIND ${values} ${needle_lower} IDX_LOWER)
list(FIND ${values} ${needle_upper} IDX_UPPER)
if(${IDX_LOWER} LESS 0 AND ${IDX_UPPER} LESS 0)
list_to_bulletpoints(POSSIBLE_VALUE_LIST ${${values}})
message(FATAL_ERROR "\n########################################################################\n"
"Invalid value '${${name}}' for option ${name}\n"
"\n"
"Possible values are:\n"
"${POSSIBLE_VALUE_LIST}"
"########################################################################")
endif()
endfunction(validate_option)
function(get_lammps_version version_header variable)
file(READ ${version_header} line)
set(MONTHS x Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec)
string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\1" day "${line}")
string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\2" month "${line}")
string(REGEX REPLACE "#define LAMMPS_VERSION \"([0-9]+) ([A-Za-z]+) ([0-9]+)\"" "\\3" year "${line}")
string(STRIP ${day} day)
string(STRIP ${month} month)
string(STRIP ${year} year)
list(FIND MONTHS "${month}" month)
string(LENGTH ${day} day_length)
string(LENGTH ${month} month_length)
if(day_length EQUAL 1)
set(day "0${day}")
endif()
if(month_length EQUAL 1)
set(month "0${month}")
endif()
set(${variable} "${year}${month}${day}" PARENT_SCOPE)
endfunction()
function(check_for_autogen_files source_dir)
message(STATUS "Running check for auto-generated files from make-based build system")
file(GLOB SRC_AUTOGEN_FILES ${source_dir}/style_*.h)
file(GLOB SRC_AUTOGEN_PACKAGES ${source_dir}/packages_*.h)
list(APPEND SRC_AUTOGEN_FILES ${SRC_AUTOGEN_PACKAGES} ${source_dir}/lmpinstalledpkgs.h ${source_dir}/lmpgitversion.h)
foreach(_SRC ${SRC_AUTOGEN_FILES})
get_filename_component(FILENAME "${_SRC}" NAME)
if(EXISTS ${source_dir}/${FILENAME})
message(FATAL_ERROR "\n########################################################################\n"
"Found header file(s) generated by the make-based build system\n"
"\n"
"Please run\n"
"make -C ${source_dir} purge\n"
"to remove\n"
"########################################################################")
endif()
endforeach()
endfunction()
macro(pkg_depends PKG1 PKG2)
if(PKG_${PKG1} AND NOT (PKG_${PKG2} OR BUILD_${PKG2}))
message(FATAL_ERROR "${PKG1} package needs LAMMPS to be build with ${PKG2}")
endif()
endmacro()

@@ -0,0 +1,5 @@
if(PKG_COMPRESS)
find_package(ZLIB REQUIRED)
include_directories(${ZLIB_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${ZLIB_LIBRARIES})
endif()

@@ -0,0 +1,13 @@
if(PKG_CORESHELL)
set(CORESHELL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/CORESHELL)
set(CORESHELL_SOURCES)
set_property(GLOBAL PROPERTY "CORESHELL_SOURCES" "${CORESHELL_SOURCES}")
# detects styles which have a CORESHELL version
RegisterStylesExt(${CORESHELL_SOURCES_DIR} cs CORESHELL_SOURCES)
get_property(CORESHELL_SOURCES GLOBAL PROPERTY CORESHELL_SOURCES)
list(APPEND LIB_SOURCES ${CORESHELL_SOURCES})
include_directories(${CORESHELL_SOURCES_DIR})
endif()

@@ -0,0 +1,194 @@
if(PKG_GPU)
if (CMAKE_VERSION VERSION_LESS "3.1")
message(FATAL_ERROR "For the GPU package you need at least cmake-3.1")
endif()
set(GPU_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/GPU)
set(GPU_SOURCES ${GPU_SOURCES_DIR}/gpu_extra.h
${GPU_SOURCES_DIR}/fix_gpu.h
${GPU_SOURCES_DIR}/fix_gpu.cpp)
set(GPU_API "opencl" CACHE STRING "API used by GPU package")
set(GPU_API_VALUES opencl cuda)
set_property(CACHE GPU_API PROPERTY STRINGS ${GPU_API_VALUES})
validate_option(GPU_API GPU_API_VALUES)
string(TOUPPER ${GPU_API} GPU_API)
set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision")
set(GPU_PREC_VALUES double mixed single)
set_property(CACHE GPU_PREC PROPERTY STRINGS ${GPU_PREC_VALUES})
validate_option(GPU_PREC GPU_PREC_VALUES)
string(TOUPPER ${GPU_PREC} GPU_PREC)
if(GPU_PREC STREQUAL "DOUBLE")
set(GPU_PREC_SETTING "DOUBLE_DOUBLE")
elseif(GPU_PREC STREQUAL "MIXED")
set(GPU_PREC_SETTING "SINGLE_DOUBLE")
elseif(GPU_PREC STREQUAL "SINGLE")
set(GPU_PREC_SETTING "SINGLE_SINGLE")
endif()
file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cpp)
file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(GPU_API STREQUAL "CUDA")
find_package(CUDA REQUIRED)
find_program(BIN2C bin2c)
if(NOT BIN2C)
message(FATAL_ERROR "Could not find bin2c, use -DBIN2C=/path/to/bin2c to help cmake finding it.")
endif()
option(CUDPP_OPT "Enable CUDPP_OPT" ON)
option(CUDA_MPS_SUPPORT "Enable tweaks to support CUDA Multi-process service (MPS)" OFF)
if(CUDA_MPS_SUPPORT)
set(GPU_CUDA_MPS_FLAGS "-DCUDA_PROXY")
endif()
set(GPU_ARCH "sm_30" CACHE STRING "LAMMPS GPU CUDA SM primary architecture (e.g. sm_60)")
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu ${LAMMPS_LIB_BINARY_DIR}/gpu)
if(CUDPP_OPT)
cuda_include_directories(${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
file(GLOB GPU_LIB_CUDPP_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cpp)
file(GLOB GPU_LIB_CUDPP_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini/[^.]*.cu)
endif()
# build arch/gencode commands for nvcc based on CUDA toolkit version and use choice
# --arch translates directly instead of JIT, so this should be for the preferred or most common architecture
set(GPU_CUDA_GENCODE "-arch=${GPU_ARCH} ")
# Fermi (GPU Arch 2.x) is supported by CUDA 3.2 to CUDA 8.0
if((CUDA_VERSION VERSION_GREATER "3.1") AND (CUDA_VERSION VERSION_LESS "9.0"))
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_20,code=[sm_20,compute_20] ")
endif()
# Kepler (GPU Arch 3.x) is supported by CUDA 5 and later
if(CUDA_VERSION VERSION_GREATER "4.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_30,code=[sm_30,compute_30] -gencode arch=compute_35,code=[sm_35,compute_35] ")
endif()
# Maxwell (GPU Arch 5.x) is supported by CUDA 6 and later
if(CUDA_VERSION VERSION_GREATER "5.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_50,code=[sm_50,compute_50] -gencode arch=compute_52,code=[sm_52,compute_52] ")
endif()
# Pascal (GPU Arch 6.x) is supported by CUDA 8 and later
if(CUDA_VERSION VERSION_GREATER "7.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_60,code=[sm_60,compute_60] -gencode arch=compute_61,code=[sm_61,compute_61] ")
endif()
# Volta (GPU Arch 7.0) is supported by CUDA 9 and later
if(CUDA_VERSION VERSION_GREATER "8.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_70,code=[sm_70,compute_70] ")
endif()
# Turing (GPU Arch 7.5) is supported by CUDA 10 and later
if(CUDA_VERSION VERSION_GREATER "9.9")
string(APPEND GPU_CUDA_GENCODE "-gencode arch=compute_75,code=[sm_75,compute_75] ")
endif()
cuda_compile_fatbin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DNV_KERNEL -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS ${CUDA_REQUEST_PIC}
-DUNIX -O3 --use_fast_math -Wno-deprecated-gpu-targets -DUCL_CUDADR ${GPU_CUDA_GENCODE} -D_${GPU_PREC_SETTING})
foreach(CU_OBJ ${GPU_GEN_OBJS})
get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
string(REGEX REPLACE "^.*_lal_" "" CU_NAME "${CU_NAME}")
add_custom_command(OUTPUT ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
COMMAND ${BIN2C} -c -n ${CU_NAME} ${CU_OBJ} > ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h
DEPENDS ${CU_OBJ}
COMMENT "Generating ${CU_NAME}_cubin.h")
list(APPEND GPU_LIB_SOURCES ${LAMMPS_LIB_BINARY_DIR}/gpu/${CU_NAME}_cubin.h)
endforeach()
set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES "${LAMMPS_LIB_BINARY_DIR}/gpu/*_cubin.h")
add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
target_link_libraries(gpu ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT ${GPU_CUDA_MPS_FLAGS})
if(CUDPP_OPT)
target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
endif()
list(APPEND LAMMPS_LINK_LIBS gpu)
add_executable(nvc_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(nvc_get_devices PRIVATE -DUCL_CUDADR)
target_link_libraries(nvc_get_devices PRIVATE ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})
elseif(GPU_API STREQUAL "OPENCL")
find_package(OpenCL REQUIRED)
set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
set(OCL_TUNE_VALUES intel fermi kepler cypress generic)
set_property(CACHE OCL_TUNE PROPERTY STRINGS ${OCL_TUNE_VALUES})
validate_option(OCL_TUNE OCL_TUNE_VALUES)
string(TOUPPER ${OCL_TUNE} OCL_TUNE)
include(OpenCLUtils)
set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/[^.]*.cu)
list(REMOVE_ITEM GPU_LIB_CU
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu
${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu
)
foreach(GPU_KERNEL ${GPU_LIB_CU})
get_filename_component(basename ${GPU_KERNEL} NAME_WE)
string(SUBSTRING ${basename} 4 -1 KERNEL_NAME)
GenerateOpenCLHeader(${KERNEL_NAME} ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h ${OCL_COMMON_HEADERS} ${GPU_KERNEL})
list(APPEND GPU_LIB_SOURCES ${CMAKE_CURRENT_BINARY_DIR}/gpu/${KERNEL_NAME}_cl.h)
endforeach()
GenerateOpenCLHeader(gayberne ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne.cu)
GenerateOpenCLHeader(gayberne_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_gayberne_lj.cu)
GenerateOpenCLHeader(re_squared ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared.cu)
GenerateOpenCLHeader(re_squared_lj ${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_ellipsoid_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_re_squared_lj.cu)
GenerateOpenCLHeader(tersoff ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff.cu)
GenerateOpenCLHeader(tersoff_zbl ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_zbl.cu)
GenerateOpenCLHeader(tersoff_mod ${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h ${OCL_COMMON_HEADERS} ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod_extra.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_tersoff_mod.cu)
list(APPEND GPU_LIB_SOURCES
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/gayberne_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/re_squared_lj_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_zbl_cl.h
${CMAKE_CURRENT_BINARY_DIR}/gpu/tersoff_mod_cl.h
)
add_library(gpu STATIC ${GPU_LIB_SOURCES})
target_link_libraries(gpu ${OpenCL_LIBRARIES})
target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu ${OpenCL_INCLUDE_DIRS})
target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)
list(APPEND LAMMPS_LINK_LIBS gpu)
add_executable(ocl_get_devices ${LAMMPS_LIB_SOURCE_DIR}/gpu/geryon/ucl_get_devices.cpp)
target_compile_definitions(ocl_get_devices PRIVATE -DUCL_OPENCL)
target_link_libraries(ocl_get_devices PRIVATE ${OpenCL_LIBRARIES})
target_include_directories(ocl_get_devices PRIVATE ${OpenCL_INCLUDE_DIRS})
endif()
# GPU package
FindStyleHeaders(${GPU_SOURCES_DIR} FIX_CLASS fix_ FIX)
set_property(GLOBAL PROPERTY "GPU_SOURCES" "${GPU_SOURCES}")
# detects styles which have GPU version
RegisterStylesExt(${GPU_SOURCES_DIR} gpu GPU_SOURCES)
get_property(GPU_SOURCES GLOBAL PROPERTY GPU_SOURCES)
list(APPEND LIB_SOURCES ${GPU_SOURCES})
include_directories(${GPU_SOURCES_DIR})
endif()

@@ -0,0 +1,42 @@
if(PKG_KIM)
find_package(CURL)
if(CURL_FOUND)
include_directories(${CURL_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${CURL_LIBRARIES})
add_definitions(-DLMP_KIM_CURL)
endif()
find_package(KIM-API QUIET)
if(KIM-API_FOUND)
set(DOWNLOAD_KIM_DEFAULT OFF)
else()
set(DOWNLOAD_KIM_DEFAULT ON)
endif()
option(DOWNLOAD_KIM "Download KIM-API from OpenKIM instead of using an already installed one" ${DOWNLOAD_KIM_DEFAULT})
if(DOWNLOAD_KIM)
if(CMAKE_GENERATOR STREQUAL "Ninja")
message(FATAL_ERROR "Cannot build downloaded KIM-API library with Ninja build tool")
endif()
message(STATUS "KIM-API download requested - we will build our own")
enable_language(C)
enable_language(Fortran)
include(ExternalProject)
ExternalProject_Add(kim_build
URL https://s3.openkim.org/kim-api/kim-api-2.0.2.txz
URL_MD5 537d9c0abd30f85b875ebb584f9143fa
BINARY_DIR build
CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER}
-DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
)
ExternalProject_get_property(kim_build INSTALL_DIR)
set(KIM-API_INCLUDE_DIRS ${INSTALL_DIR}/include/kim-api)
set(KIM-API_LDFLAGS ${INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/libkim-api${CMAKE_SHARED_LIBRARY_SUFFIX})
list(APPEND LAMMPS_DEPS kim_build)
else()
find_package(KIM-API REQUIRED)
endif()
list(APPEND LAMMPS_LINK_LIBS "${KIM-API_LDFLAGS}")
include_directories(${KIM-API_INCLUDE_DIRS})
endif()

@@ -0,0 +1,53 @@
if(PKG_KOKKOS)
set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos)
set(LAMMPS_LIB_KOKKOS_BIN_DIR ${LAMMPS_LIB_BINARY_DIR}/kokkos)
add_definitions(-DLMP_KOKKOS)
add_subdirectory(${LAMMPS_LIB_KOKKOS_SRC_DIR} ${LAMMPS_LIB_KOKKOS_BIN_DIR})
set(Kokkos_INCLUDE_DIRS ${LAMMPS_LIB_KOKKOS_SRC_DIR}/core/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/containers/src
${LAMMPS_LIB_KOKKOS_SRC_DIR}/algorithms/src
${LAMMPS_LIB_KOKKOS_BIN_DIR})
include_directories(${Kokkos_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS kokkos)
set(KOKKOS_PKG_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/KOKKOS)
set(KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/atom_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/atom_vec_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/comm_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/comm_tiled_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neighbor_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neigh_list_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/neigh_bond_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/fix_nh_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/nbin_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/npair_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/domain_kokkos.cpp
${KOKKOS_PKG_SOURCES_DIR}/modify_kokkos.cpp)
if(PKG_KSPACE)
list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/gridcomm_kokkos.cpp)
endif()
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
# detects styles which have KOKKOS version
RegisterStylesExt(${KOKKOS_PKG_SOURCES_DIR} kokkos KOKKOS_PKG_SOURCES)
# register kokkos-only styles
RegisterNBinStyle(${KOKKOS_PKG_SOURCES_DIR}/nbin_kokkos.h)
RegisterNPairStyle(${KOKKOS_PKG_SOURCES_DIR}/npair_kokkos.h)
if(PKG_USER-DPD)
get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES)
list(APPEND KOKKOS_PKG_SOURCES ${KOKKOS_PKG_SOURCES_DIR}/npair_ssa_kokkos.cpp)
RegisterNPairStyle(${KOKKOS_PKG_SOURCES_DIR}/npair_ssa_kokkos.h)
set_property(GLOBAL PROPERTY "KOKKOS_PKG_SOURCES" "${KOKKOS_PKG_SOURCES}")
endif()
get_property(KOKKOS_PKG_SOURCES GLOBAL PROPERTY KOKKOS_PKG_SOURCES)
list(APPEND LIB_SOURCES ${KOKKOS_PKG_SOURCES})
include_directories(${KOKKOS_PKG_SOURCES_DIR})
endif()

@@ -0,0 +1,38 @@
if(PKG_KSPACE)
option(FFT_SINGLE "Use single precision FFT instead of double" OFF)
set(FFTW "FFTW3")
if(FFT_SINGLE)
set(FFTW "FFTW3F")
add_definitions(-DFFT_SINGLE)
endif()
find_package(${FFTW} QUIET)
if(${FFTW}_FOUND)
set(FFT "${FFTW}" CACHE STRING "FFT library for KSPACE package")
else()
set(FFT "KISS" CACHE STRING "FFT library for KSPACE package")
endif()
set(FFT_VALUES KISS ${FFTW} MKL)
set_property(CACHE FFT PROPERTY STRINGS ${FFT_VALUES})
validate_option(FFT FFT_VALUES)
string(TOUPPER ${FFT} FFT)
if(NOT FFT STREQUAL "KISS")
find_package(${FFT} REQUIRED)
if(NOT FFT STREQUAL "FFTW3F")
add_definitions(-DFFT_FFTW)
else()
add_definitions(-DFFT_${FFT})
endif()
include_directories(${${FFT}_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${${FFT}_LIBRARIES})
else()
add_definitions(-DFFT_KISS)
endif()
set(FFT_PACK "array" CACHE STRING "Optimization for FFT")
set(FFT_PACK_VALUES array pointer memcpy)
set_property(CACHE FFT_PACK PROPERTY STRINGS ${FFT_PACK_VALUES})
validate_option(FFT_PACK FFT_PACK_VALUES)
if(NOT FFT_PACK STREQUAL "array")
string(TOUPPER ${FFT_PACK} FFT_PACK)
add_definitions(-DFFT_PACK_${FFT_PACK})
endif()
endif()

@@ -0,0 +1,38 @@
if(PKG_LATTE)
enable_language(Fortran)
find_package(LATTE)
if(LATTE_FOUND)
set(DOWNLOAD_LATTE_DEFAULT OFF)
else()
set(DOWNLOAD_LATTE_DEFAULT ON)
endif()
option(DOWNLOAD_LATTE "Download the LATTE library instead of using an already installed one" ${DOWNLOAD_LATTE_DEFAULT})
if(DOWNLOAD_LATTE)
if (CMAKE_VERSION VERSION_LESS "3.7") # due to SOURCE_SUBDIR
message(FATAL_ERROR "For downlading LATTE you need at least cmake-3.7")
endif()
if(CMAKE_GENERATOR STREQUAL "Ninja")
message(FATAL_ERROR "Cannot build downloaded LATTE library with Ninja build tool")
endif()
message(STATUS "LATTE download requested - we will build our own")
include(ExternalProject)
ExternalProject_Add(latte_build
URL https://github.com/lanl/LATTE/archive/v1.2.1.tar.gz
URL_MD5 85ac414fdada2d04619c8f936344df14
SOURCE_SUBDIR cmake
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${CMAKE_REQUEST_PIC}
-DBLAS_LIBRARIES=${BLAS_LIBRARIES} -DLAPACK_LIBRARIES=${LAPACK_LIBRARIES}
-DCMAKE_Fortran_COMPILER=${CMAKE_Fortran_COMPILER} -DCMAKE_Fortran_FLAGS=${CMAKE_Fortran_FLAGS}
-DCMAKE_Fortran_FLAGS_${BTYPE}=${CMAKE_Fortran_FLAGS_${BTYPE}} -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
)
ExternalProject_get_property(latte_build INSTALL_DIR)
set(LATTE_LIBRARIES ${INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR}/liblatte.a)
list(APPEND LAMMPS_DEPS latte_build)
else()
find_package(LATTE)
if(NOT LATTE_FOUND)
message(FATAL_ERROR "LATTE library not found, help CMake to find it by setting LATTE_LIBRARY, or set DOWNLOAD_LATTE=ON to download it")
endif()
endif()
list(APPEND LAMMPS_LINK_LIBS ${LATTE_LIBRARIES} ${LAPACK_LIBRARIES})
endif()

@@ -0,0 +1,29 @@
if(PKG_MESSAGE)
option(MESSAGE_ZMQ "Use ZeroMQ in MESSAGE package" OFF)
file(GLOB_RECURSE cslib_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.F
${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.c
${LAMMPS_LIB_SOURCE_DIR}/message/cslib/[^.]*.cpp)
add_library(cslib STATIC ${cslib_SOURCES})
if(BUILD_MPI)
target_compile_definitions(cslib PRIVATE -DMPI_YES)
set_target_properties(cslib PROPERTIES OUTPUT_NAME "csmpi")
else()
target_compile_definitions(cslib PRIVATE -DMPI_NO)
target_include_directories(cslib PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/src/STUBS_MPI)
set_target_properties(cslib PROPERTIES OUTPUT_NAME "csnompi")
endif()
if(MESSAGE_ZMQ)
target_compile_definitions(cslib PRIVATE -DZMQ_YES)
find_package(ZMQ REQUIRED)
target_include_directories(cslib PRIVATE ${ZMQ_INCLUDE_DIRS})
target_link_libraries(cslib PUBLIC ${ZMQ_LIBRARIES})
else()
target_compile_definitions(cslib PRIVATE -DZMQ_NO)
target_include_directories(cslib PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/message/cslib/src/STUBS_ZMQ)
endif()
list(APPEND LAMMPS_LINK_LIBS cslib)
include_directories(${LAMMPS_LIB_SOURCE_DIR}/message/cslib/src)
endif()

@@ -0,0 +1,45 @@
if(PKG_MSCG)
find_package(GSL REQUIRED)
find_package(MSCG QUIET)
if(MSCG_FOUND)
set(DOWNLOAD_MSCG_DEFAULT OFF)
else()
set(DOWNLOAD_MSCG_DEFAULT ON)
endif()
option(DOWNLOAD_MSCG "Download MSCG library instead of using an already installed one" ${DOWNLOAD_MSCG_DEFAULT})
if(DOWNLOAD_MSCG)
if (CMAKE_VERSION VERSION_LESS "3.7") # due to SOURCE_SUBDIR
message(FATAL_ERROR "For downlading MSCG you need at least cmake-3.7")
endif()
if(CMAKE_GENERATOR STREQUAL "Ninja")
message(FATAL_ERROR "Cannot build downloaded MSCG library with Ninja build tool")
endif()
include(ExternalProject)
if(NOT LAPACK_FOUND)
set(EXTRA_MSCG_OPTS "-DLAPACK_LIBRARIES=${CMAKE_CURRENT_BINARY_DIR}/liblinalg.a")
endif()
ExternalProject_Add(mscg_build
URL https://github.com/uchicago-voth/MSCG-release/archive/1.7.3.1.tar.gz
URL_MD5 8c45e269ee13f60b303edd7823866a91
SOURCE_SUBDIR src/CMake
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR> ${CMAKE_REQUEST_PIC} ${EXTRA_MSCG_OPTS}
BUILD_COMMAND make mscg INSTALL_COMMAND ""
)
ExternalProject_get_property(mscg_build BINARY_DIR)
set(MSCG_LIBRARIES ${BINARY_DIR}/libmscg.a)
ExternalProject_get_property(mscg_build SOURCE_DIR)
set(MSCG_INCLUDE_DIRS ${SOURCE_DIR}/src)
list(APPEND LAMMPS_DEPS mscg_build)
if(NOT LAPACK_FOUND)
file(MAKE_DIRECTORY ${MSCG_INCLUDE_DIRS})
add_dependencies(mscg_build linalg)
endif()
else()
find_package(MSCG)
if(NOT MSCG_FOUND)
message(FATAL_ERROR "MSCG not found, help CMake to find it by setting MSCG_LIBRARY and MSCG_INCLUDE_DIRS, or set DOWNLOAD_MSCG=ON to download it")
endif()
endif()
list(APPEND LAMMPS_LINK_LIBS ${MSCG_LIBRARIES} ${GSL_LIBRARIES} ${LAPACK_LIBRARIES})
include_directories(${MSCG_INCLUDE_DIRS})
endif()

@@ -0,0 +1,13 @@
if(PKG_OPT)
set(OPT_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/OPT)
set(OPT_SOURCES)
set_property(GLOBAL PROPERTY "OPT_SOURCES" "${OPT_SOURCES}")
# detects styles which have OPT version
RegisterStylesExt(${OPT_SOURCES_DIR} opt OPT_SOURCES)
get_property(OPT_SOURCES GLOBAL PROPERTY OPT_SOURCES)
list(APPEND LIB_SOURCES ${OPT_SOURCES})
include_directories(${OPT_SOURCES_DIR})
endif()

@@ -0,0 +1,6 @@
if(PKG_PYTHON)
find_package(PythonLibs REQUIRED)
add_definitions(-DLMP_PYTHON)
include_directories(${PYTHON_INCLUDE_DIR})
list(APPEND LAMMPS_LINK_LIBS ${PYTHON_LIBRARY})
endif()

@@ -0,0 +1,20 @@
# Fix qeq/fire requires MANYBODY (i.e. COMB and COMB3) to be installed
if(PKG_QEQ)
set(QEQ_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/QEQ)
file(GLOB QEQ_HEADERS ${QEQ_SOURCES_DIR}/fix*.h)
file(GLOB QEQ_SOURCES ${QEQ_SOURCES_DIR}/fix*.cpp)
if(NOT PKG_MANYBODY)
list(REMOVE_ITEM QEQ_HEADERS ${QEQ_SOURCES_DIR}/fix_qeq_fire.h)
list(REMOVE_ITEM QEQ_SOURCES ${QEQ_SOURCES_DIR}/fix_qeq_fire.cpp)
endif()
set_property(GLOBAL PROPERTY "QEQ_SOURCES" "${QEQ_SOURCES}")
foreach(MY_HEADER ${QEQ_HEADERS})
AddStyleHeader(${MY_HEADER} FIX)
endforeach()
get_property(QEQ_SOURCES GLOBAL PROPERTY QEQ_SOURCES)
list(APPEND LIB_SOURCES ${QEQ_SOURCES})
include_directories(${QEQ_SOURCES_DIR})
endif()

@@ -0,0 +1,8 @@
if(PKG_USER-H5MD)
enable_language(C)
find_package(HDF5 REQUIRED)
target_link_libraries(h5md ${HDF5_LIBRARIES})
target_include_directories(h5md PRIVATE ${HDF5_INCLUDE_DIRS})
include_directories(${HDF5_INCLUDE_DIRS})
endif()

@@ -0,0 +1,118 @@
if(PKG_USER-INTEL)
check_include_file_cxx(immintrin.h FOUND_IMMINTRIN)
if(NOT FOUND_IMMINTRIN)
message(FATAL_ERROR "immintrin.h header not found, Intel package won't work without it")
endif()
add_definitions(-DLMP_USER_INTEL)
set(INTEL_ARCH "cpu" CACHE STRING "Architectures used by USER-INTEL (cpu or knl)")
set(INTEL_ARCH_VALUES cpu knl)
set_property(CACHE INTEL_ARCH PROPERTY STRINGS ${INTEL_ARCH_VALUES})
validate_option(INTEL_ARCH INTEL_ARCH_VALUES)
string(TOUPPER ${INTEL_ARCH} INTEL_ARCH)
find_package(Threads QUIET)
if(Threads_FOUND)
set(INTEL_LRT_MODE "threads" CACHE STRING "Long-range threads mode (none, threads, or c++11)")
else()
set(INTEL_LRT_MODE "none" CACHE STRING "Long-range threads mode (none, threads, or c++11)")
endif()
set(INTEL_LRT_VALUES none threads c++11)
set_property(CACHE INTEL_LRT_MODE PROPERTY STRINGS ${INTEL_LRT_VALUES})
validate_option(INTEL_LRT_MODE INTEL_LRT_VALUES)
string(TOUPPER ${INTEL_LRT_MODE} INTEL_LRT_MODE)
if(INTEL_LRT_MODE STREQUAL "THREADS")
if(Threads_FOUND)
add_definitions(-DLMP_INTEL_USELRT)
list(APPEND LAMMPS_LINK_LIBS ${CMAKE_THREAD_LIBS_INIT})
else()
message(FATAL_ERROR "Must have working threads library for Long-range thread support")
endif()
endif()
if(INTEL_LRT_MODE STREQUAL "C++11")
add_definitions(-DLMP_INTEL_USELRT -DLMP_INTEL_LRT11)
endif()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 16)
message(FATAL_ERROR "USER-INTEL needs at least a 2016 Intel compiler, found ${CMAKE_CXX_COMPILER_VERSION}")
endif()
else()
message(WARNING "USER-INTEL gives best performance with Intel compilers")
endif()
find_package(TBB QUIET)
if(TBB_FOUND)
list(APPEND LAMMPS_LINK_LIBS ${TBB_MALLOC_LIBRARIES})
else()
add_definitions(-DLMP_INTEL_NO_TBB)
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
message(WARNING "USER-INTEL with Intel compilers should use TBB malloc libraries")
endif()
endif()
find_package(MKL QUIET)
if(MKL_FOUND)
add_definitions(-DLMP_USE_MKL_RNG)
list(APPEND LAMMPS_LINK_LIBS ${MKL_LIBRARIES})
else()
message(STATUS "Pair style dpd/intel will be faster with MKL libraries")
endif()
if((NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") AND (NOT ${LAMMPS_MEMALIGN} STREQUAL "64") AND (NOT ${LAMMPS_MEMALIGN} STREQUAL "128") AND (NOT ${LAMMPS_MEMALIGN} STREQUAL "256"))
message(FATAL_ERROR "USER-INTEL only supports memory alignment of 64, 128 or 256 on this platform")
endif()
if(INTEL_ARCH STREQUAL "KNL")
if(NOT CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
message(FATAL_ERROR "Must use Intel compiler with USER-INTEL for KNL architecture")
endif()
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -xHost -qopenmp -qoffload")
set(MIC_OPTIONS "-qoffload-option,mic,compiler,\"-fp-model fast=2 -mGLOB_default_function_attrs=\\\"gather_scatter_loop_unroll=4\\\"\"")
add_compile_options(-xMIC-AVX512 -qoffload -fno-alias -ansi-alias -restrict -qoverride-limits ${MIC_OPTIONS})
add_definitions(-DLMP_INTEL_OFFLOAD)
else()
if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xCOMMON-AVX512")
else()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -xHost")
endif()
include(CheckCXXCompilerFlag)
foreach(_FLAG -O2 -fp-model fast=2 -no-prec-div -qoverride-limits -qopt-zmm-usage=high -qno-offload -fno-alias -ansi-alias -restrict)
check_cxx_compiler_flag("${__FLAG}" COMPILER_SUPPORTS${_FLAG})
if(COMPILER_SUPPORTS${_FLAG})
add_compile_options(${_FLAG})
endif()
endforeach()
else()
add_compile_options(-O3 -ffast-math)
endif()
endif()
# collect sources
set(USER-INTEL_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-INTEL)
set(USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/fix_intel.cpp
${USER-INTEL_SOURCES_DIR}/fix_nh_intel.cpp
${USER-INTEL_SOURCES_DIR}/intel_buffers.cpp
${USER-INTEL_SOURCES_DIR}/nbin_intel.cpp
${USER-INTEL_SOURCES_DIR}/npair_intel.cpp)
set_property(GLOBAL PROPERTY "USER-INTEL_SOURCES" "${USER-INTEL_SOURCES}")
# detect styles which have a USER-INTEL version
RegisterStylesExt(${USER-INTEL_SOURCES_DIR} intel USER-INTEL_SOURCES)
RegisterNBinStyle(${USER-INTEL_SOURCES_DIR}/nbin_intel.h)
RegisterNPairStyle(${USER-INTEL_SOURCES_DIR}/npair_intel.h)
RegisterFixStyle(${USER-INTEL_SOURCES_DIR}/fix_intel.h)
get_property(USER-INTEL_SOURCES GLOBAL PROPERTY USER-INTEL_SOURCES)
if(PKG_KSPACE)
list(APPEND USER-INTEL_SOURCES ${USER-INTEL_SOURCES_DIR}/verlet_lrt_intel.cpp)
RegisterIntegrateStyle(${USER-INTEL_SOURCES_DIR}/verlet_lrt_intel.h)
endif()
list(APPEND LIB_SOURCES ${USER-INTEL_SOURCES})
include_directories(${USER-INTEL_SOURCES_DIR})
endif()

@@ -0,0 +1,10 @@
if(PKG_USER-MOLFILE)
set(MOLFILE_INCLUDE_DIRS "${LAMMPS_LIB_SOURCE_DIR}/molfile" CACHE STRING "Path to VMD molfile plugin headers")
add_library(molfile INTERFACE)
target_include_directories(molfile INTERFACE ${MOLFILE_INCLUDE_DIRS})
# no need to link with -ldl on windows
if(NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
target_link_libraries(molfile INTERFACE ${CMAKE_DL_LIBS})
endif()
list(APPEND LAMMPS_LINK_LIBS molfile)
endif()

@@ -0,0 +1,6 @@
if(PKG_USER-NETCDF)
find_package(NetCDF REQUIRED)
include_directories(${NETCDF_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${NETCDF_LIBRARIES})
add_definitions(-DLMP_HAS_NETCDF -DNC_64BIT_DATA=0x0020)
endif()

@@ -0,0 +1,42 @@
if(PKG_USER-OMP)
set(USER-OMP_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-OMP)
set(USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/thr_data.cpp
${USER-OMP_SOURCES_DIR}/thr_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_nh_omp.cpp
${USER-OMP_SOURCES_DIR}/fix_nh_sphere_omp.cpp
${USER-OMP_SOURCES_DIR}/domain_omp.cpp)
add_definitions(-DLMP_USER_OMP)
set_property(GLOBAL PROPERTY "OMP_SOURCES" "${USER-OMP_SOURCES}")
# detects styles which have USER-OMP version
RegisterStylesExt(${USER-OMP_SOURCES_DIR} omp OMP_SOURCES)
RegisterFixStyle(${USER-OMP_SOURCES_DIR}/fix_omp.h)
get_property(USER-OMP_SOURCES GLOBAL PROPERTY OMP_SOURCES)
# manually add package dependent source files from USER-OMP that do not provide styles
if(PKG_ASPHERE)
list(APPEND USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/fix_nh_asphere_omp.cpp)
endif()
if(PKG_RIGID)
list(APPEND USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/fix_rigid_nh_omp.cpp)
endif()
if(PKG_USER-REAXC)
list(APPEND USER-OMP_SOURCES ${USER-OMP_SOURCES_DIR}/reaxc_bond_orders_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_hydrogen_bonds_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_nonbonded_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_bonds_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_init_md_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_torsion_angles_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_forces_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_multi_body_omp.cpp
${USER-OMP_SOURCES_DIR}/reaxc_valence_angles_omp.cpp)
endif()
list(APPEND LIB_SOURCES ${USER-OMP_SOURCES})
include_directories(${USER-OMP_SOURCES_DIR})
endif()

@@ -0,0 +1,79 @@
if(PKG_USER-PLUMED)
find_package(GSL REQUIRED)
set(PLUMED_MODE "static" CACHE STRING "Linkage mode for Plumed2 library")
set(PLUMED_MODE_VALUES static shared runtime)
set_property(CACHE PLUMED_MODE PROPERTY STRINGS ${PLUMED_MODE_VALUES})
validate_option(PLUMED_MODE PLUMED_MODE_VALUES)
string(TOUPPER ${PLUMED_MODE} PLUMED_MODE)
find_package(PkgConfig QUIET)
set(DOWNLOAD_PLUMED_DEFAULT ON)
if(PKG_CONFIG_FOUND)
pkg_check_modules(PLUMED QUIET plumed)
if(PLUMED_FOUND)
set(DOWNLOAD_PLUMED_DEFAULT OFF)
endif()
endif()
option(DOWNLOAD_PLUMED "Download Plumed package instead of using an already installed one" ${DOWNLOAD_PLUMED_DEFAULT})
if(DOWNLOAD_PLUMED)
if(CMAKE_GENERATOR STREQUAL "Ninja")
message(FATAL_ERROR "Cannot build downloaded Plumed library with Ninja build tool")
endif()
if(BUILD_MPI)
set(PLUMED_CONFIG_MPI "--enable-mpi")
set(PLUMED_CONFIG_CC ${CMAKE_MPI_C_COMPILER})
set(PLUMED_CONFIG_CXX ${CMAKE_MPI_CXX_COMPILER})
else()
set(PLUMED_CONFIG_MPI "--disable-mpi")
set(PLUMED_CONFIG_CC ${CMAKE_C_COMPILER})
set(PLUMED_CONFIG_CXX ${CMAKE_CXX_COMPILER})
endif()
if(BUILD_OMP)
set(PLUMED_CONFIG_OMP "--enable-openmp")
else()
set(PLUMED_CONFIG_OMP "--disable-openmp")
endif()
message(STATUS "PLUMED download requested - we will build our own")
include(ExternalProject)
ExternalProject_Add(plumed_build
URL https://github.com/plumed/plumed2/releases/download/v2.5.1/plumed-src-2.5.1.tgz
URL_MD5 c2a7b519e32197a120cdf47e0f194f81
BUILD_IN_SOURCE 1
CONFIGURE_COMMAND <SOURCE_DIR>/configure --prefix=<INSTALL_DIR>
${CONFIGURE_REQUEST_PIC}
--enable-modules=all
${PLUMED_CONFIG_MPI}
${PLUMED_CONFIG_OMP}
CXX=${PLUMED_CONFIG_CXX}
CC=${PLUMED_CONFIG_CC}
)
ExternalProject_get_property(plumed_build INSTALL_DIR)
set(PLUMED_INSTALL_DIR ${INSTALL_DIR})
list(APPEND LAMMPS_DEPS plumed_build)
if(PLUMED_MODE STREQUAL "STATIC")
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumed.a ${GSL_LIBRARIES} ${LAPACK_LIBRARIES} ${CMAKE_DL_LIBS})
elseif(PLUMED_MODE STREQUAL "SHARED")
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumed.so ${PLUMED_INSTALL_DIR}/lib/libplumedKernel.so ${CMAKE_DL_LIBS})
elseif(PLUMED_MODE STREQUAL "RUNTIME")
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_INSTALL_DIR}/lib/libplumedKernel.so)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumedWrapper.a -rdynamic ${CMAKE_DL_LIBS})
endif()
set(PLUMED_INCLUDE_DIRS "${PLUMED_INSTALL_DIR}/include")
else()
find_package(PkgConfig REQUIRED)
pkg_check_modules(PLUMED REQUIRED plumed)
if(PLUMED_MODE STREQUAL "STATIC")
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
elseif(PLUMED_MODE STREQUAL "SHARED")
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.shared)
elseif(PLUMED_MODE STREQUAL "RUNTIME")
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_LIBDIR}/libplumedKernel.so)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.runtime)
endif()
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_LOAD})
endif()
include_directories(${PLUMED_INCLUDE_DIRS})
endif()
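As a configuration sketch (assuming the usual out-of-source build in a "build" directory next to the cmake folder), the options defined in this fragment could be selected at configure time like this:

cmake -D PKG_USER-PLUMED=yes -D PLUMED_MODE=shared -D DOWNLOAD_PLUMED=on ../cmake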

View File

@ -0,0 +1,9 @@
if(PKG_USER-QMMM)
enable_language(Fortran)
enable_language(C)
message(WARNING "Building QMMM with CMake is still experimental")
find_package(QE REQUIRED)
include_directories(${QE_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${QE_LIBRARIES})
endif()

View File

@ -0,0 +1,5 @@
if(PKG_USER-QUIP)
enable_language(Fortran)
find_package(QUIP REQUIRED)
list(APPEND LAMMPS_LINK_LIBS ${QUIP_LIBRARIES} ${LAPACK_LIBRARIES})
endif()

View File

@ -0,0 +1,62 @@
if(PKG_USER-SCAFACOS)
enable_language(Fortran)
enable_language(C)
find_package(GSL REQUIRED)
find_package(PkgConfig QUIET)
set(DOWNLOAD_SCAFACOS_DEFAULT ON)
if(PKG_CONFIG_FOUND)
pkg_check_modules(SCAFACOS QUIET scafacos)
if(SCAFACOS_FOUND)
set(DOWNLOAD_SCAFACOS_DEFAULT OFF)
endif()
endif()
option(DOWNLOAD_SCAFACOS "Download ScaFaCoS library instead of using an already installed one" ${DOWNLOAD_SCAFACOS_DEFAULT})
if(DOWNLOAD_SCAFACOS)
if(CMAKE_GENERATOR STREQUAL "Ninja")
message(FATAL_ERROR "Cannot build downloaded ScaFaCoS library with Ninja build tool")
endif()
message(STATUS "ScaFaCoS download requested - we will build our own")
include(ExternalProject)
ExternalProject_Add(scafacos_build
URL https://github.com/scafacos/scafacos/releases/download/v1.0.1/scafacos-1.0.1.tar.gz
URL_MD5 bd46d74e3296bd8a444d731bb10c1738
CONFIGURE_COMMAND <SOURCE_DIR>/configure --prefix=<INSTALL_DIR> --disable-doc
--enable-fcs-solvers=fmm,p2nfft,direct,ewald,p3m
--with-internal-fftw --with-internal-pfft
--with-internal-pnfft ${CONFIGURE_REQUEST_PIC}
FC=${CMAKE_MPI_Fortran_COMPILER}
CXX=${CMAKE_MPI_CXX_COMPILER}
CC=${CMAKE_MPI_C_COMPILER}
F77=
)
ExternalProject_get_property(scafacos_build INSTALL_DIR)
set(SCAFACOS_BUILD_DIR ${INSTALL_DIR})
set(SCAFACOS_INCLUDE_DIRS ${SCAFACOS_BUILD_DIR}/include)
list(APPEND LAMMPS_DEPS scafacos_build)
# list and order from pkg_config file of ScaFaCoS build
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_direct.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_ewald.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_fmm.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_p2nfft.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_p3m.a)
list(APPEND LAMMPS_LINK_LIBS ${GSL_LIBRARIES})
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_near.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_gridsort.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_resort.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_redist.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_common.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_pnfft.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_pfft.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_fftw3_mpi.a)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_BUILD_DIR}/lib/libfcs_fftw3.a)
list(APPEND LAMMPS_LINK_LIBS ${MPI_Fortran_LIBRARIES})
list(APPEND LAMMPS_LINK_LIBS ${MPI_C_LIBRARIES})
else()
find_package(PkgConfig REQUIRED)
pkg_check_modules(SCAFACOS REQUIRED scafacos)
list(APPEND LAMMPS_LINK_LIBS ${SCAFACOS_LDFLAGS})
endif()
include_directories(${SCAFACOS_INCLUDE_DIRS})
endif()

View File

@ -0,0 +1,13 @@
# Fix rigid/meso requires RIGID to be installed
if(PKG_USER-SDPD)
set(USER-SDPD_SOURCES_DIR ${LAMMPS_SOURCE_DIR}/USER-SDPD)
get_property(hlist GLOBAL PROPERTY FIX)
if(NOT PKG_RIGID)
list(REMOVE_ITEM hlist ${USER-SDPD_SOURCES_DIR}/fix_rigid_meso.h)
list(REMOVE_ITEM LIB_SOURCES ${USER-SDPD_SOURCES_DIR}/fix_rigid_meso.cpp)
endif()
set_property(GLOBAL PROPERTY FIX "${hlist}")
include_directories(${USER-SDPD_SOURCES_DIR})
endif()

View File

@ -0,0 +1,28 @@
if(PKG_USER-SMD)
find_package(Eigen3 NO_MODULE)
if(EIGEN3_FOUND)
set(DOWNLOAD_EIGEN3_DEFAULT OFF)
else()
set(DOWNLOAD_EIGEN3_DEFAULT ON)
endif()
option(DOWNLOAD_EIGEN3 "Download Eigen3 instead of using an already installed one" ${DOWNLOAD_EIGEN3_DEFAULT})
if(DOWNLOAD_EIGEN3)
message(STATUS "Eigen3 download requested - we will build our own")
include(ExternalProject)
ExternalProject_Add(Eigen3_build
URL http://bitbucket.org/eigen/eigen/get/3.3.7.tar.gz
URL_MD5 f2a417d083fe8ca4b8ed2bc613d20f07
CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND ""
)
ExternalProject_get_property(Eigen3_build SOURCE_DIR)
set(EIGEN3_INCLUDE_DIR ${SOURCE_DIR})
list(APPEND LAMMPS_DEPS Eigen3_build)
else()
find_package(Eigen3 NO_MODULE)
mark_as_advanced(Eigen3_DIR)
if(NOT EIGEN3_FOUND)
message(FATAL_ERROR "Eigen3 not found, help CMake to find it by setting EIGEN3_INCLUDE_DIR, or set DOWNLOAD_EIGEN3=ON to download it")
endif()
endif()
include_directories(${EIGEN3_INCLUDE_DIR})
endif()

View File

@ -0,0 +1,6 @@
if(PKG_USER-VTK)
find_package(VTK REQUIRED NO_MODULE)
include(${VTK_USE_FILE})
add_definitions(-DLAMMPS_VTK)
list(APPEND LAMMPS_LINK_LIBS ${VTK_LIBRARIES})
endif()

View File

@ -0,0 +1,41 @@
if(PKG_VORONOI)
find_package(VORO)
if(VORO_FOUND)
set(DOWNLOAD_VORO_DEFAULT OFF)
else()
set(DOWNLOAD_VORO_DEFAULT ON)
endif()
option(DOWNLOAD_VORO "Download and compile the Voro++ library instead of using an already installed one" ${DOWNLOAD_VORO_DEFAULT})
if(DOWNLOAD_VORO)
if(CMAKE_GENERATOR STREQUAL "Ninja")
message(FATAL_ERROR "Cannot build downloaded Voro++ library with Ninja build tool")
endif()
message(STATUS "Voro++ download requested - we will build our own")
include(ExternalProject)
if(BUILD_SHARED_LIBS)
set(VORO_BUILD_CFLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS} ${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${BTYPE}}")
else()
set(VORO_BUILD_CFLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_FLAGS_${BTYPE}}")
endif()
string(APPEND VORO_BUILD_CFLAGS ${CMAKE_CXX_FLAGS})
set(VORO_BUILD_OPTIONS CXX=${CMAKE_CXX_COMPILER} CFLAGS=${VORO_BUILD_CFLAGS})
ExternalProject_Add(voro_build
URL https://download.lammps.org/thirdparty/voro++-0.4.6.tar.gz
URL_MD5 2338b824c3b7b25590e18e8df5d68af9
CONFIGURE_COMMAND "" BUILD_COMMAND make ${VORO_BUILD_OPTIONS} BUILD_IN_SOURCE 1 INSTALL_COMMAND ""
)
ExternalProject_get_property(voro_build SOURCE_DIR)
set(VORO_LIBRARIES ${SOURCE_DIR}/src/libvoro++.a)
set(VORO_INCLUDE_DIRS ${SOURCE_DIR}/src)
list(APPEND LAMMPS_DEPS voro_build)
else()
find_package(VORO)
if(NOT VORO_FOUND)
message(FATAL_ERROR "Voro++ library not found. Help CMake to find it by setting VORO_LIBRARY and VORO_INCLUDE_DIR, or set DOWNLOAD_VORO=ON to download it")
endif()
endif()
include_directories(${VORO_INCLUDE_DIRS})
list(APPEND LAMMPS_LINK_LIBS ${VORO_LIBRARIES})
endif()

View File

@ -0,0 +1,52 @@
###############################################################################
# Testing
###############################################################################
option(ENABLE_TESTING "Enable testing" OFF)
if(ENABLE_TESTING AND BUILD_EXE)
enable_testing()
option(LAMMPS_TESTING_SOURCE_DIR "Location of lammps-testing source directory" "")
option(LAMMPS_TESTING_GIT_TAG "Git tag of lammps-testing" "master")
mark_as_advanced(LAMMPS_TESTING_SOURCE_DIR LAMMPS_TESTING_GIT_TAG)
if (CMAKE_VERSION VERSION_GREATER "3.10.3" AND NOT LAMMPS_TESTING_SOURCE_DIR)
include(FetchContent)
FetchContent_Declare(lammps-testing
GIT_REPOSITORY https://github.com/lammps/lammps-testing.git
GIT_TAG ${LAMMPS_TESTING_GIT_TAG}
)
FetchContent_GetProperties(lammps-testing)
if(NOT lammps-testing_POPULATED)
message(STATUS "Downloading tests...")
FetchContent_Populate(lammps-testing)
endif()
set(LAMMPS_TESTING_SOURCE_DIR ${lammps-testing_SOURCE_DIR})
elseif(NOT LAMMPS_TESTING_SOURCE_DIR)
message(WARNING "Full test-suite requires CMake >= 3.11 or copy of\n"
"https://github.com/lammps/lammps-testing in LAMMPS_TESTING_SOURCE_DIR")
endif()
add_test(ShowHelp ${CMAKE_BINARY_DIR}/${LAMMPS_BINARY} -help)
if(EXISTS ${LAMMPS_TESTING_SOURCE_DIR})
message(STATUS "Running test discovery...")
file(GLOB_RECURSE TEST_SCRIPTS ${LAMMPS_TESTING_SOURCE_DIR}/tests/core/*/in.*)
foreach(script_path ${TEST_SCRIPTS})
get_filename_component(TEST_NAME ${script_path} EXT)
get_filename_component(SCRIPT_NAME ${script_path} NAME)
get_filename_component(PARENT_DIR ${script_path} DIRECTORY)
string(SUBSTRING ${TEST_NAME} 1 -1 TEST_NAME)
string(REPLACE "-" "_" TEST_NAME ${TEST_NAME})
string(REPLACE "+" "_" TEST_NAME ${TEST_NAME})
set(TEST_NAME "test_core_${TEST_NAME}_serial")
add_test(${TEST_NAME} ${CMAKE_BINARY_DIR}/${LAMMPS_BINARY} -in ${SCRIPT_NAME})
set_tests_properties(${TEST_NAME} PROPERTIES WORKING_DIRECTORY ${PARENT_DIR})
endforeach()
list(LENGTH TEST_SCRIPTS NUM_TESTS)
message(STATUS "Found ${NUM_TESTS} tests.")
endif()
endif()
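A minimal sketch of exercising this test setup with the standard CMake/CTest workflow (the option name is taken from the fragment above; BUILD_EXE is assumed to be on, its default):

cmake -D ENABLE_TESTING=on ../cmake
make
ctest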

View File

@ -1,4 +1,4 @@
.TH LAMMPS "5 June 2019" "2019-06-05"
.TH LAMMPS "18 June 2019" "2019-06-18"
.SH NAME
.B LAMMPS
\- Molecular Dynamics Simulator.

View File

@ -32,10 +32,18 @@ cmake \[options ...\] ../cmake # configuration with (command-line) cmake
make # compilation :pre
The cmake command will detect available features, enable selected
packages and options, and will generate the build environment. The make
command will then compile and link LAMMPS, producing (by default) an
executable called "lmp" and a library called "liblammps.a" in the
"build" folder.
packages and options, and will generate the build environment. By default
this build environment will be created for "Unix Makefiles" on most
platforms and particularly on Linux. However, alternate build tools
(e.g. Ninja) and support files for Integrated Development Environments
(IDEs) like Eclipse, CodeBlocks, or Kate can be generated, too. This is
selected via the "-G" command line flag. For the rest of the documentation
we will assume that the build environment is generated for makefiles
and thus the make command will be used to compile and link LAMMPS as
indicated above, producing (by default) an executable called "lmp" and
a library called "liblammps.a" in the "build" folder. When generating
a build environment for the "Ninja" build tool, the build command would
be "ninja" instead of "make".
If your machine has multiple CPU cores (most do these days), using a
command like "make -jN" (with N being the number of available local

View File

@ -28,8 +28,12 @@ OPT.
"none"_bond_none.html,
"zero"_bond_zero.html,
"hybrid"_bond_hybrid.html :tb(c=3,ea=c)
"hybrid"_bond_hybrid.html,
,
,
,
,
,
"class2 (ko)"_bond_class2.html,
"fene (iko)"_bond_fene.html,
"fene/expand (o)"_bond_fene_expand.html,
@ -56,8 +60,12 @@ OPT.
"none"_angle_none.html,
"zero"_angle_zero.html,
"hybrid"_angle_hybrid.html :tb(c=3,ea=c)
"hybrid"_angle_hybrid.html,
,
,
,
,
,
"charmm (iko)"_angle_charmm.html,
"class2 (ko)"_angle_class2.html,
"class2/p6"_angle_class2.html,
@ -89,8 +97,12 @@ OPT.
"none"_dihedral_none.html,
"zero"_dihedral_zero.html,
"hybrid"_dihedral_hybrid.html :tb(c=3,ea=c)
"hybrid"_dihedral_hybrid.html,
,
,
,
,
,
"charmm (iko)"_dihedral_charmm.html,
"charmmfsw"_dihedral_charmm.html,
"class2 (ko)"_dihedral_class2.html,
@ -117,8 +129,12 @@ OPT.
"none"_improper_none.html,
"zero"_improper_zero.html,
"hybrid"_improper_hybrid.html :tb(c=3,ea=c)
"hybrid"_improper_hybrid.html,
,
,
,
,
,
"class2 (ko)"_improper_class2.html,
"cossq (o)"_improper_cossq.html,
"cvff (io)"_improper_cvff.html,

View File

@ -27,8 +27,11 @@ OPT.
"none"_pair_none.html,
"zero"_pair_zero.html,
"hybrid (k)"_pair_hybrid.html,
"hybrid/overlay (k)"_pair_hybrid.html :tb(c=4,ea=c)
"hybrid/overlay (k)"_pair_hybrid.html,
,
,
,
,
"adp (o)"_pair_adp.html,
"agni (o)"_pair_agni.html,
"airebo (io)"_pair_airebo.html,

Binary file not shown (new image, 38 KiB).

View File

@ -0,0 +1,42 @@
\documentclass[preview]{standalone}
\usepackage{varwidth}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amssymb,graphics,bm,setspace}
\begin{document}
\begin{varwidth}{50in}
\begin{equation}
\mathcal{H}_{\rm long}=
-\frac{\mu_{0} \left( \mu_B\right)^2}{4\pi}
\sum_{i,j,i\neq j}^{N}
\frac{g_i g_j}{r_{ij}^3}
\Big(3
\left(\bm{e}_{ij}\cdot \bm{s}_{i}\right)
\left(\bm{e}_{ij}\cdot \bm{s}_{j}\right)
-\bm{s}_i\cdot\bm{s}_j \Big)
\nonumber
\end{equation}
\begin{equation}
\bm{\omega}_i =
\frac{\mu_0 (\mu_B)^2}{4\pi\hbar}\sum_{j}
\frac{g_i g_j}{r_{ij}^3}
\, \Big(
3\,(\bm{e}_{ij}\cdot\bm{s}_{j})\bm{e}_{ij}
-\bm{s}_{j} \Big) \nonumber
\end{equation}
\begin{equation}
\bm{F}_i =
\frac{3\, \mu_0 (\mu_B)^2}{4\pi} \sum_j
\frac{g_i g_j}{r_{ij}^4}
\Big[\big( (\bm{s}_i\cdot\bm{s}_j)
-5(\bm{e}_{ij}\cdot\bm{s}_i)
(\bm{e}_{ij}\cdot\bm{s}_j)\big) \bm{e}_{ij}+
\big(
(\bm{e}_{ij}\cdot\bm{s}_i)\bm{s}_j+
(\bm{e}_{ij}\cdot\bm{s}_j)\bm{s}_i
\big)
\Big]
\nonumber
\end{equation}
\end{varwidth}
\end{document}

Binary file not shown (new image, 12 KiB).

View File

@ -0,0 +1,20 @@
\documentclass[preview]{standalone}
\usepackage{varwidth}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amssymb,graphics,bm,setspace}
\begin{document}
\begin{varwidth}{50in}
\begin{equation}
\mathcal{H}_{\rm long}=
-\frac{\mu_{0} \left( \mu_B\right)^2}{4\pi}
\sum_{i,j,i\neq j}^{N}
\frac{g_i g_j}{r_{ij}^3}
\Big(3
\left(\bm{e}_{ij}\cdot \bm{s}_{i}\right)
\left(\bm{e}_{ij}\cdot \bm{s}_{j}\right)
-\bm{s}_i\cdot\bm{s}_j \Big)
\nonumber
\end{equation}
\end{varwidth}
\end{document}

Binary file not shown (new image, 16 KiB).

View File

@ -0,0 +1,23 @@
\documentclass[preview]{standalone}
\usepackage{varwidth}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amssymb,graphics,bm,setspace}
\begin{document}
\begin{varwidth}{50in}
\begin{equation}
\bm{F}_i =
\frac{\mu_0 (\mu_B)^2}{4\pi} \sum_j
\frac{g_i g_j}{r_{ij}^4}
\Big[\big( (\bm{s}_i\cdot\bm{s}_j)
-5(\bm{e}_{ij}\cdot\bm{s}_i)
(\bm{e}_{ij}\cdot\bm{s}_j)\big) \bm{e}_{ij}+
\big(
(\bm{e}_{ij}\cdot\bm{s}_i)\bm{s}_j+
(\bm{e}_{ij}\cdot\bm{s}_j)\bm{s}_i
\big)
\Big]
\nonumber
\end{equation}
\end{varwidth}
\end{document}

Binary file not shown (new image, 9.2 KiB).

View File

@ -0,0 +1,17 @@
\documentclass[preview]{standalone}
\usepackage{varwidth}
\usepackage[utf8x]{inputenc}
\usepackage{amsmath,amssymb,graphics,bm,setspace}
\begin{document}
\begin{varwidth}{50in}
\begin{equation}
\bm{\omega}_i =
\frac{\mu_0 (\mu_B)^2}{4\pi\hbar}\sum_{j}
\frac{g_i g_j}{r_{ij}^3}
\, \Big(
3\,(\bm{e}_{ij}\cdot\bm{s}_{j})\bm{e}_{ij}
-\bm{s}_{j} \Big) \nonumber
\end{equation}
\end{varwidth}
\end{document}

View File

@ -636,12 +636,12 @@ Please ensure reaction map files are properly formatted. :dd
{Bond/react: Atom affected by reaction too close to template edge} :dt
This means an atom which changes type during the reaction is too close
to an 'edge' atom defined in the superimpose file. This could cause
incorrect assignment of bonds, angle, etc. Generally, this means you
must include more atoms in your templates, such that there are at
least two atoms between each atom involved in the reaction and an edge
atom. :dd
This means an atom which changes type or connectivity during the
reaction is too close to an 'edge' atom defined in the superimpose
file. This could cause incorrect assignment of bonds, angles, etc.
Generally, this means you must include more atoms in your templates,
such that there are at least two atoms between each atom involved in
the reaction and an edge atom. :dd
{Bond/react: Fix bond/react needs ghost atoms from farther away} :dt
@ -2202,10 +2202,6 @@ Self-explanatory. :dd
This is a current restriction in LAMMPS. :dd
{Cannot use pair hybrid with GPU neighbor list builds} :dt
Neighbor list builds must be done on the CPU for this pair style. :dd
{Cannot use pair tail corrections with 2d simulations} :dt
The correction factors are only currently defined for 3d systems. :dd
@ -5523,10 +5519,6 @@ Self-explanatory. :dd
For this pair style, you cannot run part of the force calculation on
the host. See the package command. :dd
{GPU split param must be positive for hybrid pair styles} :dt
See the package gpu command. :dd
{GPUs are requested but Kokkos has not been compiled for CUDA} :dt
Re-compile Kokkos with CUDA support to use GPUs. :dd
@ -7097,6 +7089,18 @@ Self-explanatory. :dd
One or more GPUs must be used when Kokkos is compiled for CUDA. :dd
{Kspace_modify mesh parameter must be all zero or all positive} :dt
Valid kspace mesh parameters are >0. The code will try to auto-detect
suitable values when all three mesh sizes are set to zero (the default). :dd
{Kspace_modify mesh/disp parameter must be all zero or all positive} :dt
Valid kspace mesh/disp parameters are >0. The code will try to auto-detect
suitable values when all three mesh sizes are set to zero [and]
the required accuracy via {force/disp/real} as well as
{force/disp/kspace} is set. :dd
{Kspace style does not support compute group/group} :dt
Self-explanatory. :dd

View File

@ -82,10 +82,14 @@ bond/angle/dihedral. LAMMPS computes this by taking the maximum bond
length, multiplying by the number of bonds in the interaction (e.g. 3
for a dihedral) and adding a small amount of stretch. :dd
{Bond/react: An atom in 'react #%d' changes bond connectivity but not atom type} :dt
{Bond/react: Atom affected by reaction too close to template edge} :dt
You may want to double-check that all atom types are properly assigned
in the post-reaction template. :dd
This means an atom which changes type or connectivity during the
reaction is too close to an 'edge' atom defined in the superimpose
file. This could cause incorrect assignment of bonds, angles, etc.
Generally, this means you must include more atoms in your templates,
such that there are at least two atoms between each atom involved in
the reaction and an edge atom. :dd
{Both groups in compute group/group have a net charge; the Kspace boundary correction to energy will be non-zero} :dt

View File

@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="5 Jun 2019 version">
<META NAME="docnumber" CONTENT="18 Jun 2019 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
:line
LAMMPS Documentation :c,h1
5 Jun 2019 version :c,h2
18 Jun 2019 version :c,h2
"What is a LAMMPS version?"_Manual_version.html

View File

@ -104,7 +104,7 @@ code (with a performance penalty due to having data transfers between
host and GPU). :ulb,l
The GPU package requires neighbor lists to be built on the CPU when using
exclusion lists, hybrid pair styles, or a triclinic simulation box. :l
exclusion lists, or a triclinic simulation box. :l
The GPU package can be compiled for CUDA or OpenCL and thus supports
both Nvidia and AMD GPUs well. On Nvidia hardware, using CUDA is typically

View File

@ -46,7 +46,7 @@ software version 7.5 or later must be installed on your system. See
the discussion for the "GPU package"_Speed_gpu.html for details of how
to check and do this.
NOTE: Kokkos with CUDA currently implicitly assumes, that the MPI
NOTE: Kokkos with CUDA currently implicitly assumes that the MPI
library is CUDA-aware and has support for GPU-direct. This is not
always the case, especially when using pre-compiled MPI libraries
provided by a Linux distribution. This is not a problem when using
@ -207,19 +207,21 @@ supports.
[Running on GPUs:]
Use the "-k" "command-line switch"_Run_options.html to
specify the number of GPUs per node. Typically the -np setting of the
mpirun command should set the number of MPI tasks/node to be equal to
the number of physical GPUs on the node. You can assign multiple MPI
tasks to the same GPU with the KOKKOS package, but this is usually
only faster if significant portions of the input script have not
been ported to use Kokkos. Using CUDA MPS is recommended in this
scenario. Using a CUDA-aware MPI library with support for GPU-direct
is highly recommended. GPU-direct use can be avoided by using
"-pk kokkos gpu/direct no"_package.html.
As above for multi-core CPUs (and no GPU), if N is the number of
physical cores/node, then the number of MPI tasks/node should not
exceed N.
Use the "-k" "command-line switch"_Run_options.html to specify the
number of GPUs per node. Typically the -np setting of the mpirun command
should set the number of MPI tasks/node to be equal to the number of
physical GPUs on the node. You can assign multiple MPI tasks to the same
GPU with the KOKKOS package, but this is usually only faster if some
portions of the input script have not been ported to use Kokkos. In this
case, also packing/unpacking communication buffers on the host may give
speedup (see the KOKKOS "package"_package.html command). Using CUDA MPS
is recommended in this scenario.
Using a CUDA-aware MPI library with
support for GPU-direct is highly recommended. GPU-direct use can be
avoided by using "-pk kokkos gpu/direct no"_package.html. As above for
multi-core CPUs (and no GPU), if N is the number of physical cores/node,
then the number of MPI tasks/node should not exceed N.
-k on g Ng :pre

View File

@ -24,12 +24,7 @@ twojmax = band limit for bispectrum components (non-negative integer) :l
R_1, R_2,... = list of cutoff radii, one for each type (distance units) :l
w_1, w_2,... = list of neighbor weights, one for each type :l
zero or more keyword/value pairs may be appended :l
keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
{diagonal} value = {0} or {1} or {2} or {3}
{0} = all j1, j2, j <= twojmax, j2 <= j1
{1} = subset satisfying j1 == j2
{2} = subset satisfying j1 == j2 == j3
{3} = subset satisfying j2 <= j1 <= j
keyword = {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag} :l
{rmin0} value = parameter in distance to angle conversion (distance units)
{switchflag} value = {0} or {1}
{0} = do not use switching function
@ -44,7 +39,7 @@ keyword = {diagonal} or {rmin0} or {switchflag} or {bzeroflag} or {quadraticflag
[Examples:]
compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 diagonal 3 rmin0 0.0
compute b all sna/atom 1.4 0.99363 6 2.0 2.4 0.75 1.0 rmin0 0.0
compute db all sna/atom 1.4 0.95 6 2.0 1.0
compute vb all sna/atom 1.4 0.95 6 2.0 1.0 :pre
@ -151,7 +146,7 @@ The argument {rfac0} and the optional keyword {rmin0} define the
linear mapping from radial distance to polar angle {theta0} on the
3-sphere.
The argument {twojmax} and the keyword {diagonal} define which
The argument {twojmax} defines which
bispectrum components are generated. See section below on output for a
detailed explanation of the number of bispectrum components and the
order in which they are listed.
@ -192,23 +187,18 @@ command that includes all pairs in the neighbor list.
Compute {sna/atom} calculates a per-atom array, each column
corresponding to a particular bispectrum component. The total number
of columns and the identity of the bispectrum component contained in
each column depend on the values of {twojmax} and {diagonal}, as
each column depend on the value of {twojmax}, as
described by the following piece of python code:
for j1 in range(0,twojmax+1):
if(diagonal==2):
print j1/2.,j1/2.,j1/2.
elif(diagonal==1):
for j in range(0,min(twojmax,2*j1)+1,2):
print j1/2.,j1/2.,j/2.
elif(diagonal==0):
for j2 in range(0,j1+1):
for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
print j1/2.,j2/2.,j/2.
elif(diagonal==3):
for j2 in range(0,j1+1):
for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
if (j>=j1): print j1/2.,j2/2.,j/2. :pre
for j2 in range(0,j1+1):
for j in range(j1-j2,min(twojmax,j1+j2)+1,2):
if (j>=j1): print j1/2.,j2/2.,j/2. :pre
NOTE: the {diagonal} keyword allowing other possible choices
for the number of bispectrum components was removed in 2019,
since all potentials use the value of 3, corresponding to the
above set of bispectrum components.
Compute {snad/atom} evaluates a per-atom array. The columns are
arranged into {ntypes} blocks, listed in order of atom type {I}. Each
@ -259,7 +249,7 @@ package"_Build_package.html doc page for more info.
[Default:]
The optional keyword defaults are {diagonal} = 0, {rmin0} = 0,
The optional keyword defaults are {rmin0} = 0,
{switchflag} = 1, {bzeroflag} = 1, {quadraticflag} = 0,
:line

View File

@ -88,6 +88,8 @@ potentials only include the pair potential portion of the EAM
interaction when used by this compute, not the embedding term. Also
bonded or Kspace interactions do not contribute to this compute.
The computes in this package are not compatible with dynamic groups.
[Related commands:]
{compute group/group}_compute_group_group.html, {compute

View File

@ -392,7 +392,8 @@ boundaries can be set using "boundary"_boundary.html (the slab
approximation in not needed). The {slab} keyword is not currently
supported by Ewald or PPPM when using a triclinic simulation cell. The
slab correction has also been extended to point dipole interactions
"(Klapp)"_#Klapp in "kspace_style"_kspace_style.html {ewald/disp}.
"(Klapp)"_#Klapp in "kspace_style"_kspace_style.html {ewald/disp},
{ewald/dipole}, and {pppm/dipole}.
NOTE: If you wish to apply an electric field in the Z-direction, in
conjunction with the {slab} keyword, you should do it by adding

View File

@ -20,6 +20,10 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
accuracy = desired relative error in forces
{ewald/omp} value = accuracy
accuracy = desired relative error in forces
{ewald/dipole} value = accuracy
accuracy = desired relative error in forces
{ewald/dipole/spin} value = accuracy
accuracy = desired relative error in forces
{pppm} value = accuracy
accuracy = desired relative error in forces
{pppm/cg} values = accuracy (smallq)
@ -47,6 +51,10 @@ style = {none} or {ewald} or {ewald/disp} or {ewald/omp} or {pppm} or {pppm/cg}
accuracy = desired relative error in forces
{pppm/stagger} value = accuracy
accuracy = desired relative error in forces
{pppm/dipole} value = accuracy
accuracy = desired relative error in forces
{pppm/dipole/spin} value = accuracy
accuracy = desired relative error in forces
{msm} value = accuracy
accuracy = desired relative error in forces
{msm/cg} value = accuracy (smallq)
@ -105,9 +113,15 @@ The {ewald/disp} style adds a long-range dispersion sum option for
but in a more efficient manner than the {ewald} style. The 1/r^6
capability means that Lennard-Jones or Buckingham potentials can be
used without a cutoff, i.e. they become full long-range potentials.
The {ewald/disp} style can also be used with point-dipoles
"(Toukmaji)"_#Toukmaji and is currently the only kspace solver in
LAMMPS with this capability.
The {ewald/disp} style can also be used with point-dipoles, see
"(Toukmaji)"_#Toukmaji.
The {ewald/dipole} style adds long-range standard Ewald summations
for dipole-dipole interactions, see "(Toukmaji)"_#Toukmaji.
The {ewald/dipole/spin} style adds long-range standard Ewald
summations for magnetic dipole-dipole interactions between
magnetic spins.
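A minimal usage sketch (the accuracy value here is arbitrary, not a recommendation):

kspace_style ewald/dipole 1.0e-4 :pre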
:line
@ -128,6 +142,12 @@ The optional {smallq} argument defines the cutoff for the absolute
charge value which determines whether a particle is considered charged
or not. Its default value is 1.0e-5.
The {pppm/dipole} style invokes a particle-particle particle-mesh solver
for dipole-dipole interactions, following the method of "(Cerda)"_#Cerda2008.
The {pppm/dipole/spin} style invokes a particle-particle particle-mesh solver
for magnetic dipole-dipole interactions between magnetic spins.
The {pppm/tip4p} style is identical to the {pppm} style except that it
adds a charge at the massless 4th site in each TIP4P water molecule.
It should be used with "pair styles"_pair_style.html with a
@ -317,7 +337,10 @@ using ideas from chapter 3 of "(Hardy)"_#Hardy2006, with equation 3.197
of particular note. When using {msm} with non-periodic boundary
conditions, it is expected that the error estimation will be too
pessimistic. RMS force errors for dipoles when using {ewald/disp}
are estimated using equations 33 and 46 of "(Wang)"_#Wang.
or {ewald/dipole} are estimated using equations 33 and 46 of
"(Wang)"_#Wang. The RMS force errors for {pppm/dipole} are estimated
using the equations in "(Cerda)"_#Cerda2008.
See the "kspace_modify"_kspace_modify.html command for additional
options of the K-space solvers that can be set, including a {force}
@ -464,6 +487,9 @@ Illinois at Urbana-Champaign, (2006).
:link(Sutmann2013)
[(Sutmann)] Sutmann, Arnold, Fahrenberger, et. al., Physical review / E 88(6), 063308 (2013)
:link(Cerda2008)
[(Cerda)] Cerda, Ballenegger, Lenz, Holm, J Chem Phys 129, 234104 (2008)
:link(Who2012)
[(Who)] Who, Author2, Author3, J of Long Range Solvers, 35, 164-177
(2012).

View File

@ -64,13 +64,16 @@ args = arguments specific to the style :l
{no_affinity} values = none
{kokkos} args = keyword value ...
zero or more keyword/value pairs may be appended
keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
keywords = {neigh} or {neigh/qeq} or {neigh/thread} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse} or {gpu/direct}
{neigh} value = {full} or {half}
full = full neighbor list
half = half neighbor list built in thread-safe manner
{neigh/qeq} value = {full} or {half}
full = full neighbor list
half = half neighbor list built in thread-safe manner
{neigh/thread} value = {off} or {on}
off = thread only over atoms
on = thread over both atoms and neighbors
{newton} = {off} or {on}
off = set Newton pairwise and bonded flags off
on = set Newton pairwise and bonded flags on
@ -173,12 +176,10 @@ computation will be built. If {neigh} is {yes}, which is the default,
neighbor list building is performed on the GPU. If {neigh} is {no},
neighbor list building is performed on the CPU. GPU neighbor list
building currently cannot be used with a triclinic box. GPU neighbor
list calculation currently cannot be used with
"hybrid"_pair_hybrid.html pair styles. GPU neighbor lists are not
compatible with commands that are not GPU-enabled. When a non-GPU
enabled command requires a neighbor list, it will also be built on the
CPU. In these cases, it will typically be more efficient to only use
CPU neighbor list builds.
lists are not compatible with commands that are not GPU-enabled. When
a non-GPU enabled command requires a neighbor list, it will also be
built on the CPU. In these cases, it will typically be more efficient
to only use CPU neighbor list builds.
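For instance, CPU neighbor list builds can be requested explicitly like this (a sketch assuming one GPU per node):

package gpu 1 neigh no :pre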
The {newton} keyword sets the Newton flags for pairwise (not bonded)
interactions to {off} or {on}, the same as the "newton"_newton.html
@ -442,7 +443,19 @@ running on CPUs, a {half} neighbor list is the default because it is
often faster, just as it is for non-accelerated pair styles. Similarly,
the {neigh/qeq} keyword determines how neighbor lists are built for "fix
qeq/reax/kk"_fix_qeq_reax.html. If not explicitly set, the value of
{neigh/qeq} will match {neigh}.
If the {neigh/thread} keyword is set to {off}, then the KOKKOS package
threads only over atoms. However, for small systems, this may not expose
enough parallelism to keep a GPU busy. When this keyword is set to {on},
the KOKKOS package threads over both atoms and neighbors of atoms. When
using {neigh/thread} {on}, a full neighbor list must also be used. Using
{neigh/thread} {on} may be slower for large systems, so this option
is turned on by default only when there are 16K atoms or fewer owned by
an MPI rank and when using a full neighbor list. Not all KOKKOS-enabled
potentials support this keyword yet; those that do not will thread only over atoms. Many
simple pair-wise potentials such as Lennard-Jones do support threading
over both atoms and neighbors.
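As an illustration, threading over both atoms and neighbors with a full neighbor list could be requested like this (a sketch):

package kokkos neigh full neigh/thread on :pre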
The {newton} keyword sets the Newton flags for pairwise and bonded
interactions to {off} or {on}, the same as the "newton"_newton.html
@ -475,10 +488,10 @@ are rebuilt. The data is only for atoms that migrate to new processors.
"Forward" communication happens every timestep. "Reverse" communication
happens every timestep if the {newton} option is on. The data is for
atom coordinates and any other atom properties that need to be updated
for ghost atoms owned by each processor.
The {comm} keyword is simply a short-cut to set the same value for
the {comm/exchange}, {comm/forward}, and {comm/reverse} keywords.
The value options for all 3 keywords are {no} or {host} or {device}. A
value of {no} means to use the standard non-KOKKOS method of
@ -486,26 +499,26 @@ packing/unpacking data for the communication. A value of {host} means to
use the host, typically a multi-core CPU, and perform the
packing/unpacking in parallel with threads. A value of {device} means to
use the device, typically a GPU, to perform the packing/unpacking
operation.
The optimal choice for these keywords depends on the input script and
the hardware used. The {no} value is useful for verifying that the
Kokkos-based {host} and {device} values are working correctly. It is the
default when running on CPUs since it is usually the fastest.
When running on CPUs or Xeon Phi, the {host} and {device} values work
identically. When using GPUs, the {device} value is the default since it
will typically be optimal if all of your styles used in your input
script are supported by the KOKKOS package. In this case data can stay
on the GPU for many timesteps without being moved between the host and
GPU, if you use the {device} value. This requires that your MPI is able
to access GPU memory directly. Currently that is true for OpenMPI 1.8
(or later versions), Mvapich2 1.9 (or later), and CrayMPI. If your
script uses styles (e.g. fixes) which are not yet supported by the
KOKKOS package, then data has to be move between the host and device
anyway, so it is typically faster to let the host handle communication,
by using the {host} value. Using {host} instead of {no} will enable use
of multiple threads to pack/unpack communicated data.
GPU, if you use the {device} value. If your script uses styles (e.g.
fixes) which are not yet supported by the KOKKOS package, then data has
to be moved between the host and device anyway, so it is typically faster
to let the host handle communication, by using the {host} value. Using
{host} instead of {no} will enable use of multiple threads to
pack/unpack communicated data. When running small systems on a GPU,
performing the exchange pack/unpack on the host CPU can give speedup
since it reduces the number of CUDA kernel launches.
The {gpu/direct} keyword chooses whether GPU-direct will be used. When
this keyword is set to {on}, buffers in GPU memory are passed directly
@ -518,7 +531,8 @@ the {gpu/direct} keyword is automatically set to {off} by default. When
the {gpu/direct} keyword is set to {off} while any of the {comm}
keywords are set to {device}, the value for these {comm} keywords will
be automatically changed to {host}. This setting has no effect if not
running on GPUs.
running on GPUs. GPU-direct is available for OpenMPI 1.8 (or later
versions), Mvapich2 1.9 (or later), and CrayMPI.
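For example, host-side packing with GPU-direct disabled could be selected like this (a sketch):

package kokkos comm host gpu/direct off :pre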
:line
@ -630,11 +644,12 @@ neigh/qeq = full, newton = off, binsize for GPUs = 2x LAMMPS default
value, comm = device, gpu/direct = on. When LAMMPS can safely detect
that GPU-direct is not available, the default value of gpu/direct
becomes "off". For CPUs or Xeon Phis, the option defaults are neigh =
half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. These
settings are made automatically by the required "-k on" "command-line
switch"_Run_options.html. You can change them by using the package
kokkos command in your input script or via the "-pk kokkos command-line
switch"_Run_options.html.
half, neigh/qeq = half, newton = on, binsize = 0.0, and comm = no. The
option neigh/thread = on when there are 16K atoms or fewer on an MPI
rank, otherwise it is "off". These settings are made automatically by
the required "-k on" "command-line switch"_Run_options.html. You can
change them by using the package kokkos command in your input script or
via the "-pk kokkos command-line switch"_Run_options.html.
For the OMP package, the default is Nthreads = 0 and the option
defaults are neigh = yes. These settings are made automatically if

View File

@ -155,9 +155,12 @@ All of the lj/class2 pair styles write their information to "binary
restart files"_restart.html, so pair_style and pair_coeff commands do
not need to be specified in an input script that reads a restart file.
All of the lj/class2 pair styles can only be used via the {pair}
keyword of the "run_style respa"_run_style.html command. They do not
support the {inner}, {middle}, {outer} keywords.
Only the {lj/class2} pair style supports the use of the
{inner}, {middle}, and {outer} keywords of the "run_style
respa"_run_style.html command, meaning the pairwise forces can be
partitioned by distance at different levels of the rRESPA hierarchy.
The other styles only support the {pair} keyword of run_style respa.
See the "run_style"_run_style.html command for details.
[Restrictions:]

View File

@ -38,7 +38,7 @@ where {B_k^i} is the {k}-th bispectrum component of atom {i},
and {beta_k^alpha_i} is the corresponding linear coefficient
that depends on {alpha_i}, the SNAP element of atom {i}. The
number of bispectrum components used and their definitions
depend on the values of {twojmax} and {diagonalstyle}
depend on the value of {twojmax}
defined in the SNAP parameter file described below.
The bispectrum calculation is described in more detail
in "compute sna/atom"_compute_sna_atom.html.
@ -125,14 +125,13 @@ This line is followed by {ncoeff} coefficients, one per line.
The SNAP parameter file can contain blank and comment lines (start
with #) anywhere. Each non-blank non-comment line must contain one
keyword/value pair. The required keywords are {rcutfac} and
{twojmax}. Optional keywords are {rfac0}, {rmin0}, {diagonalstyle},
{twojmax}. Optional keywords are {rfac0}, {rmin0},
{switchflag}, and {bzeroflag}.
The default values for these keywords are
{rfac0} = 0.99363
{rmin0} = 0.0
{diagonalstyle} = 3
{switchflag} = 0
{bzeroflag} = 1
{quadraticflag} = 1 :ul
@ -144,6 +143,9 @@ If {quadraticflag} is set to 1, then the SNAP energy expression includes the qua
The SNAP element file should contain {K}({K}+1)/2 additional coefficients
for each element, the upper-triangular elements of alpha.
NOTE: The previously used {diagonalstyle} keyword was removed in 2019,
since all known SNAP potentials use the default value of 3.
:line
[Mixing, shift, table, tail correction, restart, rRESPA info]:

View File

@ -0,0 +1,89 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Commands_all.html)
:line
pair_style spin/dipole/cut command :h3
pair_style spin/dipole/long command :h3
[Syntax:]
pair_style spin/dipole/cut cutoff
pair_style spin/dipole/long cutoff :pre
cutoff = global cutoff for magnetic dipole energy and forces
(optional) (distance units) :ulb,l
:ule
[Examples:]
pair_style spin/dipole/cut 10.0
pair_coeff * * 10.0
pair_coeff 2 3 8.0 :pre
pair_style spin/dipole/long 9.0
pair_coeff * * 1.0 1.0
pair_coeff 2 3 1.0 1.0 2.5 4.0 scale 0.5
pair_coeff 2 3 1.0 1.0 2.5 4.0 :pre
[Description:]
Style {spin/dipole/cut} computes a short-range dipole-dipole
interaction between pairs of magnetic particles that each
have a magnetic spin.
The magnetic dipole-dipole interactions are computed by the
following formulas for the magnetic energy, magnetic precession
vector omega and mechanical force between particles I and J.
:c,image(Eqs/pair_spin_dipole.jpg)
where si and sj are the spins of the two magnetic particles,
r is their separation distance, and the vector e = (Ri - Rj)/|Ri - Rj|
is the direction vector between the two particles.
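In LaTeX form (copied from the Eqs/pair_spin_dipole.tex source added earlier in this changeset), the magnetic energy reads:

\mathcal{H}_{\rm long} =
-\frac{\mu_{0} \left(\mu_B\right)^2}{4\pi}
\sum_{i,j,i\neq j}^{N}
\frac{g_i g_j}{r_{ij}^3}
\Big(3
\left(\bm{e}_{ij}\cdot \bm{s}_{i}\right)
\left(\bm{e}_{ij}\cdot \bm{s}_{j}\right)
-\bm{s}_i\cdot\bm{s}_j \Big) :pre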
Style {spin/dipole/long} computes long-range magnetic dipole-dipole
interaction.
A "kspace_style"_kspace_style.html must be defined to
use this pair style. Currently, "kspace_style
ewald/dipole/spin"_kspace_style.html and "kspace_style
pppm/dipole/spin"_kspace_style.html support long-range magnetic
dipole-dipole interactions.
:line
The "pair_modify"_pair_modify.html table option is not relevant
for this pair style.
This pair style does not support the "pair_modify"_pair_modify.html
tail option for adding long-range tail corrections to energy and
pressure.
This pair style writes its information to "binary restart
files"_restart.html, so pair_style and pair_coeff commands do not need
to be specified in an input script that reads a restart file.
[Restrictions:]
The {spin/dipole/cut} and {spin/dipole/long} styles are part of
the SPIN package. They are only enabled if LAMMPS was built with that
package. See the "Build package"_Build_package.html doc page for more
info.
Using dipole/spin pair styles with {electron} "units"_units.html is not
currently supported.
[Related commands:]
"pair_coeff"_pair_coeff.html, "kspace_style"_kspace_style.html
"fix nve/spin"_fix_nve_spin.html
[Default:] none
:line
:link(Allen2)
[(Allen)] Allen and Tildesley, Computer Simulation of Liquids,
Clarendon Press, Oxford, 1987.

View File

@ -52,7 +52,7 @@ style = {delete} or {index} or {loop} or {world} or {universe} or {uloop} or {st
sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x),
random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x)
ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z),
stride(x,y,z), stride2(x,y,z,a,b,c),
logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c),
vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)
group functions = count(group), mass(group), charge(group),
xcm(group,dim), vcm(group,dim), fcm(group,dim),
@ -459,8 +459,8 @@ Math functions: sqrt(x), exp(x), ln(x), log(x), abs(x), \
sin(x), cos(x), tan(x), asin(x), acos(x), atan(x), atan2(y,x), \
random(x,y,z), normal(x,y,z), ceil(x), floor(x), round(x), \
ramp(x,y), stagger(x,y), logfreq(x,y,z), logfreq2(x,y,z), \
stride(x,y,z), stride2(x,y,z,a,b,c), vdisplace(x,y), \
swiggle(x,y,z), cwiggle(x,y,z)
logfreq3(x,y,z), stride(x,y,z), stride2(x,y,z,a,b,c), \
vdisplace(x,y), swiggle(x,y,z), cwiggle(x,y,z)
Group functions: count(ID), mass(ID), charge(ID), xcm(ID,dim), \
vcm(ID,dim), fcm(ID,dim), bound(ID,dir), \
gyration(ID), ke(ID), angmom(ID,dim), torque(ID,dim), \
@ -670,6 +670,16 @@ sequence of output timesteps:
100,150,200,...950,1000,1500,2000,...9500,10000,15000,etc :pre
The logfreq3(x,y,z) function generates y points between x and z (inclusive)
that are separated by a multiplicative ratio: (z/x)^(1/(y-1)). Constraints
are: x,z > 0, y > 1, z-x >= y-1. For example, if logfreq3(10,25,1000) is used in
a variable by the "fix print"_fix_print.html command, then the interval
between 10 and 1000 is divided into 24 parts with a multiplicative
separation of ~1.21, and it will generate the following sequence of output
timesteps:
10, 13, 15, 18, 22, 27, 32,...384, 465, 563, 682, 826, 1000 :pre
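The spacing above can be reproduced with a short Python sketch (assuming ceiling rounding to integer timesteps, which matches the example sequence; the internal implementation may differ in detail):

import math

def logfreq3(x, y, z):
    # y points from x to z, constant multiplicative ratio (z/x)^(1/(y-1))
    ratio = (z / x) ** (1.0 / (y - 1))
    # the small epsilon guards against floating-point error at exact endpoints
    return [math.ceil(x * ratio**k - 1e-9) for k in range(y)]

print(logfreq3(10, 25, 1000))
# -> 10, 13, 15, 18, 22, 27, 32, ..., 682, 826, 1000 (abbreviated) :pre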
The stride(x,y,z) function uses the current timestep to generate a new
timestep. X,y >= 0 and z > 0 and x <= y are required. The generated
timesteps increase in increments of z, from x to y, i.e. it generates

View File

@ -1405,6 +1405,7 @@ Lenart
lennard
Lennard
Lenosky
Lenz
Lett
Leuven
Leven

View File

@ -25,16 +25,18 @@ velocity all create 100 4928459 rot yes dist gaussian
#pair_style hybrid/overlay eam/alloy spin/exchange 4.0 spin/neel 4.0
pair_style hybrid/overlay eam/alloy spin/exchange 4.0
pair_coeff * * eam/alloy Co_PurjaPun_2012.eam.alloy Co
pair_coeff * * spin/exchange exchange 4.0 0.3593 1.135028015e-05 1.064568567
pair_coeff * * eam/alloy ../examples/SPIN/cobalt_hcp/Co_PurjaPun_2012.eam.alloy Co
#pair_coeff * * eam/alloy Co_PurjaPun_2012.eam.alloy Co
pair_coeff * * spin/exchange exchange 4.0 -0.3593 1.135028015e-05 1.064568567
#pair_coeff * * spin/neel neel 4.0 0.0048 0.234 1.168 2.6905 0.705 0.652
neighbor 0.1 bin
neigh_modify every 10 check yes delay 20
#fix 1 all precession/spin zeeman 1.0 0.0 0.0 1.0
fix 1 all precession/spin zeeman 0.0 0.0 0.0 1.0
fix 2 all langevin/spin 0.0 0.0 21
fix 1 all precession/spin anisotropy 0.01 0.0 0.0 1.0
#fix 2 all langevin/spin 0.0 0.0 21
fix 2 all langevin/spin 0.0 0.1 21
fix 3 all nve/spin lattice yes
timestep 0.0001

View File

@ -0,0 +1 @@
../iron/Fe_Mishin2006.eam.alloy

View File

@ -0,0 +1,5 @@
2.4824 0.01948336
2.8665 0.01109
4.0538 -0.0002176
4.753 -0.001714
4.965 -0.001986

View File

@ -0,0 +1,32 @@
#Program fitting the exchange interaction
#Model curve: Bethe-Slater function
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
print("Loop begin")
#Definition of the Bethe-Slater function
def func(x,a,b,c):
return 4*a*((x/c)**2)*(1-b*(x/c)**2)*np.exp(-(x/c)**2)
#Exchange coeff table (data to fit)
rdata, Jdata = np.loadtxt('exchange_bcc_iron.dat', usecols=(0,1), unpack=True)
plt.plot(rdata, Jdata, 'b-', label='data')
#Perform the fit
popt, pcov = curve_fit(func, rdata, Jdata, bounds=(0, [500.,5.,5.]))
plt.plot(rdata, func(rdata, *popt), 'r--', label='fit')
#Print the fitted params
print("Parameters: a={:.10} (in meV), b={:.10} (adim), c={:.10} (in Ang)".format(*popt))
#Plotting the result
plt.xlabel('r_ij')
plt.xlim([0,6.5])
plt.ylabel('J_ij')
plt.legend()
plt.show()
print("Loop end")

View File

@ -0,0 +1,19 @@
6 8
Optimal parameter set
1 4.100199340884814 F
2 1.565647547483517 F
1 0.9332056681088162 T 3.000000000000000
2 -1.162558782567700 T 2.866666666666670
3 -0.3502026949249225 T 2.733333333333330
4 0.4287820835430028 T 2.600000000000000
5 4.907925057809273 T 2.400000000000000
6 -5.307049068415304 T 2.300000000000000
1 -0.1960674387419232 F 4.100000000000000
2 0.3687525935422963 F 3.800000000000000
3 -1.505333614924853 F 3.500000000000000
4 4.948907078156191 T 3.200000000000000
5 -4.894613262753399 T 2.900000000000000
6 3.468897724782442 T 2.600000000000000
7 -1.792218099820337 T 2.400000000000000
8 80.22069592246987 T 2.300000000000000

View File

@ -0,0 +1,59 @@
# bcc iron in a 3d periodic box
clear
units metal
atom_style spin
dimension 3
boundary p p p
# necessary for the serial algorithm (sametag)
atom_modify map array
lattice bcc 2.8665
region box block 0.0 5.0 0.0 5.0 0.0 5.0
create_box 1 box
create_atoms 1 box
# setting mass, mag. moments, and interactions for bcc iron
mass 1 55.845
set group all spin 2.2 -1.0 0.0 0.0
velocity all create 100 4928459 rot yes dist gaussian
pair_style hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/cut 8.0
pair_coeff * * eam/alloy Fe_Mishin2006.eam.alloy Fe
pair_coeff * * spin/exchange exchange 3.4 0.02726 0.2171 1.841
pair_coeff * * spin/dipole/cut 8.0
neighbor 0.1 bin
neigh_modify every 10 check yes delay 20
fix 1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
fix_modify 1 energy yes
fix 2 all langevin/spin 0.0 0.0 21
fix 3 all nve/spin lattice yes
timestep 0.0001
# compute and output options
compute out_mag all spin
compute out_pe all pe
compute out_ke all ke
compute out_temp all temp
variable magx equal c_out_mag[1]
variable magy equal c_out_mag[2]
variable magz equal c_out_mag[3]
variable magnorm equal c_out_mag[4]
variable emag equal c_out_mag[5]
variable tmag equal c_out_mag[6]
thermo_style custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
thermo 50
compute outsp all property/atom spx spy spz sp fmx fmy fmz
dump 1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
run 2000

View File

@ -0,0 +1,61 @@
# bcc iron in a 3d periodic box
clear
units metal
atom_style spin
dimension 3
boundary p p p
# necessary for the serial algorithm (sametag)
atom_modify map array
lattice bcc 2.8665
region box block 0.0 5.0 0.0 5.0 0.0 5.0
create_box 1 box
create_atoms 1 box
# setting mass, mag. moments, and interactions for bcc iron
mass 1 55.845
set group all spin 2.2 -1.0 0.0 0.0
velocity all create 100 4928459 rot yes dist gaussian
pair_style hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/long 8.0
pair_coeff * * eam/alloy Fe_Mishin2006.eam.alloy Fe
pair_coeff * * spin/exchange exchange 3.4 0.02726 0.2171 1.841
pair_coeff * * spin/dipole/long 8.0
neighbor 0.1 bin
neigh_modify every 10 check yes delay 20
kspace_style ewald/dipole/spin 1.0e-4
fix 1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
fix_modify 1 energy yes
fix 2 all langevin/spin 0.0 0.0 21
fix 3 all nve/spin lattice yes
timestep 0.0001
# compute and output options
compute out_mag all spin
compute out_pe all pe
compute out_ke all ke
compute out_temp all temp
variable magx equal c_out_mag[1]
variable magy equal c_out_mag[2]
variable magz equal c_out_mag[3]
variable magnorm equal c_out_mag[4]
variable emag equal c_out_mag[5]
variable tmag equal c_out_mag[6]
thermo_style custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
thermo 50
compute outsp all property/atom spx spy spz sp fmx fmy fmz
dump 1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
run 2000

View File

@ -0,0 +1,62 @@
# bcc iron in a 3d periodic box
clear
units metal
atom_style spin
dimension 3
boundary p p p
# necessary for the serial algorithm (sametag)
atom_modify map array
lattice bcc 2.8665
region box block 0.0 5.0 0.0 5.0 0.0 5.0
create_box 1 box
create_atoms 1 box
# setting mass, mag. moments, and interactions for bcc iron
mass 1 55.845
set group all spin 2.2 -1.0 0.0 0.0
velocity all create 100 4928459 rot yes dist gaussian
pair_style hybrid/overlay eam/alloy spin/exchange 3.5 spin/dipole/long 8.0
pair_coeff * * eam/alloy Fe_Mishin2006.eam.alloy Fe
pair_coeff * * spin/exchange exchange 3.4 0.02726 0.2171 1.841
pair_coeff * * spin/dipole/long 8.0
neighbor 0.1 bin
neigh_modify every 10 check yes delay 20
kspace_style pppm/dipole/spin 1.0e-4
kspace_modify compute yes
fix 1 all precession/spin cubic 0.001 0.0005 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0
fix_modify 1 energy yes
fix 2 all langevin/spin 0.0 0.0 21
fix 3 all nve/spin lattice yes
timestep 0.0001
# compute and output options
compute out_mag all spin
compute out_pe all pe
compute out_ke all ke
compute out_temp all temp
variable magx equal c_out_mag[1]
variable magy equal c_out_mag[2]
variable magz equal c_out_mag[3]
variable magnorm equal c_out_mag[4]
variable emag equal c_out_mag[5]
variable tmag equal c_out_mag[6]
thermo_style custom step time v_magx v_magy v_magz v_magnorm v_tmag v_emag pe etotal
thermo 50
compute outsp all property/atom spx spy spz sp fmx fmy fmz
dump 1 all custom 100 dump_iron.lammpstrj type x y z c_outsp[1] c_outsp[2] c_outsp[3]
run 2000

View File

@ -19,7 +19,8 @@ create_atoms 1 box
mass 1 55.845
set group all spin/random 31 2.2
#set group all spin/random 31 2.2
set group all spin 2.2 0.0 0.0 1.0
velocity all create 100 4928459 rot yes dist gaussian
pair_style hybrid/overlay eam/alloy spin/exchange 3.5

View File

@ -0,0 +1 @@
../../../../potentials/CH.rebo

View File

@ -18,7 +18,7 @@ group adsorbate type 2
######################## Potential definition ########################
pair_style hybrid/overlay rebo kolmogorov/crespi/full 16.0
####################################################################
pair_coeff * * rebo CH.airebo NULL C # chemical
pair_coeff * * rebo CH.rebo NULL C # chemical
pair_coeff * * kolmogorov/crespi/full CC.KC-full C C # long range
####################################################################
# Neighbor update settings

View File

@ -1,4 +1,5 @@
LAMMPS (8 Mar 2018)
LAMMPS (5 Jun 2019)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:88)
using 1 OpenMP thread(s) per MPI task
# Initialization
units metal
@ -21,6 +22,8 @@ read_data Bi_gr_AB_stack_2L_noH_300K.data
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.000353813 secs
read_data CPU = 0.0043292 secs
mass 1 12.0107 # carbon mass (g/mole) | membrane
mass 2 12.0107 # carbon mass (g/mole) | adsorbate
# Separate atom groups
@ -32,8 +35,8 @@ group adsorbate type 2
######################## Potential definition ########################
pair_style hybrid/overlay rebo kolmogorov/crespi/full 16.0
####################################################################
pair_coeff * * rebo CH.airebo NULL C # chemical
Reading potential file CH.airebo with DATE: 2011-10-25
pair_coeff * * rebo CH.rebo NULL C # chemical
Reading potential file CH.rebo with DATE: 2018-7-3
pair_coeff * * kolmogorov/crespi/full CC.KC-full C C # long range
####################################################################
# Neighbor update settings
@ -92,32 +95,32 @@ Neighbor list info ...
bin: standard
Per MPI rank memory allocation (min/avg/max) = 16.96 | 16.96 | 16.96 Mbytes
Step TotEng PotEng KinEng v_REBO v_KC Temp v_adsxcom v_adsycom v_adszcom v_adsvxcom v_adsvycom v_adsvzcom
0 -5025.3867722725 -5040.0767391239 14.6899668514 -5011.2636297759 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108
100 -5025.3962433293 -5041.3829775585 15.9867342292 -5012.5109377234 -28.8720398351 91.0071804888 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069014
200 -5025.3942568861 -5041.9638220670 16.5695651809 -5012.7804299195 -29.1833921475 94.3250439654 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265577
300 -5025.3919463074 -5040.9705419367 15.5785956293 -5012.0510295102 -28.9195124265 88.6837826830 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745908
400 -5025.3965376948 -5041.6929964127 16.2964587179 -5012.6418090677 -29.0511873450 92.7703393702 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704496
500 -5025.4050172900 -5042.1712310053 16.7662137153 -5013.1850218645 -28.9862091408 95.4444989087 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066
600 -5025.3985715734 -5041.2158947893 15.8173232159 -5012.4875319345 -28.7283628548 90.0427797270 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001093
700 -5025.3997561572 -5041.6276721306 16.2279159734 -5012.7093581188 -28.9183140118 92.3801482386 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311564
800 -5025.3967603736 -5042.3401685987 16.9434082251 -5013.3044877099 -29.0356808888 96.4532085367 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492925
900 -5025.3984542801 -5042.2820667481 16.8836124680 -5013.4066841442 -28.8753826039 96.1128111061 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648638
1000 -5025.3988185618 -5041.9160822433 16.5172636815 -5012.8147737982 -29.1013084450 94.0273088606 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119
Loop time of 156.142 on 1 procs for 1000 steps with 1360 atoms
0 -5025.3867727863 -5040.0767396377 14.6899668514 -5011.2636302897 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108
100 -5025.3962438431 -5041.3829780735 15.9867342304 -5012.5109382383 -28.8720398352 91.0071804956 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069015
200 -5025.3942574000 -5041.9638225847 16.5695651847 -5012.7804304371 -29.1833921476 94.3250439874 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265578
300 -5025.3919468212 -5040.9705424499 15.5785956286 -5012.0510300232 -28.9195124266 88.6837826792 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745907
400 -5025.3965382086 -5041.6929969192 16.2964587107 -5012.6418095739 -29.0511873454 92.7703393292 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704497
500 -5025.4050178038 -5042.1712315208 16.7662137170 -5013.1850223792 -28.9862091417 95.4444989189 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066
600 -5025.3985720873 -5041.2158953052 15.8173232179 -5012.4875324499 -28.7283628553 90.0427797386 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001092
700 -5025.3997566711 -5041.6276726420 16.2279159709 -5012.7093586298 -28.9183140122 92.3801482242 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311565
800 -5025.3967608874 -5042.3401691104 16.9434082230 -5013.3044882226 -29.0356808878 96.4532085250 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492926
900 -5025.3984547937 -5042.2820672614 16.8836124676 -5013.4066846579 -28.8753826035 96.1128111040 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648640
1000 -5025.3988190757 -5041.9160827657 16.5172636900 -5012.8147743212 -29.1013084444 94.0273089090 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119
Loop time of 103.724 on 1 procs for 1000 steps with 1360 atoms
Performance: 0.553 ns/day, 43.373 hours/ns, 6.404 timesteps/s
99.6% CPU use with 1 MPI tasks x 1 OpenMP threads
Performance: 0.833 ns/day, 28.812 hours/ns, 9.641 timesteps/s
99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 155.99 | 155.99 | 155.99 | 0.0 | 99.90
Bond | 0.00075769 | 0.00075769 | 0.00075769 | 0.0 | 0.00
Pair | 103.59 | 103.59 | 103.59 | 0.0 | 99.87
Bond | 0.00022388 | 0.00022388 | 0.00022388 | 0.0 | 0.00
Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.084217 | 0.084217 | 0.084217 | 0.0 | 0.05
Output | 0.0016122 | 0.0016122 | 0.0016122 | 0.0 | 0.00
Modify | 0.034797 | 0.034797 | 0.034797 | 0.0 | 0.02
Other | | 0.02838 | | | 0.02
Comm | 0.082476 | 0.082476 | 0.082476 | 0.0 | 0.08
Output | 0.0010884 | 0.0010884 | 0.0010884 | 0.0 | 0.00
Modify | 0.032938 | 0.032938 | 0.032938 | 0.0 | 0.03
Other | | 0.01749 | | | 0.02
Nlocal: 1360 ave 1360 max 1360 min
Histogram: 1 0 0 0 0 0 0 0 0 0
@ -133,4 +136,4 @@ Ave neighs/atom = 195.004
Ave special neighs/atom = 0
Neighbor list builds = 0
Dangerous builds = 0
Total wall time: 0:02:36
Total wall time: 0:01:43

View File

@ -1,4 +1,5 @@
LAMMPS (8 Mar 2018)
LAMMPS (5 Jun 2019)
OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:88)
using 1 OpenMP thread(s) per MPI task
# Initialization
units metal
@ -21,6 +22,8 @@ read_data Bi_gr_AB_stack_2L_noH_300K.data
0 = max # of 1-3 neighbors
0 = max # of 1-4 neighbors
1 = max # of special neighbors
special bonds CPU = 0.000187874 secs
read_data CPU = 0.00234103 secs
mass 1 12.0107 # carbon mass (g/mole) | membrane
mass 2 12.0107 # carbon mass (g/mole) | adsorbate
# Separate atom groups
@ -32,8 +35,8 @@ group adsorbate type 2
######################## Potential definition ########################
pair_style hybrid/overlay rebo kolmogorov/crespi/full 16.0
####################################################################
pair_coeff * * rebo CH.airebo NULL C # chemical
Reading potential file CH.airebo with DATE: 2011-10-25
pair_coeff * * rebo CH.rebo NULL C # chemical
Reading potential file CH.rebo with DATE: 2018-7-3
pair_coeff * * kolmogorov/crespi/full CC.KC-full C C # long range
####################################################################
# Neighbor update settings
@ -92,32 +95,32 @@ Neighbor list info ...
bin: standard
Per MPI rank memory allocation (min/avg/max) = 11.13 | 11.13 | 11.13 Mbytes
Step TotEng PotEng KinEng v_REBO v_KC Temp v_adsxcom v_adsycom v_adszcom v_adsvxcom v_adsvycom v_adsvzcom
0 -5025.3867722725 -5040.0767391239 14.6899668514 -5011.2636297759 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108
100 -5025.3962433293 -5041.3829775585 15.9867342292 -5012.5109377234 -28.8720398351 91.0071804888 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069014
200 -5025.3942568861 -5041.9638220670 16.5695651809 -5012.7804299195 -29.1833921475 94.3250439654 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265577
300 -5025.3919463074 -5040.9705419367 15.5785956293 -5012.0510295103 -28.9195124265 88.6837826830 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745908
400 -5025.3965376948 -5041.6929964127 16.2964587179 -5012.6418090677 -29.0511873450 92.7703393702 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704496
500 -5025.4050172900 -5042.1712310053 16.7662137153 -5013.1850218645 -28.9862091408 95.4444989088 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066
600 -5025.3985715734 -5041.2158947893 15.8173232159 -5012.4875319345 -28.7283628548 90.0427797270 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001093
700 -5025.3997561572 -5041.6276721306 16.2279159734 -5012.7093581188 -28.9183140118 92.3801482386 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311564
800 -5025.3967603736 -5042.3401685987 16.9434082251 -5013.3044877099 -29.0356808888 96.4532085367 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492925
900 -5025.3984542801 -5042.2820667481 16.8836124680 -5013.4066841442 -28.8753826039 96.1128111061 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648638
1000 -5025.3988185618 -5041.9160822433 16.5172636815 -5012.8147737983 -29.1013084450 94.0273088606 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119
Loop time of 42.5422 on 4 procs for 1000 steps with 1360 atoms
0 -5025.3867727863 -5040.0767396377 14.6899668514 -5011.2636302897 -28.8131093480 83.6251135127 22.0155657205 20.2812150219 3.4623630945 0.0282287195 0.0535565745 0.2193320108
100 -5025.3962438431 -5041.3829780735 15.9867342304 -5012.5109382383 -28.8720398352 91.0071804956 22.0181858078 20.2867731676 3.4456714402 0.0241525932 0.0573807336 -0.5235069015
200 -5025.3942574000 -5041.9638225847 16.5695651847 -5012.7804304371 -29.1833921476 94.3250439874 22.0203529515 20.2926376511 3.3740502908 0.0186420748 0.0595018114 -0.7867265578
300 -5025.3919468212 -5040.9705424499 15.5785956286 -5012.0510300232 -28.9195124266 88.6837826792 22.0218424095 20.2984380400 3.3199036613 0.0106250874 0.0544668352 -0.1513745907
400 -5025.3965382086 -5041.6929969192 16.2964587107 -5012.6418095739 -29.0511873454 92.7703393291 22.0224243957 20.3034636122 3.3515794172 0.0006844935 0.0458598502 0.6967704497
500 -5025.4050178038 -5042.1712315208 16.7662137170 -5013.1850223792 -28.9862091417 95.4444989189 22.0220673443 20.3074634962 3.4286173278 -0.0078273439 0.0340764532 0.6845095066
600 -5025.3985720873 -5041.2158953052 15.8173232179 -5012.4875324499 -28.7283628553 90.0427797386 22.0209262700 20.3103065099 3.4653840648 -0.0141442608 0.0229602847 0.0009001092
700 -5025.3997566711 -5041.6276726420 16.2279159709 -5012.7093586298 -28.9183140122 92.3801482242 22.0191651506 20.3120184840 3.4291788224 -0.0208485646 0.0104216414 -0.6668311565
800 -5025.3967608874 -5042.3401691104 16.9434082230 -5013.3044882226 -29.0356808878 96.4532085250 22.0167259920 20.3122737443 3.3535033285 -0.0279747378 -0.0060833621 -0.7003492926
900 -5025.3984547938 -5042.2820672614 16.8836124676 -5013.4066846579 -28.8753826035 96.1128111040 22.0136711877 20.3107854823 3.3206430872 -0.0331979094 -0.0237440547 0.1335648640
1000 -5025.3988190757 -5041.9160827657 16.5172636900 -5012.8147743212 -29.1013084444 94.0273089090 22.0102627032 20.3075977018 3.3736867454 -0.0340065996 -0.0390649991 0.7872380119
Loop time of 33.7338 on 4 procs for 1000 steps with 1360 atoms
Performance: 2.031 ns/day, 11.817 hours/ns, 23.506 timesteps/s
98.9% CPU use with 4 MPI tasks x 1 OpenMP threads
Performance: 2.561 ns/day, 9.370 hours/ns, 29.644 timesteps/s
94.1% CPU use with 4 MPI tasks x 1 OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 39.928 | 40.992 | 42.377 | 15.8 | 96.36
Bond | 0.0003643 | 0.00043392 | 0.00048113 | 0.0 | 0.00
Pair | 30.833 | 31.356 | 32.18 | 9.1 | 92.95
Bond | 0.00026059 | 0.00029182 | 0.00031185 | 0.0 | 0.00
Neigh | 0 | 0 | 0 | 0.0 | 0.00
Comm | 0.12253 | 1.5076 | 2.5698 | 82.1 | 3.54
Output | 0.0012577 | 0.0013637 | 0.0016453 | 0.4 | 0.00
Modify | 0.010833 | 0.012247 | 0.013317 | 0.9 | 0.03
Other | | 0.02864 | | | 0.07
Comm | 1.443 | 2.2722 | 2.8091 | 34.3 | 6.74
Output | 0.00068855 | 0.00095087 | 0.0017185 | 0.0 | 0.00
Modify | 0.010187 | 0.011709 | 0.015284 | 1.9 | 0.03
Other | | 0.09241 | | | 0.27
Nlocal: 340 ave 344 max 334 min
Histogram: 1 0 0 0 0 0 1 0 1 1
@ -133,4 +136,4 @@ Ave neighs/atom = 195.004
Ave special neighs/atom = 0
Neighbor list builds = 0
Dangerous builds = 0
Total wall time: 0:00:42
Total wall time: 0:00:33

View File

@ -64,9 +64,12 @@ int BaseAtomicT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;
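// Editor's note: the five GPU base classes that follow (charge, dipole, DPD,
// ellipsoid, three-body) receive the same split as this file. Condensed, the
// old single device->init() call that set up both atom and neighbor storage
// becomes two phases (a sketch paraphrasing the diffs; the charge/rot flags
// and pre_cut value vary per class):
int success = device->init(*ans, /*charge=*/false, /*rot=*/false,
                           nlocal, nall, maxspecial);          // atom storage
if (success != 0) return success;
success = device->init_nbor(nbor, nlocal, host_nlocal, nall, maxspecial,
                            _gpu_host, max_nbors, cell_size,
                            /*pre_cut=*/false, _threads_per_atom); // nbor storage
if (success != 0) return success;
// Keeping the phases separate lets hybrid sub-styles share a single atom
// initialization while each registers its own neighbor build, consistent
// with the pair-hybrid restriction removed from fix_gpu.cpp further down.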

View File

@ -65,9 +65,12 @@ int BaseChargeT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,true,false,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -66,9 +66,12 @@ int BaseDipoleT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,true,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,true,true,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -65,9 +65,13 @@ int BaseDPDT::init_atomic(const int nlocal, const int nall,
} else
_nbor_data=&(nbor->dev_nbor);
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom,true);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial,true);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -71,12 +71,15 @@ int BaseEllipsoidT::init_base(const int nlocal, const int nall,
_threads_per_atom=device->threads_per_atom();
int success=device->init(*ans,false,true,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,true,
1);
int success=device->init(*ans,false,true,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,true,1);
if (success!=0)
return success;
ucl_device=device->gpu;
atom=&device->atom;

View File

@ -78,9 +78,12 @@ int BaseThreeT::init_three(const int nlocal, const int nall,
if (_threads_per_atom*_threads_per_atom>device->warp_size())
return -10;
int success=device->init(*ans,false,false,nlocal,host_nlocal,nall,nbor,
maxspecial,_gpu_host,max_nbors,cell_size,false,
_threads_per_atom);
int success=device->init(*ans,false,false,nlocal,nall,maxspecial);
if (success!=0)
return success;
success = device->init_nbor(nbor,nlocal,host_nlocal,nall,maxspecial,_gpu_host,
max_nbors,cell_size,false,_threads_per_atom);
if (success!=0)
return success;

View File

@ -246,11 +246,8 @@ int DeviceT::set_ocl_params(char *ocl_vendor) {
template <class numtyp, class acctyp>
int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
const bool rot, const int nlocal,
const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial,
const int gpu_host, const int max_nbors,
const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel) {
const int nall, const int maxspecial,
const bool vel) {
if (!_device_init)
return -1;
if (sizeof(acctyp)==sizeof(double) && gpu->double_precision()==false)
@ -301,16 +298,6 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const bool charge,
if (!ans.init(ef_nlocal,charge,rot,*gpu))
return -3;
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut, _block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);
_init_count++;
return 0;
}
@ -338,6 +325,39 @@ int DeviceT::init(Answer<numtyp,acctyp> &ans, const int nlocal,
return 0;
}
template <class numtyp, class acctyp>
int DeviceT::init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut, const int threads_per_atom) {
int ef_nlocal=nlocal;
if (_particle_split<1.0 && _particle_split>0.0)
ef_nlocal=static_cast<int>(_particle_split*nlocal);
int gpu_nbor=0;
if (_gpu_mode==Device<numtyp,acctyp>::GPU_NEIGH)
gpu_nbor=1;
else if (_gpu_mode==Device<numtyp,acctyp>::GPU_HYB_NEIGH)
gpu_nbor=2;
#ifndef USE_CUDPP
if (gpu_nbor==1)
gpu_nbor=2;
#endif
if (!nbor->init(&_neighbor_shared,ef_nlocal,host_nlocal,max_nbors,maxspecial,
*gpu,gpu_nbor,gpu_host,pre_cut,_block_cell_2d,
_block_cell_id, _block_nbor_build, threads_per_atom,
_warp_size, _time_device, compile_string()))
return -3;
if (_cell_size<0.0)
nbor->cell_size(cell_size,cell_size);
else
nbor->cell_size(_cell_size,cell_size);
return 0;
}
template <class numtyp, class acctyp>
void DeviceT::set_single_precompute
(PPPM<numtyp,acctyp,float,_lgpu_float4> *pppm) {
@ -614,7 +634,7 @@ void DeviceT::output_kspace_times(UCL_Timer &time_in,
if (screen && times[6]>0.0) {
fprintf(screen,"\n\n-------------------------------------");
fprintf(screen,"--------------------------------\n");
fprintf(screen," Device Time Info (average): ");
fprintf(screen," Device Time Info (average) for kspace: ");
fprintf(screen,"\n-------------------------------------");
fprintf(screen,"--------------------------------\n");

View File

@ -53,11 +53,43 @@ class Device {
const int t_per_atom, const double cell_size,
char *vendor_string, const int block_pair);
/// Initialize the device for Atom and Neighbor storage
/** \param rot True if quaternions need to be stored
/// Initialize the device for Atom storage
/** \param charge True if charges need to be stored
* \param rot True if quaternions need to be stored
* \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param vel True if velocities need to be stored
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const bool charge, const bool rot,
const int nlocal, const int nall, const int maxspecial,
const bool vel=false);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
/// Initialize the neighbor list storage
/** \param nbor Neighbor storage object to set up
* \param nlocal Total number of local particles to allocate memory for
* \param host_nlocal Initial number of host particles to allocate memory for
* \param nall Total number of local+ghost particles
* \param maxspecial Maximum number of special bonded atoms per atom
* \param gpu_host 0 if host will not perform force calculations,
* 1 if gpu_nbor is true, and host needs a half nbor list,
* 2 if gpu_nbor is true, and host needs a full nbor list
@ -73,23 +105,11 @@ class Device {
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &a, const bool charge, const bool rot,
const int nlocal, const int host_nlocal, const int nall,
Neighbor *nbor, const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size, const bool pre_cut,
const int threads_per_atom, const bool vel=false);
/// Initialize the device for Atom storage only
/** \param nlocal Total number of local particles to allocate memory for
* \param nall Total number of local+ghost particles
*
* Returns:
* - 0 if successful
* - -1 if fix gpu not found
* - -3 if there is an out of memory error
* - -4 if the GPU library was not compiled for GPU
* - -5 Double precision is not supported on card **/
int init(Answer<numtyp,acctyp> &ans, const int nlocal, const int nall);
int init_nbor(Neighbor *nbor, const int nlocal,
const int host_nlocal, const int nall,
const int maxspecial, const int gpu_host,
const int max_nbors, const double cell_size,
const bool pre_cut, const int threads_per_atom);
/// Output a message for pair_style acceleration with device stats
void init_message(FILE *screen, const char *name,
@ -173,7 +193,7 @@ class Device {
/// Return host memory usage in bytes
double host_memory_usage() const;
/// Return the number of procs sharing a device (size of device commincator)
/// Return the number of procs sharing a device (size of device communicator)
inline int procs_per_gpu() const { return _procs_per_gpu; }
/// Return the number of threads per proc
inline int num_threads() const { return _nthreads; }
@ -260,12 +280,12 @@ class Device {
/// Atom Data
Atom<numtyp,acctyp> atom;
// --------------------------- NBOR DATA ----------------------------
// --------------------------- NBOR SHARED KERNELS ----------------
/// Neighbor Data
/// Shared kernels for neighbor lists
NeighborShared _neighbor_shared;
// ------------------------ LONG RANGE DATA -------------------------
// ------------------------ LONG RANGE DATA -----------------------
// Long Range Data
int _long_range_precompute;

View File

@ -10,6 +10,5 @@ twojmax 6
rfac0 0.99363
rmin0 0
diagonalstyle 3
bzeroflag 0
quadraticflag 0

View File

@ -8,6 +8,5 @@ twojmax 8
rfac0 0.99363
rmin0 0
diagonalstyle 3
bzeroflag 0
quadraticflag 0

src/.gitignore
View File

@ -167,6 +167,10 @@
/pair_spin.h
/pair_spin_dmi.cpp
/pair_spin_dmi.h
/pair_spin_dipole_cut.cpp
/pair_spin_dipole_cut.h
/pair_spin_dipole_long.cpp
/pair_spin_dipole_long.h
/pair_spin_exchange.cpp
/pair_spin_exchange.h
/pair_spin_magelec.cpp
@ -428,6 +432,10 @@
/ewald.h
/ewald_cg.cpp
/ewald_cg.h
/ewald_dipole.cpp
/ewald_dipole.h
/ewald_dipole_spin.cpp
/ewald_dipole_spin.h
/ewald_disp.cpp
/ewald_disp.h
/ewald_n.cpp
@ -1027,6 +1035,10 @@
/pppm.h
/pppm_cg.cpp
/pppm_cg.h
/pppm_dipole.cpp
/pppm_dipole.h
/pppm_dipole_spin.cpp
/pppm_dipole_spin.h
/pppm_disp.cpp
/pppm_disp.h
/pppm_disp_tip4p.cpp

View File

@ -2,12 +2,10 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
@ -19,7 +17,12 @@
#include "atom.h"
#include "comm.h"
#include "force.h"
#include "neighbor.h"
#include "neigh_list.h"
#include "neigh_request.h"
#include "update.h"
#include "integrate.h"
#include "respa.h"
#include "math_const.h"
#include "memory.h"
#include "error.h"
@ -31,6 +34,7 @@ using namespace MathConst;
PairLJClass2::PairLJClass2(LAMMPS *lmp) : Pair(lmp)
{
respa_enable = 1;
writedata = 1;
}
@ -133,6 +137,270 @@ void PairLJClass2::compute(int eflag, int vflag)
if (vflag_fdotr) virial_fdotr_compute();
}
/* ---------------------------------------------------------------------- */
void PairLJClass2::compute_inner()
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
double rsq,rinv,r2inv,r3inv,r6inv,forcelj,factor_lj,rsw;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
double *special_lj = force->special_lj;
int newton_pair = force->newton_pair;
inum = list->inum_inner;
ilist = list->ilist_inner;
numneigh = list->numneigh_inner;
firstneigh = list->firstneigh_inner;
double cut_out_on = cut_respa[0];
double cut_out_off = cut_respa[1];
double cut_out_diff = cut_out_off - cut_out_on;
double cut_out_on_sq = cut_out_on*cut_out_on;
double cut_out_off_sq = cut_out_off*cut_out_off;
// loop over neighbors of my atoms
for (ii = 0; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
factor_lj = special_lj[sbmask(j)];
j &= NEIGHMASK;
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
if (rsq < cut_out_off_sq) {
r2inv = 1.0/rsq;
rinv = sqrt(r2inv);
r3inv = r2inv*rinv;
r6inv = r3inv*r3inv;
jtype = type[j];
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
if (rsq > cut_out_on_sq) {
rsw = (sqrt(rsq) - cut_out_on)/cut_out_diff;
fpair *= 1.0 - rsw*rsw*(3.0 - 2.0*rsw);
}
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (newton_pair || j < nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
}
}
}
}
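// Editor's note: a minimal standalone sketch (hypothetical helper, not part
// of the patch) of the cubic switching polynomial used by compute_inner(),
// compute_middle() and compute_outer(); the form
// 1.0 + rsw*rsw*(2.0*rsw - 3.0) in compute_middle() is the same polynomial
// rearranged.
static double respa_switch_down(double r, double cut_on, double cut_off)
{
  if (r <= cut_on) return 1.0;            // force fully kept at this level
  if (r >= cut_off) return 0.0;           // force fully handed to next level
  double x = (r - cut_on) / (cut_off - cut_on);
  return 1.0 - x*x*(3.0 - 2.0*x);         // C1-continuous ramp from 1 to 0
}
// The complementary ramp-up applied at inner boundaries is x*x*(3.0 - 2.0*x).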
/* ---------------------------------------------------------------------- */
void PairLJClass2::compute_middle()
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
double rsq,rinv,r2inv,r3inv,r6inv,forcelj,factor_lj,rsw;
int *ilist,*jlist,*numneigh,**firstneigh;
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
double *special_lj = force->special_lj;
int newton_pair = force->newton_pair;
inum = list->inum_middle;
ilist = list->ilist_middle;
numneigh = list->numneigh_middle;
firstneigh = list->firstneigh_middle;
double cut_in_off = cut_respa[0];
double cut_in_on = cut_respa[1];
double cut_out_on = cut_respa[2];
double cut_out_off = cut_respa[3];
double cut_in_diff = cut_in_on - cut_in_off;
double cut_out_diff = cut_out_off - cut_out_on;
double cut_in_off_sq = cut_in_off*cut_in_off;
double cut_in_on_sq = cut_in_on*cut_in_on;
double cut_out_on_sq = cut_out_on*cut_out_on;
double cut_out_off_sq = cut_out_off*cut_out_off;
// loop over neighbors of my atoms
for (ii = 0; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
factor_lj = special_lj[sbmask(j)];
j &= NEIGHMASK;
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
if (rsq < cut_out_off_sq && rsq > cut_in_off_sq) {
r2inv = 1.0/rsq;
rinv = sqrt(r2inv);
r3inv = r2inv*rinv;
r6inv = r3inv*r3inv;
jtype = type[j];
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
if (rsq < cut_in_on_sq) {
rsw = (sqrt(rsq) - cut_in_off)/cut_in_diff;
fpair *= rsw*rsw*(3.0 - 2.0*rsw);
}
if (rsq > cut_out_on_sq) {
rsw = (sqrt(rsq) - cut_out_on)/cut_out_diff;
fpair *= 1.0 + rsw*rsw*(2.0*rsw - 3.0);
}
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (newton_pair || j < nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
}
}
}
}
/* ---------------------------------------------------------------------- */
void PairLJClass2::compute_outer(int eflag, int vflag)
{
int i,j,ii,jj,inum,jnum,itype,jtype;
double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,fpair;
double rsq,rinv,r2inv,r3inv,r6inv,forcelj,factor_lj,rsw;
int *ilist,*jlist,*numneigh,**firstneigh;
evdwl = 0.0;
ev_init(eflag,vflag);
double **x = atom->x;
double **f = atom->f;
int *type = atom->type;
int nlocal = atom->nlocal;
double *special_lj = force->special_lj;
int newton_pair = force->newton_pair;
inum = list->inum;
ilist = list->ilist;
numneigh = list->numneigh;
firstneigh = list->firstneigh;
double cut_in_off = cut_respa[2];
double cut_in_on = cut_respa[3];
double cut_in_diff = cut_in_on - cut_in_off;
double cut_in_off_sq = cut_in_off*cut_in_off;
double cut_in_on_sq = cut_in_on*cut_in_on;
// loop over neighbors of my atoms
for (ii = 0; ii < inum; ii++) {
i = ilist[ii];
xtmp = x[i][0];
ytmp = x[i][1];
ztmp = x[i][2];
itype = type[i];
jlist = firstneigh[i];
jnum = numneigh[i];
for (jj = 0; jj < jnum; jj++) {
j = jlist[jj];
factor_lj = special_lj[sbmask(j)];
j &= NEIGHMASK;
delx = xtmp - x[j][0];
dely = ytmp - x[j][1];
delz = ztmp - x[j][2];
rsq = delx*delx + dely*dely + delz*delz;
jtype = type[j];
if (rsq < cutsq[itype][jtype]) {
if (rsq > cut_in_off_sq) {
r2inv = 1.0/rsq;
rinv = sqrt(r2inv);
r3inv = r2inv*rinv;
r6inv = r3inv*r3inv;
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
if (rsq < cut_in_on_sq) {
rsw = (sqrt(rsq) - cut_in_off)/cut_in_diff;
fpair *= rsw*rsw*(3.0 - 2.0*rsw);
}
f[i][0] += delx*fpair;
f[i][1] += dely*fpair;
f[i][2] += delz*fpair;
if (newton_pair || j < nlocal) {
f[j][0] -= delx*fpair;
f[j][1] -= dely*fpair;
f[j][2] -= delz*fpair;
}
}
if (eflag) {
r2inv = 1.0/rsq;
rinv = sqrt(r2inv);
r3inv = r2inv*rinv;
r6inv = r3inv*r3inv;
evdwl = r6inv*(lj3[itype][jtype]*r3inv-lj4[itype][jtype]) -
offset[itype][jtype];
evdwl *= factor_lj;
}
if (vflag) {
if (rsq <= cut_in_off_sq) {
r2inv = 1.0/rsq;
rinv = sqrt(r2inv);
r3inv = r2inv*rinv;
r6inv = r3inv*r3inv;
forcelj = r6inv * (lj1[itype][jtype]*r3inv - lj2[itype][jtype]);
fpair = factor_lj*forcelj*r2inv;
} else if (rsq < cut_in_on_sq)
fpair = factor_lj*forcelj*r2inv;
}
if (evflag) ev_tally(i,j,nlocal,newton_pair,
evdwl,0.0,fpair,delx,dely,delz);
}
}
}
}
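// Editor's note: how the four cut_respa[] entries map onto the three methods
// above (indices inferred from the code; the array comes from Respa::cutoff
// via init_style() below):
//   cut_respa[0]..cut_respa[1] : inner<->middle handoff window;
//     compute_inner() ramps down across it, compute_middle() ramps up
//   cut_respa[2]..cut_respa[3] : middle<->outer handoff window;
//     compute_middle() ramps down across it, compute_outer() ramps up
// init_one() below additionally requires every pair cutoff to be at least
// cut_respa[3].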
/* ----------------------------------------------------------------------
allocate all arrays
------------------------------------------------------------------------- */
@ -212,6 +480,38 @@ void PairLJClass2::coeff(int narg, char **arg)
if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
}
/* ----------------------------------------------------------------------
init specific to this pair style
------------------------------------------------------------------------- */
void PairLJClass2::init_style()
{
// request regular or rRESPA neighbor list
int irequest;
int respa = 0;
if (update->whichflag == 1 && strstr(update->integrate_style,"respa")) {
if (((Respa *) update->integrate)->level_inner >= 0) respa = 1;
if (((Respa *) update->integrate)->level_middle >= 0) respa = 2;
}
irequest = neighbor->request(this,instance_me);
if (respa >= 1) {
neighbor->requests[irequest]->respaouter = 1;
neighbor->requests[irequest]->respainner = 1;
}
if (respa == 2) neighbor->requests[irequest]->respamiddle = 1;
// set rRESPA cutoffs
if (strstr(update->integrate_style,"respa") &&
((Respa *) update->integrate)->level_inner >= 0)
cut_respa = ((Respa *) update->integrate)->cutoff;
else cut_respa = NULL;
}
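// Editor's note: the neighbor-request logic above reduces to a small
// decision table (a paraphrase, not patch code):
//   respa == 0 : plain run           -> one regular neighbor list
//   respa == 1 : rRESPA inner level  -> respainner + respaouter sub-lists
//   respa == 2 : plus a middle level -> respamiddle as well
// cut_respa then points at the four Respa switching radii, or stays NULL for
// plain runs, which is how init_one() decides whether to validate cutoffs
// against cut_respa[3].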
/* ----------------------------------------------------------------------
init for one type pair i,j and corresponding j,i
------------------------------------------------------------------------- */
@ -246,6 +546,11 @@ double PairLJClass2::init_one(int i, int j)
lj4[j][i] = lj4[i][j];
offset[j][i] = offset[i][j];
// check interior rRESPA cutoff
if (cut_respa && cut[i][j] < cut_respa[3])
error->all(FLERR,"Pair cutoff < Respa interior cutoff");
// compute I,J contribution to long-range tail correction
// count total # of atoms of type I and J via Allreduce

View File

@ -2,12 +2,10 @@
LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
http://lammps.sandia.gov, Sandia National Laboratories
Steve Plimpton, sjplimp@sandia.gov
Copyright (2003) Sandia Corporation. Under the terms of Contract
DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
certain rights in this software. This software is distributed under
the GNU General Public License.
See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */
@ -31,6 +29,7 @@ class PairLJClass2 : public Pair {
virtual void compute(int, int);
virtual void settings(int, char **);
void coeff(int, char **);
void init_style();
virtual double init_one(int, int);
void write_restart(FILE *);
void read_restart(FILE *);
@ -41,11 +40,16 @@ class PairLJClass2 : public Pair {
double single(int, int, int, int, double, double, double, double &);
void *extract(const char *, int &);
void compute_inner();
void compute_middle();
void compute_outer(int, int);
protected:
double cut_global;
double **cut;
double **epsilon,**sigma;
double **lj1,**lj2,**lj3,**lj4,**offset;
double *cut_respa;
virtual void allocate();
};
@ -56,15 +60,13 @@ class PairLJClass2 : public Pair {
#endif
/* ERROR/WARNING messages:
E: Illegal ... command
Self-explanatory. Check the input script syntax and compare to the
documentation for the command. You can use -echo screen as a
command-line option when running LAMMPS to see the offending line.
E: Incorrect args for pair coefficients
Self-explanatory. Check the input script or data file.
E: Pair cutoff < Respa interior cutoff
One or more pairwise cutoffs are too short to use with the specified
rRESPA cutoffs.
*/

View File

@ -44,7 +44,7 @@ using namespace MathConst;
PairLJCutDipoleLong::PairLJCutDipoleLong(LAMMPS *lmp) : Pair(lmp)
{
single_enable = 0;
ewaldflag = dipoleflag = 1;
ewaldflag = pppmflag = dipoleflag = 1;
respa_enable = 0;
}

View File

@ -1,9 +1,8 @@
This package implements GPU optimizations of various LAMMPS styles.
Section 5.3.1 on the manual gives details of what hardware and Cuda
Section 3.7 of the manual gives details of what hardware and Cuda
software is required on your system, and full details on how to build
and use this package. See the KOKKOS package, which also has
GPU-enabled styles.
and use this package. The KOKKOS package also has GPU-enabled styles.
This package uses an external library provided in lib/gpu which must
be compiled before making LAMMPS. See the lib/gpu/README file and the

View File

@ -219,17 +219,6 @@ void FixGPU::init()
error->all(FLERR,"GPU package does not (yet) work with "
"atom_style template");
// hybrid cannot be used with force/neigh option
if (_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH)
if (force->pair_match("^hybrid",0) != NULL)
error->all(FLERR,"Cannot use pair hybrid with GPU neighbor list builds");
if (_particle_split < 0)
if (force->pair_match("^hybrid",0) != NULL)
error->all(FLERR,"GPU split param must be positive "
"for hybrid pair styles");
// neighbor list builds on the GPU with triclinic box is not yet supported
if ((_gpu_mode == GPU_NEIGH || _gpu_mode == GPU_HYB_NEIGH) &&

View File

@ -65,14 +65,6 @@ E: GPU package does not (yet) work with atom_style template
Self-explanatory.
E: Cannot use pair hybrid with GPU neighbor list builds
Neighbor list builds must be done on the CPU for this pair style.
E: GPU split param must be positive for hybrid pair styles
See the package gpu command.
E: Cannot use package gpu neigh yes with triclinic box
This is a current restriction in LAMMPS.

View File

@ -22,6 +22,7 @@
#include "memory_kokkos.h"
#include "error.h"
#include "kokkos.h"
#include "atom_masks.h"
using namespace LAMMPS_NS;
@ -270,8 +271,10 @@ int AtomKokkos::add_custom(const char *name, int flag)
int n = strlen(name) + 1;
dname[index] = new char[n];
strcpy(dname[index],name);
this->sync(Device,DVECTOR_MASK);
memoryKK->grow_kokkos(k_dvector,dvector,ndvector,nmax,
"atom:dvector");
this->modified(Device,DVECTOR_MASK);
}
return index;
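// Editor's note: this hunk shows the dual-view discipline applied throughout
// the Kokkos changes that follow: sync the execution space you are about to
// touch, do the work there, then mark that space modified so the other copy
// is refreshed on its next sync. As above:
//   atomKK->sync(Device, DVECTOR_MASK);      // device copy made current
//   /* grow / write device-side data */
//   atomKK->modified(Device, DVECTOR_MASK);  // host copy now flagged stale
// The related mechanical change in these files replaces bare sync()/
// modified() calls with explicit atomKK->sync()/atomKK->modified().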

View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -59,14 +59,15 @@ AtomVecAngleKokkos::AtomVecAngleKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecAngleKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -98,7 +99,7 @@ void AtomVecAngleKokkos::grow(int n)
"atom:angle_atom3");
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
if (atom->nextra_grow)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -282,7 +283,7 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
atomKK->sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAngleKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
@ -309,7 +310,7 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
}
}
} else {
sync(Device,X_MASK);
atomKK->sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAngleKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
@ -397,8 +398,8 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
const int nfirst, const int &pbc_flag,
const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
atomKK->sync(Host,X_MASK);
atomKK->modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAngleKokkos_PackCommSelf<LMPHostType,1,1>
@ -429,8 +430,8 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
atomKK->sync(Device,X_MASK);
atomKK->modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecAngleKokkos_PackCommSelf<LMPDeviceType,1,1>
@ -493,13 +494,13 @@ struct AtomVecAngleKokkos_UnpackComm {
void AtomVecAngleKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
atomKK->sync(Host,X_MASK);
atomKK->modified(Host,X_MASK);
struct AtomVecAngleKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
atomKK->sync(Device,X_MASK);
atomKK->modified(Device,X_MASK);
struct AtomVecAngleKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
@ -642,7 +643,7 @@ void AtomVecAngleKokkos::unpack_comm_vel(int n, int first, double *buf)
int AtomVecAngleKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
atomKK->sync(Host,F_MASK);
int m = 0;
const int last = first + n;
@ -659,7 +660,7 @@ int AtomVecAngleKokkos::pack_reverse(int n, int first, double *buf)
void AtomVecAngleKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0)
modified(Host,F_MASK);
atomKK->modified(Host,F_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
@ -960,9 +961,9 @@ struct AtomVecAngleKokkos_UnpackBorder {
void AtomVecAngleKokkos::unpack_border_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf,
ExecutionSpace space) {
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
while (first+n >= nmax) grow(0);
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
if(space==Host) {
struct AtomVecAngleKokkos_UnpackBorder<LMPHostType>
f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
@ -984,7 +985,7 @@ void AtomVecAngleKokkos::unpack_border(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -1010,7 +1011,7 @@ void AtomVecAngleKokkos::unpack_border_vel(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -1412,7 +1413,7 @@ int AtomVecAngleKokkos::unpack_exchange(double *buf)
{
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | SPECIAL_MASK);
@ -1487,7 +1488,7 @@ int AtomVecAngleKokkos::size_restart()
int AtomVecAngleKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | SPECIAL_MASK);
@ -1541,7 +1542,7 @@ int AtomVecAngleKokkos::unpack_restart(double *buf)
if (atom->nextra_store)
memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
}
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | SPECIAL_MASK);
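// Editor's note: the DELTA change at the top of this file (10000 -> 10,
// combined with step = MAX(DELTA, nmax*0.01)) recurs in every AtomVec*Kokkos
// grow() method below. A minimal sketch (hypothetical helper, not patch
// code) of the new growth policy:
static int next_nmax(int nmax, int delta = 10)
{
  int step = (nmax / 100 > delta) ? nmax / 100 : delta; // MAX(DELTA, 1% of nmax)
  return nmax + step;
}
// Small systems now grow by as few as 10 slots instead of jumping by 10000,
// while large systems still grow by about 1% so reallocation stays amortized.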

View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -55,14 +55,15 @@ AtomVecAtomicKokkos::AtomVecAtomicKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecAtomicKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -74,7 +75,7 @@ void AtomVecAtomicKokkos::grow(int n)
memoryKK->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
if (atom->nextra_grow)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -393,9 +394,9 @@ struct AtomVecAtomicKokkos_UnpackBorder {
void AtomVecAtomicKokkos::unpack_border_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
while (first+n >= nmax) grow(0);
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
if(space==Host) {
struct AtomVecAtomicKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,first);
Kokkos::parallel_for(n,f);
@ -415,7 +416,7 @@ void AtomVecAtomicKokkos::unpack_border(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -440,7 +441,7 @@ void AtomVecAtomicKokkos::unpack_border_vel(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -668,7 +669,7 @@ int AtomVecAtomicKokkos::unpack_exchange(double *buf)
{
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK);
int m = 1;
@ -720,7 +721,7 @@ int AtomVecAtomicKokkos::size_restart()
int AtomVecAtomicKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK );
int m = 1;
@ -755,7 +756,7 @@ int AtomVecAtomicKokkos::unpack_restart(double *buf)
if (atom->nextra_store)
memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
}
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK );
int m = 1;

View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -58,14 +58,15 @@ AtomVecBondKokkos::AtomVecBondKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecBondKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -84,7 +85,7 @@ void AtomVecBondKokkos::grow(int n)
memoryKK->grow_kokkos(atomKK->k_bond_atom,atomKK->bond_atom,nmax,atomKK->bond_per_atom,"atom:bond_atom");
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
if (atom->nextra_grow)
for (int iextra = 0; iextra < atomKK->nextra_grow; iextra++)
@ -468,9 +469,9 @@ struct AtomVecBondKokkos_UnpackBorder {
void AtomVecBondKokkos::unpack_border_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf,
ExecutionSpace space) {
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
while (first+n >= nmax) grow(0);
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
if(space==Host) {
struct AtomVecBondKokkos_UnpackBorder<LMPHostType>
f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
@ -492,7 +493,7 @@ void AtomVecBondKokkos::unpack_border(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -518,7 +519,7 @@ void AtomVecBondKokkos::unpack_border_vel(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -866,7 +867,7 @@ int AtomVecBondKokkos::unpack_exchange(double *buf)
{
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK | SPECIAL_MASK);
int k;
@ -934,7 +935,7 @@ int AtomVecBondKokkos::size_restart()
int AtomVecBondKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK | SPECIAL_MASK);
int m = 1;
buf[m++] = h_x(i,0);
@ -978,7 +979,7 @@ int AtomVecBondKokkos::unpack_restart(double *buf)
if (atom->nextra_store)
memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
}
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK | SPECIAL_MASK);
int m = 1;
h_x(nlocal,0) = buf[m++];

View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -58,14 +58,15 @@ AtomVecChargeKokkos::AtomVecChargeKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecChargeKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -79,7 +80,7 @@ void AtomVecChargeKokkos::grow(int n)
memoryKK->grow_kokkos(atomKK->k_q,atomKK->q,nmax,"atom:q");
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
if (atom->nextra_grow)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -494,7 +495,7 @@ void AtomVecChargeKokkos::unpack_border_kokkos(const int &n, const int &first,
f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_q,first);
Kokkos::parallel_for(n,f);
}
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
}
/* ---------------------------------------------------------------------- */
@ -510,7 +511,7 @@ void AtomVecChargeKokkos::unpack_border(int n, int first, double *buf)
if (i == nmax) {
grow(0);
}
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -536,7 +537,7 @@ void AtomVecChargeKokkos::unpack_border_vel(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -797,7 +798,7 @@ int AtomVecChargeKokkos::unpack_exchange(double *buf)
{
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK);
int m = 1;
@ -850,7 +851,7 @@ int AtomVecChargeKokkos::size_restart()
int AtomVecChargeKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK);
int m = 1;
@ -888,7 +889,7 @@ int AtomVecChargeKokkos::unpack_restart(double *buf)
memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
}
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK);
int m = 1;

View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -60,14 +60,15 @@ AtomVecDPDKokkos::AtomVecDPDKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecDPDKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -93,7 +94,7 @@ void AtomVecDPDKokkos::grow(int n)
modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
}
/* ----------------------------------------------------------------------
@ -158,7 +159,7 @@ void AtomVecDPDKokkos::grow_reset()
void AtomVecDPDKokkos::copy(int i, int j, int delflag)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
UCG_MASK | UCGNEW_MASK |
UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
@ -184,7 +185,7 @@ void AtomVecDPDKokkos::copy(int i, int j, int delflag)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
UCG_MASK | UCGNEW_MASK |
UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
@ -268,7 +269,7 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecDPDKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,
@ -303,7 +304,7 @@ int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,
}
}
} else {
sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,
@ -410,8 +411,8 @@ struct AtomVecDPDKokkos_PackCommSelf {
int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
const int nfirst, const int &pbc_flag, const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,
@ -446,8 +447,8 @@ int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list
}
}
} else {
sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,
@ -528,15 +529,15 @@ struct AtomVecDPDKokkos_UnpackComm {
void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
struct AtomVecDPDKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,
atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
struct AtomVecDPDKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,
atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
buf,first);
@ -552,7 +553,7 @@ int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
int i,j,m;
double dx,dy,dz;
sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
m = 0;
if (pbc_flag == 0) {
@ -598,7 +599,7 @@ int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
sync(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->sync(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
m = 0;
if (pbc_flag == 0) {
@ -685,7 +686,7 @@ void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
h_uChem[i] = buf[m++];
}
modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
}
/* ---------------------------------------------------------------------- */
@ -709,7 +710,7 @@ void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
h_uChem[i] = buf[m++];
}
modified(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
atomKK->modified(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
}
/* ---------------------------------------------------------------------- */
@ -717,7 +718,7 @@ void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
atomKK->sync(Host,F_MASK);
int m = 0;
const int last = first + n;
@ -734,8 +735,8 @@ int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
void AtomVecDPDKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0) {
sync(Host,F_MASK);
modified(Host,F_MASK);
atomKK->sync(Host,F_MASK);
atomKK->modified(Host,F_MASK);
}
int m = 0;
@ -819,7 +820,7 @@ int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DA
{
X_FLOAT dx,dy,dz;
sync(space,ALL_MASK);
atomKK->sync(space,ALL_MASK);
if (pbc_flag != 0) {
if (domain->triclinic == 0) {
@ -876,7 +877,7 @@ int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
int i,j,m;
double dx,dy,dz;
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
m = 0;
if (pbc_flag == 0) {
@ -937,7 +938,7 @@ int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
m = 0;
if (pbc_flag == 0) {
@ -1032,7 +1033,7 @@ int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
{
int i,j,m;
sync(Host,DPDTHETA_MASK | UCOND_MASK |
atomKK->sync(Host,DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK);
m = 0;
@ -1052,7 +1053,7 @@ int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
{
int i,j,m;
sync(Host,DPDTHETA_MASK | UCOND_MASK |
atomKK->sync(Host,DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
m = 0;
@ -1127,11 +1128,11 @@ struct AtomVecDPDKokkos_UnpackBorder {
void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
UCG_MASK|UCGNEW_MASK);
while (first+n >= nmax) grow(0);
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
if(space==Host) {
@ -1179,7 +1180,7 @@ void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
m += modify->fix[atom->extra_border[iextra]]->
unpack_border(n,first,&buf[m]);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
}
@ -1217,7 +1218,7 @@ void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
m += modify->fix[atom->extra_border[iextra]]->
unpack_border(n,first,&buf[m]);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
}
@ -1237,7 +1238,7 @@ int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
h_uChem(i) = buf[m++];
}
modified(Host,DPDTHETA_MASK | UCOND_MASK |
atomKK->modified(Host,DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK );
return m;
@ -1260,7 +1261,7 @@ int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
h_uCGnew(i) = buf[m++];
}
modified(Host,DPDTHETA_MASK | UCOND_MASK |
atomKK->modified(Host,DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
return m;
@ -1384,7 +1385,7 @@ int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d
int newsize = nsend*17/k_buf.view<LMPHostType>().extent(1)+1;
k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
}
sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
DVECTOR_MASK);
@ -1402,7 +1403,7 @@ int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d
int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
DVECTOR_MASK);
@ -1518,7 +1519,7 @@ int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nre
k_count.sync<LMPHostType>();
}
modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
DVECTOR_MASK);
@ -1556,7 +1557,7 @@ int AtomVecDPDKokkos::unpack_exchange(double *buf)
m += modify->fix[atom->extra_grow[iextra]]->
unpack_exchange(nlocal,&buf[m]);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
DVECTOR_MASK);
@ -1593,7 +1594,7 @@ int AtomVecDPDKokkos::size_restart()
int AtomVecDPDKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
@ -1658,7 +1659,7 @@ int AtomVecDPDKokkos::unpack_restart(double *buf)
for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
}
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
UCG_MASK | UCGNEW_MASK |
UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
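
Every hunk in this file follows one pattern: the bare sync()/modified() calls inherited by the atom-vec class are rewritten as explicit atomKK->sync()/atomKK->modified() calls, so the host/device dirty-flag bookkeeping is routed through the single AtomKokkos instance. A rough sketch of that mask-based bookkeeping, with illustrative names rather than the real LAMMPS types:

#include <cstdint>
#include <cstdio>

enum Space { Host, Device };
constexpr uint32_t X_MASK = 1u << 0;   // per-atom position array
constexpr uint32_t V_MASK = 1u << 1;   // per-atom velocity array

struct AtomContainer {
  uint32_t host_dirty = 0, device_dirty = 0;

  // record that arrays in 'mask' were written in 'space'
  void modified(Space space, uint32_t mask) {
    (space == Host ? host_dirty : device_dirty) |= mask;
  }
  // make 'space' current: copy any array dirty on the other side
  void sync(Space space, uint32_t mask) {
    uint32_t &other = (space == Host) ? device_dirty : host_dirty;
    if (other & mask) {
      std::printf("transfer mask 0x%x to %s\n", (unsigned)(other & mask),
                  space == Host ? "host" : "device");
      other &= ~mask;
    }
  }
};

int main() {
  AtomContainer atomKK;                      // one shared bookkeeping object
  atomKK.modified(Device, X_MASK | V_MASK);  // a kernel wrote x and v
  atomKK.sync(Host, X_MASK);                 // host read of x: one transfer
  atomKK.sync(Host, X_MASK);                 // already clean: no transfer
}
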

src/KOKKOS/atom_vec_full_kokkos.cpp View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -58,14 +58,15 @@ AtomVecFullKokkos::AtomVecFullKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecFullKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
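
This grow() change, repeated below for the molecular and sphere variants, swaps a flat 10000-slot increment for max(DELTA, 1% of nmax) with DELTA now 10: small systems grow in small steps, large systems grow geometrically. A sketch of the resulting schedule, assuming the same MAX semantics as the macro:

#include <algorithm>
#include <cstdio>

int main() {
  const int DELTA = 10;                 // new floor (was a flat 10000 step)
  int nmax = 0, reallocs = 0;
  const int target = 1000000;           // eventual per-processor atom count
  while (nmax < target) {
    int step = std::max(DELTA, static_cast<int>(nmax * 0.01));
    nmax += step;                       // grow by max(DELTA, 1% of nmax)
    ++reallocs;
  }
  // past nmax = 1000 the growth is geometric (about 1% per realloc), so the
  // number of reallocations stays logarithmic in the final size while the
  // over-allocation is bounded by ~1% instead of up to 10000 unused slots
  std::printf("nmax = %d after %d reallocations\n", nmax, reallocs);
}
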
@ -123,7 +124,7 @@ void AtomVecFullKokkos::grow(int n)
atomKK->improper_per_atom,"atom:improper_atom4");
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
if (atom->nextra_grow)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -608,9 +609,9 @@ struct AtomVecFullKokkos_UnpackBorder {
void AtomVecFullKokkos::unpack_border_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf,
ExecutionSpace space) {
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
while (first+n >= nmax) grow(0);
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
if(space==Host) {
struct AtomVecFullKokkos_UnpackBorder<LMPHostType>
f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_q,h_molecule,first);
@ -632,7 +633,7 @@ void AtomVecFullKokkos::unpack_border(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -659,7 +660,7 @@ void AtomVecFullKokkos::unpack_border_vel(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -1204,7 +1205,7 @@ int AtomVecFullKokkos::unpack_exchange(double *buf)
{
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);
@ -1297,7 +1298,7 @@ int AtomVecFullKokkos::size_restart()
int AtomVecFullKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);
@ -1370,10 +1371,10 @@ int AtomVecFullKokkos::unpack_restart(double *buf)
if (atom->nextra_store)
memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
}
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | Q_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

src/KOKKOS/atom_vec_hybrid_kokkos.cpp View File

@ -307,7 +307,7 @@ int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int
int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
sync(Host,X_MASK);
atomKK->sync(Host,X_MASK);
int i,j,k,m;
double dx,dy,dz;
@ -351,7 +351,7 @@ int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
int AtomVecHybridKokkos::pack_comm_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
sync(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
atomKK->sync(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
int i,j,k,m;
double dx,dy,dz,dvx,dvy,dvz;
@ -463,7 +463,7 @@ void AtomVecHybridKokkos::unpack_comm(int n, int first, double *buf)
h_x(i,2) = buf[m++];
}
modified(Host,X_MASK);
atomKK->modified(Host,X_MASK);
// unpack sub-style contributions as contiguous chunks
@ -500,7 +500,7 @@ void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
}
}
modified(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
atomKK->modified(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
// unpack sub-style contributions as contiguous chunks
@ -512,7 +512,7 @@ void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
int AtomVecHybridKokkos::pack_reverse(int n, int first, double *buf)
{
sync(Host,F_MASK);
atomKK->sync(Host,F_MASK);
int i,k,m,last;
@ -546,7 +546,7 @@ void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
h_f(j,2) += buf[m++];
}
modified(Host,F_MASK);
atomKK->modified(Host,F_MASK);
// unpack sub-style contributions as contiguous chunks
@ -559,7 +559,7 @@ void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
int i,j,k,m;
double dx,dy,dz;
@ -613,7 +613,7 @@ int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
int AtomVecHybridKokkos::pack_border_vel(int n, int *list, double *buf,
int pbc_flag, int *pbc)
{
sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
int i,j,k,m;
double dx,dy,dz,dvx,dvy,dvz;
int omega_flag = atom->omega_flag;
@ -741,7 +741,7 @@ void AtomVecHybridKokkos::unpack_border(int n, int first, double *buf)
h_mask[i] = (int) ubuf(buf[m++]).i;
}
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
// unpack sub-style contributions as contiguous chunks
@ -787,7 +787,7 @@ void AtomVecHybridKokkos::unpack_border_vel(int n, int first, double *buf)
}
}
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
// unpack sub-style contributions as contiguous chunks
@ -969,7 +969,7 @@ void AtomVecHybridKokkos::create_atom(int itype, double *coord)
void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **values)
{
sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
atomKK->sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
@ -1000,7 +1000,7 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
h_angmom(nlocal,2) = 0.0;
}
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
// each sub-style parses sub-style specific values
@ -1017,13 +1017,13 @@ void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **val
void AtomVecHybridKokkos::data_vel(int m, char **values)
{
sync(Host,V_MASK);
atomKK->sync(Host,V_MASK);
h_v(m,0) = atof(values[0]);
h_v(m,1) = atof(values[1]);
h_v(m,2) = atof(values[2]);
modified(Host,V_MASK);
atomKK->modified(Host,V_MASK);
// each sub-style parses sub-style specific values
@ -1038,7 +1038,7 @@ void AtomVecHybridKokkos::data_vel(int m, char **values)
void AtomVecHybridKokkos::pack_data(double **buf)
{
sync(Host,TAG_MASK|TYPE_MASK|X_MASK);
atomKK->sync(Host,TAG_MASK|TYPE_MASK|X_MASK);
int k,m;
@ -1089,7 +1089,7 @@ void AtomVecHybridKokkos::write_data(FILE *fp, int n, double **buf)
void AtomVecHybridKokkos::pack_vel(double **buf)
{
sync(Host,V_MASK);
atomKK->sync(Host,V_MASK);
int k,m;

src/KOKKOS/atom_vec_kokkos.cpp View File

@ -267,6 +267,114 @@ int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, c
return n*3;
}
/* ---------------------------------------------------------------------- */
template<class DeviceType,int TRICLINIC>
struct AtomVecKokkos_PackCommSelfFused {
typedef DeviceType device_type;
typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
typename ArrayTypes<DeviceType>::t_x_array _xw;
typename ArrayTypes<DeviceType>::t_int_2d_const _list;
typename ArrayTypes<DeviceType>::t_int_2d_const _pbc;
typename ArrayTypes<DeviceType>::t_int_1d_const _pbc_flag;
typename ArrayTypes<DeviceType>::t_int_1d_const _firstrecv;
typename ArrayTypes<DeviceType>::t_int_1d_const _sendnum_scan;
typename ArrayTypes<DeviceType>::t_int_1d_const _g2l;
X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
AtomVecKokkos_PackCommSelfFused(
const typename DAT::tdual_x_array &x,
const typename DAT::tdual_int_2d &list,
const typename DAT::tdual_int_2d &pbc,
const typename DAT::tdual_int_1d &pbc_flag,
const typename DAT::tdual_int_1d &firstrecv,
const typename DAT::tdual_int_1d &sendnum_scan,
const typename DAT::tdual_int_1d &g2l,
const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz):
_x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
_list(list.view<DeviceType>()),
_pbc(pbc.view<DeviceType>()),
_pbc_flag(pbc_flag.view<DeviceType>()),
_firstrecv(firstrecv.view<DeviceType>()),
_sendnum_scan(sendnum_scan.view<DeviceType>()),
_g2l(g2l.view<DeviceType>()),
_xprd(xprd),_yprd(yprd),_zprd(zprd),
_xy(xy),_xz(xz),_yz(yz) {};
KOKKOS_INLINE_FUNCTION
void operator() (const int& ii) const {
int iswap = 0;
while (ii >= _sendnum_scan[iswap]) iswap++;
int i = ii;
if (iswap > 0)
i = ii - _sendnum_scan[iswap-1];
const int _nfirst = _firstrecv[iswap];
const int nlocal = _firstrecv[0];
int j = _list(iswap,i);
if (j >= nlocal)
j = _g2l(j-nlocal);
if (_pbc_flag(ii) == 0) {
_xw(i+_nfirst,0) = _x(j,0);
_xw(i+_nfirst,1) = _x(j,1);
_xw(i+_nfirst,2) = _x(j,2);
} else {
if (TRICLINIC == 0) {
_xw(i+_nfirst,0) = _x(j,0) + _pbc(ii,0)*_xprd;
_xw(i+_nfirst,1) = _x(j,1) + _pbc(ii,1)*_yprd;
_xw(i+_nfirst,2) = _x(j,2) + _pbc(ii,2)*_zprd;
} else {
_xw(i+_nfirst,0) = _x(j,0) + _pbc(ii,0)*_xprd + _pbc(ii,5)*_xy + _pbc(ii,4)*_xz;
_xw(i+_nfirst,1) = _x(j,1) + _pbc(ii,1)*_yprd + _pbc(ii,3)*_yz;
_xw(i+_nfirst,2) = _x(j,2) + _pbc(ii,2)*_zprd;
}
}
}
};
/* ---------------------------------------------------------------------- */
int AtomVecKokkos::pack_comm_self_fused(const int &n, const DAT::tdual_int_2d &list, const DAT::tdual_int_1d &sendnum_scan,
const DAT::tdual_int_1d &firstrecv, const DAT::tdual_int_1d &pbc_flag, const DAT::tdual_int_2d &pbc,
const DAT::tdual_int_1d &g2l) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
if(domain->triclinic) {
struct AtomVecKokkos_PackCommSelfFused<LMPHostType,1> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackCommSelfFused<LMPHostType,0> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz);
Kokkos::parallel_for(n,f);
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
if(domain->triclinic) {
struct AtomVecKokkos_PackCommSelfFused<LMPDeviceType,1> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz);
Kokkos::parallel_for(n,f);
} else {
struct AtomVecKokkos_PackCommSelfFused<LMPDeviceType,0> f(atomKK->k_x,list,pbc,pbc_flag,firstrecv,sendnum_scan,g2l,
domain->xprd,domain->yprd,domain->zprd,
domain->xy,domain->xz,domain->yz);
Kokkos::parallel_for(n,f);
}
}
return n*3;
}
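
pack_comm_self_fused flattens every swap into one index range so a single kernel launch covers the whole forward exchange: the swap is recovered from an inclusive prefix scan of the send counts, and a send-list entry that is itself a ghost is remapped to its owning local atom through the g2l table. A serial sketch of that indexing on toy data (PBC shifts omitted):

#include <cstdio>
#include <vector>

int main() {
  // toy setup: two swaps sending 3 and 2 atoms -> inclusive scan {3, 5}
  std::vector<int> sendnum_scan = {3, 5};
  std::vector<int> firstrecv   = {4, 7};    // firstrecv[0] == nlocal
  const int nlocal = firstrecv[0];
  std::vector<std::vector<int>> list = {{0, 1, 2}, {3, 4}};
  std::vector<int> g2l = {1, 2, 0};         // ghost index -> owning local atom
  std::vector<double> x(9, 0.0);            // 1-D positions, locals + ghosts

  const int totalsend = sendnum_scan.back();
  for (int ii = 0; ii < totalsend; ii++) {  // one flat range over all swaps
    int iswap = 0;
    while (ii >= sendnum_scan[iswap]) iswap++;        // recover the swap
    const int i = (iswap > 0) ? ii - sendnum_scan[iswap - 1] : ii;
    int j = list[iswap][i];
    if (j >= nlocal) j = g2l[j - nlocal];             // ghost -> owner
    x[firstrecv[iswap] + i] = x[j];   // copy position (PBC shift omitted)
  }
  std::printf("packed %d ghost positions in one pass\n", totalsend);
}
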
/* ---------------------------------------------------------------------- */
template<class DeviceType>

src/KOKKOS/atom_vec_kokkos.h View File

@ -51,6 +51,14 @@ class AtomVecKokkos : public AtomVec {
const int & iswap, const int nfirst,
const int &pbc_flag, const int pbc[]);
virtual int
pack_comm_self_fused(const int &n, const DAT::tdual_int_2d &list,
const DAT::tdual_int_1d &sendnum_scan,
const DAT::tdual_int_1d &firstrecv,
const DAT::tdual_int_1d &pbc_flag,
const DAT::tdual_int_2d &pbc,
const DAT::tdual_int_1d &g2l);
virtual int
pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
const int & iswap, const DAT::tdual_xfloat_2d &buf,

src/KOKKOS/atom_vec_molecular_kokkos.cpp View File

@ -24,7 +24,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
/* ---------------------------------------------------------------------- */
@ -58,14 +58,15 @@ AtomVecMolecularKokkos::AtomVecMolecularKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
void AtomVecMolecularKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atomKK->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -121,7 +122,7 @@ void AtomVecMolecularKokkos::grow(int n)
atomKK->improper_per_atom,"atom:improper_atom4");
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
if (atom->nextra_grow)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@ -361,7 +362,7 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
atomKK->sync(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecMolecularKokkos_PackComm<LMPHostType,1,1>
@ -388,7 +389,7 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
}
}
} else {
sync(Device,X_MASK);
atomKK->sync(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecMolecularKokkos_PackComm<LMPDeviceType,1,1>
@ -477,8 +478,8 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
const int nfirst, const int &pbc_flag,
const int* const pbc) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
atomKK->sync(Host,X_MASK);
atomKK->modified(Host,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecMolecularKokkos_PackCommSelf<LMPHostType,1,1>
@ -505,8 +506,8 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
}
}
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
atomKK->sync(Device,X_MASK);
atomKK->modified(Device,X_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecMolecularKokkos_PackCommSelf<LMPDeviceType,1,1>
@ -565,13 +566,13 @@ struct AtomVecMolecularKokkos_UnpackComm {
void AtomVecMolecularKokkos::unpack_comm_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK);
modified(Host,X_MASK);
atomKK->sync(Host,X_MASK);
atomKK->modified(Host,X_MASK);
struct AtomVecMolecularKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
} else {
sync(Device,X_MASK);
modified(Device,X_MASK);
atomKK->sync(Device,X_MASK);
atomKK->modified(Device,X_MASK);
struct AtomVecMolecularKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
Kokkos::parallel_for(n,f);
}
@ -714,7 +715,7 @@ void AtomVecMolecularKokkos::unpack_comm_vel(int n, int first, double *buf)
int AtomVecMolecularKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK);
atomKK->sync(Host,F_MASK);
int m = 0;
const int last = first + n;
@ -731,7 +732,7 @@ int AtomVecMolecularKokkos::pack_reverse(int n, int first, double *buf)
void AtomVecMolecularKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0)
modified(Host,F_MASK);
atomKK->modified(Host,F_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
@ -1032,9 +1033,9 @@ struct AtomVecMolecularKokkos_UnpackBorder {
void AtomVecMolecularKokkos::unpack_border_kokkos(const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf,
ExecutionSpace space) {
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
while (first+n >= nmax) grow(0);
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
if(space==Host) {
struct AtomVecMolecularKokkos_UnpackBorder<LMPHostType>
f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
@ -1056,7 +1057,7 @@ void AtomVecMolecularKokkos::unpack_border(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -1082,7 +1083,7 @@ void AtomVecMolecularKokkos::unpack_border_vel(int n, int first, double *buf)
last = first + n;
for (i = first; i < last; i++) {
if (i == nmax) grow(0);
modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK);
h_x(i,0) = buf[m++];
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
@ -1615,7 +1616,7 @@ int AtomVecMolecularKokkos::unpack_exchange(double *buf)
{
int nlocal = atom->nlocal;
if (nlocal == nmax) grow(0);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);
@ -1707,7 +1708,7 @@ int AtomVecMolecularKokkos::size_restart()
int AtomVecMolecularKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);
@ -1780,7 +1781,7 @@ int AtomVecMolecularKokkos::unpack_restart(double *buf)
memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
}
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | MOLECULE_MASK | BOND_MASK |
ANGLE_MASK | DIHEDRAL_MASK | IMPROPER_MASK | SPECIAL_MASK);

src/KOKKOS/atom_vec_sphere_kokkos.cpp View File

@ -30,7 +30,7 @@
using namespace LAMMPS_NS;
#define DELTA 10000
#define DELTA 10
static const double MY_PI = 3.14159265358979323846; // pi
@ -93,14 +93,15 @@ void AtomVecSphereKokkos::init()
void AtomVecSphereKokkos::grow(int n)
{
if (n == 0) nmax += DELTA;
int step = MAX(DELTA,nmax*0.01);
if (n == 0) nmax += step;
else nmax = n;
atom->nmax = nmax;
if (nmax < 0 || nmax > MAXSMALLINT)
error->one(FLERR,"Per-processor system is too big");
sync(Device,ALL_MASK);
modified(Device,ALL_MASK);
atomKK->sync(Device,ALL_MASK);
atomKK->modified(Device,ALL_MASK);
memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
@ -120,7 +121,7 @@ void AtomVecSphereKokkos::grow(int n)
modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
grow_reset();
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
}
/* ----------------------------------------------------------------------
@ -172,7 +173,7 @@ void AtomVecSphereKokkos::grow_reset()
void AtomVecSphereKokkos::copy(int i, int j, int delflag)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | RADIUS_MASK |
RMASS_MASK | OMEGA_MASK);
@ -197,7 +198,7 @@ void AtomVecSphereKokkos::copy(int i, int j, int delflag)
for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | RADIUS_MASK |
RMASS_MASK | OMEGA_MASK);
}
@ -277,7 +278,7 @@ int AtomVecSphereKokkos::pack_comm_kokkos(
// Check whether to always run forward communication on the host
// Choose correct forward PackComm kernel
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecSphereKokkos_PackComm<LMPHostType,1,1> f(
@ -316,7 +317,7 @@ int AtomVecSphereKokkos::pack_comm_kokkos(
}
}
} else {
sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecSphereKokkos_PackComm<LMPDeviceType,1,1> f(
@ -464,7 +465,7 @@ int AtomVecSphereKokkos::pack_comm_vel_kokkos(
const int* const pbc)
{
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
if(pbc_flag) {
if(deform_vremap) {
if(domain->triclinic) {
@ -595,7 +596,7 @@ int AtomVecSphereKokkos::pack_comm_vel_kokkos(
}
}
} else {
sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
if(pbc_flag) {
if(deform_vremap) {
if(domain->triclinic) {
@ -795,8 +796,8 @@ int AtomVecSphereKokkos::pack_comm_self(
if (radvary == 0)
return AtomVecKokkos::pack_comm_self(n,list,iswap,nfirst,pbc_flag,pbc);
if(commKK->forward_comm_on_host) {
sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecSphereKokkos_PackCommSelf<LMPHostType,1,1> f(
@ -835,8 +836,8 @@ int AtomVecSphereKokkos::pack_comm_self(
}
}
} else {
sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->sync(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
if(pbc_flag) {
if(domain->triclinic) {
struct AtomVecSphereKokkos_PackCommSelf<LMPDeviceType,1,1> f(
@ -926,14 +927,14 @@ void AtomVecSphereKokkos::unpack_comm_kokkos(
return;
}
if(commKK->forward_comm_on_host) {
modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
struct AtomVecSphereKokkos_UnpackComm<LMPHostType> f(
atomKK->k_x,
atomKK->k_radius,atomKK->k_rmass,
buf,first);
Kokkos::parallel_for(n,f);
} else {
modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK);
struct AtomVecSphereKokkos_UnpackComm<LMPDeviceType> f(
atomKK->k_x,
atomKK->k_radius,atomKK->k_rmass,
@ -998,7 +999,7 @@ void AtomVecSphereKokkos::unpack_comm_vel_kokkos(
const int &n, const int &first,
const DAT::tdual_xfloat_2d &buf ) {
if(commKK->forward_comm_on_host) {
modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
if (radvary == 0) {
struct AtomVecSphereKokkos_UnpackCommVel<LMPHostType,0> f(
atomKK->k_x,
@ -1015,7 +1016,7 @@ void AtomVecSphereKokkos::unpack_comm_vel_kokkos(
Kokkos::parallel_for(n,f);
}
} else {
modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->modified(Device,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
if (radvary == 0) {
struct AtomVecSphereKokkos_UnpackCommVel<LMPDeviceType,0> f(
atomKK->k_x,
@ -1044,7 +1045,7 @@ int AtomVecSphereKokkos::pack_comm(int n, int *list, double *buf,
if (radvary == 0) {
// Not sure if we need to call sync for X here
sync(Host,X_MASK);
atomKK->sync(Host,X_MASK);
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
@ -1071,7 +1072,7 @@ int AtomVecSphereKokkos::pack_comm(int n, int *list, double *buf,
}
}
} else {
sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
@ -1115,7 +1116,7 @@ int AtomVecSphereKokkos::pack_comm_vel(int n, int *list, double *buf,
double dx,dy,dz,dvx,dvy,dvz;
if (radvary == 0) {
sync(Host,X_MASK|V_MASK|OMEGA_MASK);
atomKK->sync(Host,X_MASK|V_MASK|OMEGA_MASK);
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
@ -1178,7 +1179,7 @@ int AtomVecSphereKokkos::pack_comm_vel(int n, int *list, double *buf,
}
}
} else {
sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->sync(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
m = 0;
if (pbc_flag == 0) {
for (i = 0; i < n; i++) {
@ -1257,7 +1258,7 @@ int AtomVecSphereKokkos::pack_comm_hybrid(int n, int *list, double *buf)
{
if (radvary == 0) return 0;
sync(Host,RADIUS_MASK|RMASS_MASK);
atomKK->sync(Host,RADIUS_MASK|RMASS_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
@ -1280,7 +1281,7 @@ void AtomVecSphereKokkos::unpack_comm(int n, int first, double *buf)
h_x(i,1) = buf[m++];
h_x(i,2) = buf[m++];
}
modified(Host,X_MASK);
atomKK->modified(Host,X_MASK);
} else {
int m = 0;
const int last = first + n;
@ -1291,7 +1292,7 @@ void AtomVecSphereKokkos::unpack_comm(int n, int first, double *buf)
h_radius[i] = buf[m++];
h_rmass[i] = buf[m++];
}
modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK);
}
}
@ -1313,7 +1314,7 @@ void AtomVecSphereKokkos::unpack_comm_vel(int n, int first, double *buf)
h_omega(i,1) = buf[m++];
h_omega(i,2) = buf[m++];
}
modified(Host,X_MASK|V_MASK|OMEGA_MASK);
atomKK->modified(Host,X_MASK|V_MASK|OMEGA_MASK);
} else {
int m = 0;
const int last = first + n;
@ -1330,7 +1331,7 @@ void AtomVecSphereKokkos::unpack_comm_vel(int n, int first, double *buf)
h_omega(i,1) = buf[m++];
h_omega(i,2) = buf[m++];
}
modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->modified(Host,X_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
}
}
@ -1346,7 +1347,7 @@ int AtomVecSphereKokkos::unpack_comm_hybrid(int n, int first, double *buf)
h_radius[i] = buf[m++];
h_rmass[i] = buf[m++];
}
modified(Host,RADIUS_MASK|RMASS_MASK);
atomKK->modified(Host,RADIUS_MASK|RMASS_MASK);
return m;
}
@ -1355,7 +1356,7 @@ int AtomVecSphereKokkos::unpack_comm_hybrid(int n, int first, double *buf)
int AtomVecSphereKokkos::pack_reverse(int n, int first, double *buf)
{
if(n > 0)
sync(Host,F_MASK|TORQUE_MASK);
atomKK->sync(Host,F_MASK|TORQUE_MASK);
int m = 0;
const int last = first + n;
@ -1375,7 +1376,7 @@ int AtomVecSphereKokkos::pack_reverse(int n, int first, double *buf)
int AtomVecSphereKokkos::pack_reverse_hybrid(int n, int first, double *buf)
{
if(n > 0)
sync(Host,TORQUE_MASK);
atomKK->sync(Host,TORQUE_MASK);
int m = 0;
const int last = first + n;
@ -1392,7 +1393,7 @@ int AtomVecSphereKokkos::pack_reverse_hybrid(int n, int first, double *buf)
void AtomVecSphereKokkos::unpack_reverse(int n, int *list, double *buf)
{
if(n > 0) {
modified(Host,F_MASK|TORQUE_MASK);
atomKK->modified(Host,F_MASK|TORQUE_MASK);
}
int m = 0;
@ -1412,7 +1413,7 @@ void AtomVecSphereKokkos::unpack_reverse(int n, int *list, double *buf)
int AtomVecSphereKokkos::unpack_reverse_hybrid(int n, int *list, double *buf)
{
if(n > 0) {
modified(Host,TORQUE_MASK);
atomKK->modified(Host,TORQUE_MASK);
}
int m = 0;
@ -1492,7 +1493,7 @@ int AtomVecSphereKokkos::pack_border_kokkos(
X_FLOAT dx,dy,dz;
// This was in atom_vec_dpd_kokkos but doesn't appear in any other atom_vec
sync(space,ALL_MASK);
atomKK->sync(space,ALL_MASK);
if (pbc_flag != 0) {
if (domain->triclinic == 0) {
@ -1549,7 +1550,7 @@ int AtomVecSphereKokkos::pack_border(
int i,j,m;
double dx,dy,dz;
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
m = 0;
if (pbc_flag == 0) {
@ -1686,7 +1687,7 @@ int AtomVecSphereKokkos::pack_border_vel_kokkos(
X_FLOAT dvx=0,dvy=0,dvz=0;
// This was in atom_vec_dpd_kokkos but doesn't appear in any other atom_vec
sync(space,ALL_MASK);
atomKK->sync(space,ALL_MASK);
if (pbc_flag != 0) {
if (domain->triclinic == 0) {
@ -1776,7 +1777,7 @@ int AtomVecSphereKokkos::pack_border_vel(int n, int *list, double *buf,
int i,j,m;
double dx,dy,dz,dvx,dvy,dvz;
sync(Host,ALL_MASK);
atomKK->sync(Host,ALL_MASK);
m = 0;
if (pbc_flag == 0) {
@ -1866,7 +1867,7 @@ int AtomVecSphereKokkos::pack_border_vel(int n, int *list, double *buf,
int AtomVecSphereKokkos::pack_border_hybrid(int n, int *list, double *buf)
{
sync(Host,RADIUS_MASK|RMASS_MASK);
atomKK->sync(Host,RADIUS_MASK|RMASS_MASK);
int m = 0;
for (int i = 0; i < n; i++) {
@ -1942,7 +1943,7 @@ void AtomVecSphereKokkos::unpack_border_kokkos(const int &n, const int &first,
Kokkos::parallel_for(n,f);
}
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
RADIUS_MASK|RMASS_MASK);
}
@ -1969,7 +1970,7 @@ void AtomVecSphereKokkos::unpack_border(int n, int first, double *buf)
m += modify->fix[atom->extra_border[iextra]]->
unpack_border(n,first,&buf[m]);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK);
}
@ -2052,7 +2053,7 @@ void AtomVecSphereKokkos::unpack_border_vel_kokkos(
Kokkos::parallel_for(n,f);
}
modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
}
@ -2085,7 +2086,7 @@ void AtomVecSphereKokkos::unpack_border_vel(int n, int first, double *buf)
m += modify->fix[atom->extra_border[iextra]]->
unpack_border(n,first,&buf[m]);
modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|RADIUS_MASK|RMASS_MASK|V_MASK|OMEGA_MASK);
}
/* ---------------------------------------------------------------------- */
@ -2098,7 +2099,7 @@ int AtomVecSphereKokkos::unpack_border_hybrid(int n, int first, double *buf)
h_radius[i] = buf[m++];
h_rmass[i] = buf[m++];
}
modified(Host,RADIUS_MASK|RMASS_MASK);
atomKK->modified(Host,RADIUS_MASK|RMASS_MASK);
return m;
}
@ -2218,7 +2219,7 @@ int AtomVecSphereKokkos::pack_exchange_kokkos(
int newsize = nsend*17/k_buf.view<LMPHostType>().extent(1)+1;
k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
}
sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
OMEGA_MASK);
@ -2239,7 +2240,7 @@ int AtomVecSphereKokkos::pack_exchange_kokkos(
int AtomVecSphereKokkos::pack_exchange(int i, double *buf)
{
sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
OMEGA_MASK);
@ -2354,7 +2355,7 @@ int AtomVecSphereKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
k_count.sync<LMPHostType>();
}
modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK| RADIUS_MASK | RMASS_MASK |
OMEGA_MASK);
@ -2391,7 +2392,7 @@ int AtomVecSphereKokkos::unpack_exchange(double *buf)
m += modify->fix[atom->extra_grow[iextra]]->
unpack_exchange(nlocal,&buf[m]);
modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | RADIUS_MASK | RMASS_MASK |
OMEGA_MASK);
@ -2427,7 +2428,7 @@ int AtomVecSphereKokkos::size_restart()
int AtomVecSphereKokkos::pack_restart(int i, double *buf)
{
sync(Host,X_MASK | TAG_MASK | TYPE_MASK |
atomKK->sync(Host,X_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | V_MASK |
RADIUS_MASK | RMASS_MASK | OMEGA_MASK);
@ -2494,7 +2495,7 @@ int AtomVecSphereKokkos::unpack_restart(double *buf)
for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
}
modified(Host,X_MASK | TAG_MASK | TYPE_MASK |
atomKK->modified(Host,X_MASK | TAG_MASK | TYPE_MASK |
MASK_MASK | IMAGE_MASK | V_MASK |
RADIUS_MASK | RMASS_MASK | OMEGA_MASK);
@ -2616,14 +2617,14 @@ int AtomVecSphereKokkos::data_atom_hybrid(int nlocal, char **values)
void AtomVecSphereKokkos::data_vel(int m, char **values)
{
sync(Host,V_MASK|OMEGA_MASK);
atomKK->sync(Host,V_MASK|OMEGA_MASK);
h_v(m,0) = atof(values[0]);
h_v(m,1) = atof(values[1]);
h_v(m,2) = atof(values[2]);
h_omega(m,0) = atof(values[3]);
h_omega(m,1) = atof(values[4]);
h_omega(m,2) = atof(values[5]);
modified(Host,V_MASK|OMEGA_MASK);
atomKK->modified(Host,V_MASK|OMEGA_MASK);
}
/* ----------------------------------------------------------------------
@ -2632,11 +2633,11 @@ void AtomVecSphereKokkos::data_vel(int m, char **values)
int AtomVecSphereKokkos::data_vel_hybrid(int m, char **values)
{
sync(Host,OMEGA_MASK);
atomKK->sync(Host,OMEGA_MASK);
omega[m][0] = atof(values[0]);
omega[m][1] = atof(values[1]);
omega[m][2] = atof(values[2]);
modified(Host,OMEGA_MASK);
atomKK->modified(Host,OMEGA_MASK);
return 3;
}
@ -2711,7 +2712,7 @@ int AtomVecSphereKokkos::write_data_hybrid(FILE *fp, double *buf)
void AtomVecSphereKokkos::pack_vel(double **buf)
{
sync(Host,TAG_MASK|V_MASK|OMEGA_MASK);
atomKK->sync(Host,TAG_MASK|V_MASK|OMEGA_MASK);
int nlocal = atom->nlocal;
for (int i = 0; i < nlocal; i++) {
@ -2731,7 +2732,7 @@ void AtomVecSphereKokkos::pack_vel(double **buf)
int AtomVecSphereKokkos::pack_vel_hybrid(int i, double *buf)
{
sync(Host,OMEGA_MASK);
atomKK->sync(Host,OMEGA_MASK);
buf[0] = h_omega(i,0);
buf[1] = h_omega(i,1);

src/KOKKOS/comm_kokkos.cpp View File

@ -57,10 +57,9 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
memory->destroy(buf_recv);
buf_recv = NULL;
k_exchange_sendlist = DAT::
tdual_int_1d("comm:k_exchange_sendlist",100);
k_exchange_copylist = DAT::
tdual_int_1d("comm:k_exchange_copylist",100);
k_exchange_lists = DAT::tdual_int_2d("comm:k_exchange_lists",2,100);
k_exchange_sendlist = Kokkos::subview(k_exchange_lists,0,Kokkos::ALL);
k_exchange_copylist = Kokkos::subview(k_exchange_lists,1,Kokkos::ALL);
k_count = DAT::tdual_int_scalar("comm:k_count");
k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);
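
Storing the send and copy lists as two subviews of one 2-D k_exchange_lists view means a single resize keeps both the same length, which the retry loop in exchange_device below depends on. A plain C++ analog of the layout (ExchangeLists is a hypothetical stand-in, not LAMMPS API):

#include <algorithm>
#include <cstdio>
#include <vector>

// Hypothetical analog of k_exchange_lists: two logical lists stored as two
// rows of one buffer so a single resize grows both in lockstep.
struct ExchangeLists {
  std::vector<int> storage;   // row 0 = sendlist, row 1 = copylist
  int cap = 0;
  void resize(int n) {
    std::vector<int> next(2 * n, 0);
    for (int r = 0; r < 2; r++)   // preserve old rows, like a Kokkos resize
      std::copy_n(storage.begin() + r * cap, std::min(cap, n),
                  next.begin() + r * n);
    storage.swap(next);
    cap = n;
  }
  int *sendlist() { return storage.data(); }        // like subview(...,0,ALL)
  int *copylist() { return storage.data() + cap; }  // like subview(...,1,ALL)
};

int main() {
  ExchangeLists lists;
  lists.resize(100);
  lists.sendlist()[0] = 42;
  lists.copylist()[0] = 7;
  lists.resize(110);  // one call; both lists stay the same length
  std::printf("cap %d, sendlist[0]=%d, copylist[0]=%d\n",
              lists.cap, lists.sendlist()[0], lists.copylist()[0]);
}
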
@ -187,71 +186,80 @@ void CommKokkos::forward_comm_device(int dummy)
k_sendlist.sync<DeviceType>();
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
for (int iswap = 0; iswap < nswap; iswap++) {
if (sendproc[iswap] != me) {
if (comm_x_only) {
if (size_forward_recv[iswap]) {
buf = atomKK->k_x.view<DeviceType>().data() +
firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
recvproc[iswap],0,world,&request);
}
n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
if (n) {
MPI_Send(k_buf_send.view<DeviceType>().data(),
n,MPI_DOUBLE,sendproc[iswap],0,world);
}
if (comm->nprocs == 1 && !ghost_velocity) {
k_swap.sync<DeviceType>();
k_swap2.sync<DeviceType>();
k_pbc.sync<DeviceType>();
n = avec->pack_comm_self_fused(totalsend,k_sendlist,k_sendnum_scan,
k_firstrecv,k_pbc_flag,k_pbc,k_g2l);
} else {
if (size_forward_recv[iswap]) {
MPI_Wait(&request,MPI_STATUS_IGNORE);
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
space,X_MASK);
for (int iswap = 0; iswap < nswap; iswap++) {
if (sendproc[iswap] != me) {
if (comm_x_only) {
if (size_forward_recv[iswap]) {
buf = atomKK->k_x.view<DeviceType>().data() +
firstrecv[iswap]*atomKK->k_x.view<DeviceType>().extent(1);
MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
recvproc[iswap],0,world,&request);
}
n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
if (n) {
MPI_Send(k_buf_send.view<DeviceType>().data(),
n,MPI_DOUBLE,sendproc[iswap],0,world);
}
if (size_forward_recv[iswap]) {
MPI_Wait(&request,MPI_STATUS_IGNORE);
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
space,X_MASK);
}
} else if (ghost_velocity) {
if (size_forward_recv[iswap]) {
MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
size_forward_recv[iswap],MPI_DOUBLE,
recvproc[iswap],0,world,&request);
}
n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
if (n) {
MPI_Send(k_buf_send.view<DeviceType>().data(),n,
MPI_DOUBLE,sendproc[iswap],0,world);
}
if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
DeviceType::fence();
} else {
if (size_forward_recv[iswap])
MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
size_forward_recv[iswap],MPI_DOUBLE,
recvproc[iswap],0,world,&request);
n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
if (n)
MPI_Send(k_buf_send.view<DeviceType>().data(),n,
MPI_DOUBLE,sendproc[iswap],0,world);
if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
DeviceType::fence();
}
} else if (ghost_velocity) {
if (size_forward_recv[iswap]) {
MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
size_forward_recv[iswap],MPI_DOUBLE,
recvproc[iswap],0,world,&request);
}
n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
if (n) {
MPI_Send(k_buf_send.view<DeviceType>().data(),n,
MPI_DOUBLE,sendproc[iswap],0,world);
}
if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
DeviceType::fence();
} else {
if (size_forward_recv[iswap])
MPI_Irecv(k_buf_recv.view<DeviceType>().data(),
size_forward_recv[iswap],MPI_DOUBLE,
recvproc[iswap],0,world,&request);
n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
if (n)
MPI_Send(k_buf_send.view<DeviceType>().data(),n,
MPI_DOUBLE,sendproc[iswap],0,world);
if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv);
DeviceType::fence();
}
} else {
if (!ghost_velocity) {
if (sendnum[iswap])
n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
} else {
n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
DeviceType::fence();
if (!ghost_velocity) {
if (sendnum[iswap])
n = avec->pack_comm_self(sendnum[iswap],k_sendlist,iswap,
firstrecv[iswap],pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
} else {
n = avec->pack_comm_vel_kokkos(sendnum[iswap],k_sendlist,iswap,
k_buf_send,pbc_flag[iswap],pbc[iswap]);
DeviceType::fence();
avec->unpack_comm_vel_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
DeviceType::fence();
}
}
}
}
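
The restructured loop above adds a fast path: on a single MPI rank with ghost velocities disabled, the per-swap pack/send/recv/unpack sequence collapses into one fused self-pack over all swaps, trading nswap kernel launches for one. A toy sketch of that dispatch decision (launch() stands in for Kokkos::parallel_for):

#include <cstdio>

// launch() is a toy stand-in for a Kokkos::parallel_for kernel launch
static void launch(const char *what, int n) {
  std::printf("kernel '%s' over %d items\n", what, n);
}

int main() {
  const int nswap = 6;
  int sendnum[nswap] = {10, 10, 8, 8, 12, 12};
  bool single_rank = true, ghost_velocity = false;

  if (single_rank && !ghost_velocity) {
    int totalsend = 0;
    for (int s : sendnum) totalsend += s;
    launch("pack_comm_self_fused", totalsend);   // one launch for all swaps
  } else {
    for (int iswap = 0; iswap < nswap; iswap++)  // old path: nswap launches
      launch("pack_comm_self", sendnum[iswap]);
  }
}
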
@ -496,9 +504,8 @@ void CommKokkos::exchange()
}
atomKK->sync(Host,ALL_MASK);
atomKK->modified(Host,ALL_MASK);
CommBrick::exchange();
atomKK->modified(Host,ALL_MASK);
}
/* ---------------------------------------------------------------------- */
@ -565,146 +572,149 @@ void CommKokkos::exchange_device()
atom->nghost = 0;
atom->avec->clear_bonus();
// subbox bounds for orthogonal or triclinic
if (comm->nprocs > 1) { // otherwise no-op
if (triclinic == 0) {
sublo = domain->sublo;
subhi = domain->subhi;
} else {
sublo = domain->sublo_lamda;
subhi = domain->subhi_lamda;
}
// subbox bounds for orthogonal or triclinic
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
if (triclinic == 0) {
sublo = domain->sublo;
subhi = domain->subhi;
} else {
sublo = domain->sublo_lamda;
subhi = domain->subhi_lamda;
}
// loop over dimensions
for (int dim = 0; dim < 3; dim++) {
atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
// fill buffer with atoms leaving my box, using < and >=
// when atom is deleted, fill it in with last atom
// loop over dimensions
for (int dim = 0; dim < 3; dim++) {
x = atom->x;
lo = sublo[dim];
hi = subhi[dim];
nlocal = atom->nlocal;
i = nsend = 0;
// fill buffer with atoms leaving my box, using < and >=
// when atom is deleted, fill it in with last atom
if (true) {
if (k_sendflag.h_view.extent(0)<nlocal) k_sendflag.resize(nlocal);
k_sendflag.sync<DeviceType>();
k_count.h_view() = k_exchange_sendlist.h_view.extent(0);
while (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
k_count.h_view() = 0;
k_count.modify<LMPHostType>();
k_count.sync<DeviceType>();
x = atom->x;
lo = sublo[dim];
hi = subhi[dim];
nlocal = atom->nlocal;
i = nsend = 0;
BuildExchangeListFunctor<DeviceType>
f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
nlocal,dim,lo,hi);
Kokkos::parallel_for(nlocal,f);
k_exchange_sendlist.modify<DeviceType>();
k_sendflag.modify<DeviceType>();
k_count.modify<DeviceType>();
if (true) {
if (k_sendflag.h_view.extent(0)<nlocal) k_sendflag.resize(nlocal);
k_sendflag.sync<DeviceType>();
k_count.h_view() = k_exchange_sendlist.h_view.extent(0);
while (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
k_count.h_view() = 0;
k_count.modify<LMPHostType>();
k_count.sync<DeviceType>();
k_count.sync<LMPHostType>();
if (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
k_exchange_sendlist.resize(k_count.h_view()*1.1);
k_exchange_copylist.resize(k_count.h_view()*1.1);
k_count.h_view()=k_exchange_sendlist.h_view.extent(0);
BuildExchangeListFunctor<DeviceType>
f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
nlocal,dim,lo,hi);
Kokkos::parallel_for(nlocal,f);
k_exchange_sendlist.modify<DeviceType>();
k_sendflag.modify<DeviceType>();
k_count.modify<DeviceType>();
k_count.sync<LMPHostType>();
if (k_count.h_view()>=k_exchange_sendlist.h_view.extent(0)) {
k_exchange_lists.resize(2,k_count.h_view()*1.1);
k_exchange_sendlist = Kokkos::subview(k_exchange_lists,0,Kokkos::ALL);
k_exchange_copylist = Kokkos::subview(k_exchange_lists,1,Kokkos::ALL);
k_count.h_view()=k_exchange_sendlist.h_view.extent(0);
}
}
k_exchange_lists.sync<LMPHostType>();
k_sendflag.sync<LMPHostType>();
int sendpos = nlocal-1;
nlocal -= k_count.h_view();
for(int i = 0; i < k_count.h_view(); i++) {
if (k_exchange_sendlist.h_view(i)<nlocal) {
while (k_sendflag.h_view(sendpos)) sendpos--;
k_exchange_copylist.h_view(i) = sendpos;
sendpos--;
} else
k_exchange_copylist.h_view(i) = -1;
}
k_exchange_copylist.modify<LMPHostType>();
k_exchange_copylist.sync<DeviceType>();
nsend = k_count.h_view();
if (nsend > maxsend) grow_send_kokkos(nsend,1);
nsend =
avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
k_exchange_sendlist,k_exchange_copylist,
ExecutionSpaceFromDevice<DeviceType>::space,
dim,lo,hi);
DeviceType::fence();
} else {
while (i < nlocal) {
if (x[i][dim] < lo || x[i][dim] >= hi) {
if (nsend > maxsend) grow_send_kokkos(nsend,1);
nsend += avec->pack_exchange(i,&buf_send[nsend]);
avec->copy(nlocal-1,i,1);
nlocal--;
} else i++;
}
}
k_exchange_copylist.sync<LMPHostType>();
k_exchange_sendlist.sync<LMPHostType>();
k_sendflag.sync<LMPHostType>();
atom->nlocal = nlocal;
int sendpos = nlocal-1;
nlocal -= k_count.h_view();
for(int i = 0; i < k_count.h_view(); i++) {
if (k_exchange_sendlist.h_view(i)<nlocal) {
while (k_sendflag.h_view(sendpos)) sendpos--;
k_exchange_copylist.h_view(i) = sendpos;
sendpos--;
} else
k_exchange_copylist.h_view(i) = -1;
}
// send/recv atoms in both directions
// if 1 proc in dimension, no send/recv, set recv buf to send buf
// if 2 procs in dimension, single send/recv
// if more than 2 procs in dimension, send/recv to both neighbors
k_exchange_copylist.modify<LMPHostType>();
k_exchange_copylist.sync<DeviceType>();
nsend = k_count.h_view();
if (nsend > maxsend) grow_send_kokkos(nsend,1);
nsend =
avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
k_exchange_sendlist,k_exchange_copylist,
ExecutionSpaceFromDevice<DeviceType>::space,
dim,lo,hi);
DeviceType::fence();
} else {
while (i < nlocal) {
if (x[i][dim] < lo || x[i][dim] >= hi) {
if (nsend > maxsend) grow_send_kokkos(nsend,1);
nsend += avec->pack_exchange(i,&buf_send[nsend]);
avec->copy(nlocal-1,i,1);
nlocal--;
} else i++;
}
}
atom->nlocal = nlocal;
if (procgrid[dim] == 1) {
nrecv = nsend;
if (nrecv) {
atom->nlocal=avec->
unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
ExecutionSpaceFromDevice<DeviceType>::space);
DeviceType::fence();
}
} else {
MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
&nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
nrecv = nrecv1;
if (procgrid[dim] > 2) {
MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
&nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
nrecv += nrecv2;
}
if (nrecv > maxrecv) grow_recv_kokkos(nrecv);
// send/recv atoms in both directions
// if 1 proc in dimension, no send/recv, set recv buf to send buf
// if 2 procs in dimension, single send/recv
// if more than 2 procs in dimension, send/recv to both neighbors
if (procgrid[dim] == 1) {
nrecv = nsend;
if (nrecv) {
atom->nlocal=avec->
unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
ExecutionSpaceFromDevice<DeviceType>::space);
DeviceType::fence();
}
} else {
MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
&nrecv1,1,MPI_INT,procneigh[dim][1],0,world,MPI_STATUS_IGNORE);
nrecv = nrecv1;
if (procgrid[dim] > 2) {
MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][1],0,
&nrecv2,1,MPI_INT,procneigh[dim][0],0,world,MPI_STATUS_IGNORE);
nrecv += nrecv2;
}
if (nrecv > maxrecv) grow_recv_kokkos(nrecv);
MPI_Irecv(k_buf_recv.view<DeviceType>().data(),nrecv1,
MPI_DOUBLE,procneigh[dim][1],0,
world,&request);
MPI_Send(k_buf_send.view<DeviceType>().data(),nsend,
MPI_DOUBLE,procneigh[dim][0],0,world);
MPI_Wait(&request,MPI_STATUS_IGNORE);
if (procgrid[dim] > 2) {
MPI_Irecv(k_buf_recv.view<DeviceType>().data()+nrecv1,
nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
MPI_Irecv(k_buf_recv.view<DeviceType>().data(),nrecv1,
MPI_DOUBLE,procneigh[dim][1],0,
world,&request);
MPI_Send(k_buf_send.view<DeviceType>().data(),nsend,
MPI_DOUBLE,procneigh[dim][1],0,world);
MPI_DOUBLE,procneigh[dim][0],0,world);
MPI_Wait(&request,MPI_STATUS_IGNORE);
if (procgrid[dim] > 2) {
MPI_Irecv(k_buf_recv.view<DeviceType>().data()+nrecv1,
nrecv2,MPI_DOUBLE,procneigh[dim][0],0,
world,&request);
MPI_Send(k_buf_send.view<DeviceType>().data(),nsend,
MPI_DOUBLE,procneigh[dim][1],0,world);
MPI_Wait(&request,MPI_STATUS_IGNORE);
}
if (nrecv) {
atom->nlocal = avec->
unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
ExecutionSpaceFromDevice<DeviceType>::space);
DeviceType::fence();
}
}
if (nrecv) {
atom->nlocal = avec->
unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
ExecutionSpaceFromDevice<DeviceType>::space);
DeviceType::fence();
}
// check incoming atoms to see if they are in my box
// if so, add to my list
}
// check incoming atoms to see if they are in my box
// if so, add to my list
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
}
atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::space,ALL_MASK);
if (atom->firstgroupname) {
/* this is not yet implemented with Kokkos */
atomKK->sync(Host,ALL_MASK);
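
The counting loop in exchange_device above runs the flagging functor at whatever capacity the exchange list currently has; when the atomic counter reports overflow, the list is resized with roughly 10% headroom and the counter is reset to force another pass, until every departing atom fits. A serial sketch of that retry pattern:

#include <cstdio>
#include <vector>

int main() {
  // positions along one dimension; atoms outside [lo,hi) must leave
  std::vector<double> x = {0.1, 5.2, -3.0, 9.9, 4.4, -0.5};
  const double lo = 0.0, hi = 5.0;
  std::vector<int> sendlist(2);               // deliberately undersized
  int count = (int)sendlist.size();

  while (count >= (int)sendlist.size()) {     // rerun until the list fits
    count = 0;
    for (int i = 0; i < (int)x.size(); i++)
      if (x[i] < lo || x[i] >= hi) {
        if (count < (int)sendlist.size()) sendlist[count] = i;
        count++;                              // keeps counting past capacity
      }
    if (count >= (int)sendlist.size()) {
      sendlist.resize(count + count / 10 + 1);  // ~10% headroom
      count = (int)sendlist.size();             // force one more pass
    }
  }
  std::printf("%d atoms flagged for exchange\n", count);
}
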
@ -742,14 +752,15 @@ void CommKokkos::borders()
if (!exchange_comm_classic) {
if (exchange_comm_on_host) borders_device<LMPHostType>();
else borders_device<LMPDeviceType>();
return;
} else {
atomKK->sync(Host,ALL_MASK);
CommBrick::borders();
k_sendlist.modify<LMPHostType>();
atomKK->modified(Host,ALL_MASK);
}
atomKK->sync(Host,ALL_MASK);
k_sendlist.sync<LMPHostType>();
CommBrick::borders();
k_sendlist.modify<LMPHostType>();
atomKK->modified(Host,ALL_MASK);
if (comm->nprocs == 1 && !ghost_velocity && !forward_comm_classic)
copy_swap_info();
}
/* ---------------------------------------------------------------------- */
@ -817,7 +828,6 @@ void CommKokkos::borders_device() {
AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
k_sendlist.sync<DeviceType>();
atomKK->sync(exec_space,ALL_MASK);
// do swaps over all 3 dimensions
@ -1037,6 +1047,69 @@ void CommKokkos::borders_device() {
atom->map_set();
}
}
/* ----------------------------------------------------------------------
copy swap info
------------------------------------------------------------------------- */
void CommKokkos::copy_swap_info()
{
if (nswap > k_swap.extent(1)) {
k_swap = DAT::tdual_int_2d("comm:swap",2,nswap);
k_firstrecv = Kokkos::subview(k_swap,0,Kokkos::ALL);
k_sendnum_scan = Kokkos::subview(k_swap,1,Kokkos::ALL);
}
int scan = 0;
for (int iswap = 0; iswap < nswap; iswap++) {
scan += sendnum[iswap];
k_sendnum_scan.h_view[iswap] = scan;
k_firstrecv.h_view[iswap] = firstrecv[iswap];
}
totalsend = scan;
// create map of ghost to local atom id
// store periodic boundary transform from local to ghost
k_sendlist.sync<LMPHostType>();
if (totalsend > k_pbc.extent(0)) {
k_pbc = DAT::tdual_int_2d("comm:pbc",totalsend,6);
k_swap2 = DAT::tdual_int_2d("comm:swap2",2,totalsend);
k_pbc_flag = Kokkos::subview(k_swap2,0,Kokkos::ALL);
k_g2l = Kokkos::subview(k_swap2,1,Kokkos::ALL);
}
for (int iswap = 0; iswap < nswap; iswap++) {
for (int i = 0; i < sendnum[iswap]; i++) {
int source = sendlist[iswap][i] - atom->nlocal;
int dest = firstrecv[iswap] + i - atom->nlocal;
k_pbc_flag.h_view(dest) = pbc_flag[iswap];
k_pbc.h_view(dest,0) = pbc[iswap][0];
k_pbc.h_view(dest,1) = pbc[iswap][1];
k_pbc.h_view(dest,2) = pbc[iswap][2];
k_pbc.h_view(dest,3) = pbc[iswap][3];
k_pbc.h_view(dest,4) = pbc[iswap][4];
k_pbc.h_view(dest,5) = pbc[iswap][5];
k_g2l.h_view(dest) = atom->nlocal + source;
if (source >= 0) {
k_pbc_flag.h_view(dest) = k_pbc_flag.h_view(dest) || k_pbc_flag.h_view(source);
k_pbc.h_view(dest,0) += k_pbc.h_view(source,0);
k_pbc.h_view(dest,1) += k_pbc.h_view(source,1);
k_pbc.h_view(dest,2) += k_pbc.h_view(source,2);
k_pbc.h_view(dest,3) += k_pbc.h_view(source,3);
k_pbc.h_view(dest,4) += k_pbc.h_view(source,4);
k_pbc.h_view(dest,5) += k_pbc.h_view(source,5);
k_g2l.h_view(dest) = k_g2l.h_view(source);
}
}
}
k_swap.modify<LMPHostType>();
k_swap2.modify<LMPHostType>();
k_pbc.modify<LMPHostType>();
}
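
The source >= 0 branch in copy_swap_info handles ghosts built from other ghosts: the intermediate ghost's periodic shift is folded into the new entry and g2l is chased back to the real local owner, so the fused pack always copies straight from a local atom. A one-dimensional sketch of that composition:

#include <cstdio>
#include <vector>

int main() {
  const int nlocal = 2;
  // dest ghost 0 comes from local atom 1 with shift +1 in x;
  // dest ghost 1 comes from ghost 0 (sendlist entry nlocal+0) with shift +1.
  std::vector<int> g2l(2), shift(2);
  int sendlist[2] = {1, nlocal + 0};
  int pbc_x[2] = {1, 1};

  for (int dest = 0; dest < 2; dest++) {
    int source = sendlist[dest] - nlocal;      // >= 0 means "was a ghost"
    shift[dest] = pbc_x[dest];
    g2l[dest] = sendlist[dest];
    if (source >= 0) {
      shift[dest] += shift[source];            // compose periodic shifts
      g2l[dest] = g2l[source];                 // chase back to the owner
    }
  }
  // ghost 1 resolves to local atom 1 with a total shift of 2 box lengths
  std::printf("ghost 1 -> local %d, shift %d\n", g2l[1], shift[1]);
}
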
/* ----------------------------------------------------------------------
realloc the size of the send buffer as needed with BUFFACTOR and bufextra
if flag = 1, realloc

Some files were not shown because too many files have changed in this diff