more changes to doc pages and CMakeLists.txt

2018-08-14 15:44:25 -06:00
parent 64d539d9d2
commit 7ec52784cb
35 changed files with 230 additions and 372 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -371,8 +371,8 @@ if(PKG_USER-NETCDF)
 endif()

 if(PKG_USER-SMD)
-  option(DOWNLOAD_Eigen3 "Download Eigen3 (instead of using the system's one)" OFF)
-  if(DOWNLOAD_Eigen3)
+  option(DOWNLOAD_EIGEN3 "Download Eigen3 (instead of using the system's one)" OFF)
+  if(DOWNLOAD_EIGEN3)
    include(ExternalProject)
    ExternalProject_Add(Eigen3_build
      URL http://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz 
@ -385,7 +385,7 @@ if(PKG_USER-SMD)
  else()
    find_package(Eigen3)
    if(NOT Eigen3_FOUND)
-      message(FATAL_ERROR "Eigen3 not found, help CMake to find it by setting EIGEN3_INCLUDE_DIR, or set DOWNLOAD_Eigen3=ON to download it")
+      message(FATAL_ERROR "Eigen3 not found, help CMake to find it by setting EIGEN3_INCLUDE_DIR, or set DOWNLOAD_EIGEN3=ON to download it")
    endif()
  endif()
  include_directories(${EIGEN3_INCLUDE_DIR})
@ -807,16 +807,26 @@ if(PKG_GPU)
                    ${GPU_SOURCES_DIR}/fix_gpu.h
                    ${GPU_SOURCES_DIR}/fix_gpu.cpp)

-    set(GPU_API "OpenCL" CACHE STRING "API used by GPU package")
-    set_property(CACHE GPU_API PROPERTY STRINGS OpenCL CUDA)
+    set(GPU_API "opencl" CACHE STRING "API used by GPU package")
+    set_property(CACHE GPU_API PROPERTY STRINGS opencl cuda)
+    string(TOUPPER ${GPU_API} GPU_API_DEFINE)

-    set(GPU_PREC "SINGLE_DOUBLE" CACHE STRING "LAMMPS GPU precision size")
-    set_property(CACHE GPU_PREC PROPERTY STRINGS SINGLE_DOUBLE SINGLE_SINGLE DOUBLE_DOUBLE)
+    set(GPU_PREC "mixed" CACHE STRING "LAMMPS GPU precision")
+    set_property(CACHE GPU_PREC PROPERTY STRINGS double mixed single)
+    string(TOUPPER ${GPU_PREC} GPU_PREC_DEFINE)
+
+    # Translate the user-facing precision keyword (already upper-cased into
+    # GPU_PREC_DEFINE) into the suffix of the -D_<...> preprocessor define
+    # that is passed to the GPU library and CUDA kernel compile lines below.
+    if(GPU_PREC_DEFINE STREQUAL "DOUBLE")
+      set(GPU_PREC_SETTING "DOUBLE_DOUBLE")
+    elseif(GPU_PREC_DEFINE STREQUAL "MIXED")
+      set(GPU_PREC_SETTING "SINGLE_DOUBLE")
+    elseif(GPU_PREC_DEFINE STREQUAL "SINGLE")
+      set(GPU_PREC_SETTING "SINGLE_SINGLE")
+    else()
+      # Without this branch GPU_PREC_SETTING stays unset and the compile
+      # lines would receive a bare "-D_"; stop at configure time instead.
+      message(FATAL_ERROR "Invalid GPU_PREC value '${GPU_PREC}': must be double, mixed or single")
+    endif()

    file(GLOB GPU_LIB_SOURCES ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cpp)
    file(MAKE_DIRECTORY ${LAMMPS_LIB_BINARY_DIR}/gpu)

-    if(GPU_API STREQUAL "CUDA")
+    if(GPU_API_DEFINE STREQUAL "CUDA")
      find_package(CUDA REQUIRED)
      find_program(BIN2C bin2c)
      if(NOT BIN2C)
@ -824,7 +834,7 @@ if(PKG_GPU)
      endif()
      option(CUDPP_OPT "Enable CUDPP_OPT" ON)

-      set(GPU_ARCH "sm_30" CACHE STRING "LAMMPS GPU CUDA SM architecture (e.g.  sm_60)")
+      set(GPU_ARCH "30" CACHE STRING "LAMMPS GPU CUDA SM architecture (e.g. 60)")

      file(GLOB GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/*.cu ${CMAKE_CURRENT_SOURCE_DIR}/gpu/*.cu)
      list(REMOVE_ITEM GPU_LIB_CU ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_pppm.cu)
@ -838,10 +848,10 @@ if(PKG_GPU)
      endif()

      cuda_compile_cubin(GPU_GEN_OBJS ${GPU_LIB_CU} OPTIONS
-                   -DUNIX -O3 -Xptxas -v --use_fast_math -DNV_KERNEL -DUCL_CUDADR -arch=${GPU_ARCH} -D_${GPU_PREC})
+                   -DUNIX -O3 -Xptxas -v --use_fast_math -DNV_KERNEL -DUCL_CUDADR -arch=sm_${GPU_ARCH} -D_${GPU_PREC_SETTING})

      cuda_compile(GPU_OBJS ${GPU_LIB_CUDPP_CU} OPTIONS $<$<BOOL:${BUILD_SHARED_LIBS}>:-Xcompiler=-fPIC>
-                   -DUNIX -O3 -Xptxas -v --use_fast_math -DUCL_CUDADR -arch=${GPU_ARCH} -D_${GPU_PREC})
+                   -DUNIX -O3 -Xptxas -v --use_fast_math -DUCL_CUDADR -arch=sm_${GPU_ARCH} -D_${GPU_PREC_SETTING})

      foreach(CU_OBJ ${GPU_GEN_OBJS})
        get_filename_component(CU_NAME ${CU_OBJ} NAME_WE)
@ -858,7 +868,7 @@ if(PKG_GPU)
      add_library(gpu STATIC ${GPU_LIB_SOURCES} ${GPU_LIB_CUDPP_SOURCES} ${GPU_OBJS})
      target_link_libraries(gpu ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY})
      target_include_directories(gpu PRIVATE ${LAMMPS_LIB_BINARY_DIR}/gpu ${CUDA_INCLUDE_DIRS})
-      target_compile_definitions(gpu PRIVATE -D_${GPU_PREC} -DMPI_GERYON -DUCL_NO_EXIT)
+      target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -DMPI_GERYON -DUCL_NO_EXIT)
      if(CUDPP_OPT)
        target_include_directories(gpu PRIVATE ${LAMMPS_LIB_SOURCE_DIR}/gpu/cudpp_mini)
        target_compile_definitions(gpu PRIVATE -DUSE_CUDPP)
@ -872,10 +882,11 @@ if(PKG_GPU)
      target_include_directories(nvc_get_devices PRIVATE ${CUDA_INCLUDE_DIRS})


-    elseif(GPU_API STREQUAL "OpenCL")
+    elseif(GPU_API_DEFINE STREQUAL "OPENCL")
      find_package(OpenCL REQUIRED)
-      set(OCL_TUNE "GENERIC" CACHE STRING "OpenCL Device Tuning")
-      set_property(CACHE OCL_TUNE PROPERTY STRINGS INTEL FERMI KEPLER CYPRESS GENERIC)
+      set(OCL_TUNE "generic" CACHE STRING "OpenCL Device Tuning")
+      set_property(CACHE OCL_TUNE PROPERTY STRINGS intel fermi kepler cypress generic)
+      string(TOUPPER ${OCL_TUNE} OCL_TUNE_DEFINE)

      include(OpenCLUtils)
      set(OCL_COMMON_HEADERS ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_preprocessor.h ${LAMMPS_LIB_SOURCE_DIR}/gpu/lal_aux_fun1.h)
@ -897,7 +908,7 @@ if(PKG_GPU)
      add_library(gpu STATIC ${GPU_LIB_SOURCES})
      target_link_libraries(gpu ${OpenCL_LIBRARIES})
      target_include_directories(gpu PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/gpu ${OpenCL_INCLUDE_DIRS})
-      target_compile_definitions(gpu PRIVATE -D_${GPU_PREC} -D${OCL_TUNE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
+      target_compile_definitions(gpu PRIVATE -D_${GPU_PREC_SETTING} -D${OCL_TUNE_DEFINE}_OCL -DMPI_GERYON -DUCL_NO_EXIT)
      target_compile_definitions(gpu PRIVATE -DUSE_OPENCL)

      list(APPEND LAMMPS_LINK_LIBS gpu)
@ -1148,9 +1159,9 @@ if(BUILD_MPI)
 endif()
 if(PKG_GPU)
  message(STATUS "GPU Api: ${GPU_API}")
-  if(GPU_API STREQUAL "CUDA")
-    message(STATUS "GPU Arch: ${GPU_ARCH}")
-  elseif(GPU_API STREQUAL "OpenCL")
+  if(GPU_API_DEFINE STREQUAL "CUDA")
+    message(STATUS "GPU Arch: sm_${GPU_ARCH}")
+  elseif(GPU_API_DEFINE STREQUAL "OPENCL")
    message(STATUS "OCL Tune: ${OCL_TUNE}")
  endif()
  message(STATUS "GPU Precision: ${GPU_PREC}")
--- a/doc/src/Build.txt
+++ b/doc/src/Build.txt
@ -19,18 +19,21 @@ as described on the "Install"_Install.html doc page.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Build_cmake
   Build_make
   Build_link

 .. toctree::
+   :maxdepth: 1

   Build_basics
   Build_settings
   Build_package

 .. toctree::
+   :maxdepth: 1

   Build_extras

--- a/doc/src/Build_basics.txt
+++ b/doc/src/Build_basics.txt
@ -18,7 +18,6 @@ CMake and make:
 "Build the LAMMPS documentation"_#doc
 "Install LAMMPS after a build"_#install :ul

-:line
 :line

 Serial vs parallel build :h3,link(serial)
--- a/doc/src/Build_extras.txt
+++ b/doc/src/Build_extras.txt
@ -49,7 +49,6 @@ This is the list of packages that may require additional steps.
 "USER-SMD"_#user-smd,
 "USER-VTK"_#user-vtk :tb(c=6,ea=c)

-:line
 :line

 COMPRESS package :h4,link(compress)
@ -81,15 +80,15 @@ which GPU hardware to build for.

 -D GPU_API=value      # value = opencl (default) or cuda
 -D GPU_PREC=value     # precision setting
-                      # value = single or mixed (default) or double
+                      # value = double or mixed (default) or single
 -D OCL_TUNE=value     # hardware choice for GPU_API=opencl
-                      # generic (default) or intel (Intel CPU) or phi (Intel Xeon Phi) or fermi, kepler, cypress (NVIDIA)
+                      # generic (default) or intel (Intel CPU) or fermi, kepler, cypress (NVIDIA)
 -D GPU_ARCH=value     # hardware choice for GPU_API=cuda
-                      # value = sm20 (Fermi) or sm30 (Kepler) or sm50 (Maxwell) or sm60 (Pascal) or sm70 (Volta)
+                      # value = 20 (Fermi) or 30 (Kepler) or 50 (Maxwell) or 60 (Pascal) or 70 (Volta)
                      # default is Cuda-compiler dependent, but typically Fermi
 -D CUDPP_OPT=value    # optimization setting for GPU_API=cuda
                      # enables CUDA Performance Primitives Optimizations
-                      # on (default) or off :pre
+                      # yes (default) or no :pre

 [Traditional make]:

@ -119,7 +118,7 @@ Makefile.machine you start from via the -h, -a, -p, -e switches, and
 also save a copy of the new Makefile if desired:

 CUDA_HOME = where NVIDIA CUDA software is installed on your system
-CUDA_ARCH = what GPU hardware you have (see help message for details)
+CUDA_ARCH = what GPU hardware you have (same as CMake, see help message for details)
 CUDA_PRECISION = precision (double, mixed, single)
 EXTRAMAKE = which Makefile.lammps.* file to copy to Makefile.lammps :ul

@ -163,7 +162,7 @@ package?" page.

 [CMake build]:

-D DOWNLOAD_KIM=value    # download OpenKIM API v1 for build, value = off (default) or on
+-D DOWNLOAD_KIM=value    # download OpenKIM API v1 for build, value = no (default) or yes
 -D KIM_LIBRARY=path      # path to KIM shared library (only needed if a custom location) 
 -D KIM_INCLUDE_DIR=path  # path to KIM include directory (only needed if a custom location) :pre

@ -183,17 +182,65 @@ make lib-kim args="-p /usr/local/kim-api" # use an existing KIM API installation
 make lib-kim args="-p /usr/local/kim-api -a EAM_Dynamo_Ackland_W__MO_141627196590_002" # ditto but add one model or driver :pre

 :line
- 
+
 KOKKOS package :h4,link(kokkos)

 To build with this package, you must choose which hardware you want to
 build for, either CPUs (multi-threading via OpenMP) or KNLs (OpenMP)
-or GPUs (Cuda).
+or GPUs (NVIDIA Cuda).
+
+For a CMake or make build, these are the possible choices for the
+KOKKOS_ARCH settings described below.  Note that for CMake, these are
+really Kokkos variables, not LAMMPS variables.  Hence you must use
+case-sensitive values, e.g. BDW, not bdw.
+
+ARMv80 = ARMv8.0 Compatible CPU
+ARMv81 = ARMv8.1 Compatible CPU
+ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU
+BGQ = IBM Blue Gene/Q CPUs
+Power8 = IBM POWER8 CPUs
+Power9 = IBM POWER9 CPUs
+SNB = Intel Sandy/Ivy Bridge CPUs
+HSW = Intel Haswell CPUs
+BDW = Intel Broadwell Xeon E-class CPUs
+SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)
+KNC = Intel Knights Corner Xeon Phi
+KNL = Intel Knights Landing Xeon Phi
+Kepler30 = NVIDIA Kepler generation CC 3.0
+Kepler32 = NVIDIA Kepler generation CC 3.2
+Kepler35 = NVIDIA Kepler generation CC 3.5
+Kepler37 = NVIDIA Kepler generation CC 3.7
+Maxwell50 = NVIDIA Maxwell generation CC 5.0
+Maxwell52 = NVIDIA Maxwell generation CC 5.2
+Maxwell53 = NVIDIA Maxwell generation CC 5.3
+Pascal60 = NVIDIA Pascal generation CC 6.0
+Pascal61 = NVIDIA Pascal generation CC 6.1 :ul

 [CMake build]:

-TODO: how to do this, how to select CPU vs KNL vs GPU, and specify
-the particular flavor of hardware: e.g. HSW vs BWL
+For multicore CPUs using OpenMP, set these 2 variables.
+
+-D KOKKOS_ARCH=archCPU         # archCPU = CPU from list above
+-D KOKKOS_ENABLE_OPENMP=yes :pre
+
+For Intel KNLs using OpenMP, set these 2 variables:
+
+-D KOKKOS_ARCH=KNL
+-D KOKKOS_ENABLE_OPENMP=yes :pre
+
+For NVIDIA GPUs using CUDA, set these 4 variables:
+
+-D KOKKOS_ARCH="archCPU;archGPU"   # archCPU = CPU from list above that is hosting the GPU
+                                   # archGPU = GPU from list above
+-D KOKKOS_ENABLE_CUDA=yes
+-D KOKKOS_ENABLE_OPENMP=yes 
+-D CMAKE_CXX_COMPILER=wrapper      # wrapper = full path to Cuda nvcc wrapper :pre
+
+The wrapper value is the Cuda nvcc compiler wrapper provided in the
+Kokkos library: lib/kokkos/bin/nvcc_wrapper.  The setting should
+include the full path name to the wrapper, e.g.
+
+-D CMAKE_CXX_COMPILER=/home/username/lammps/lib/kokkos/bin/nvcc_wrapper :pre

 [Traditional make]:

@ -204,16 +251,7 @@ src/MAKE/OPTIONS/Makefile.kokkos* files for examples.
 For multicore CPUs using OpenMP:

 KOKKOS_DEVICES = OpenMP
-KOKKOS_ARCH = HSW :pre
-
-Possible values are:
-
-HSW for Intel Haswell
-SNB for Intel SandyBridge
-BDW for Intel Broadwell
-BGQ for IBM BlueGene Q
-Power7 for IBM
-Power8 for IBM :ul
+KOKKOS_ARCH = archCPU      # archCPU = CPU from list above :pre

 For Intel KNLs using OpenMP:

@ -223,8 +261,8 @@ KOKKOS_ARCH = KNL :pre
 For NVIDIA GPUs using CUDA:

 KOKKOS_DEVICES = Cuda
-KOKKOS_ARCH = Pascal60,Power8     # P100 hosted by an IBM Power8, etc
-KOKKOS_ARCH = Kepler37,Power8     # K80 hosted by an IBM Power8, etc :pre
+KOKKOS_ARCH = archCPU,archGPU    # archCPU = CPU from list above that is hosting the GPU
+                                 # archGPU = GPU from list above :pre

 For GPUs, you also need these 2 lines in your Makefile.machine before
 the CC line is defined, in this case for use with OpenMPI mpicxx.  The
@ -245,7 +283,7 @@ library.

 [CMake build]:

-D DOWNLOAD_LATTE=value    # download LATTE for build, value = off (default) or on
+-D DOWNLOAD_LATTE=value    # download LATTE for build, value = no (default) or yes
 -D LATTE_LIBRARY=path      # path to LATTE shared library (only needed if a custom location) :pre

 [Traditional make]:
@ -320,7 +358,7 @@ lib/mscg/README and MSCG/Install files for more details.

 [CMake build]:

-D DOWNLOAD_MSCG=value    # download MSCG for build, value = off (default) or on
+-D DOWNLOAD_MSCG=value    # download MSCG for build, value = no (default) or yes
 -D MSCG_LIBRARY=path      # path to MSCG shared library (only needed if a custom location) 
 -D MSCG_INCLUDE_DIR=path  # path to MSCG include directory (only needed if a custom location) :pre

@ -400,12 +438,12 @@ lib/python/README for more details.

 -D PYTHON_EXECUTABLE=path   # path to Python executable to use :pre

-Without this setting, CMake will you your system default Python.  To
-use a different Python version, you can either create a virtualenv,
-activate it and then run cmake.  Or you can set the PYTHON_EXECUTABLE
-variable to specify which Python interpreter should be used.  Note
-note that you will also need to have the development headers installed
-for this version, e.g. python2-devel.
+Without this setting, CMake will use the default Python on your
+system.  To use a different Python version, you can either create a
+virtualenv, activate it and then run cmake.  Or you can set the
+PYTHON_EXECUTABLE variable to specify which Python interpreter should
+be used.  Note that you will also need to have the development
+headers installed for this version, e.g. python2-devel.

 [Traditional make]:

@ -464,7 +502,7 @@ library"_voro_home.

 [CMake build]:

-D DOWNLOAD_VORO=value    # download Voro++ for build, value = off (default) or on
+-D DOWNLOAD_VORO=value    # download Voro++ for build, value = no (default) or yes
 -D VORO_LIBRARY=path      # (only needed if at custom location) path to VORO shared library
 -D VORO_INCLUDE_DIR=path  # (only needed if at custom location) path to VORO include directory :pre

@ -486,7 +524,6 @@ created in lib/voronoi to point to the Voro++ src dir.  When LAMMPS
 builds in src it will use these links.  You should not need to edit
 the lib/voronoi/Makefile.lammps file.

-:line
 :line

 USER-ATC package :h4,link(user-atc)
@ -642,15 +679,16 @@ USER-INTEL package :h4,link(user-intel)

 To build with this package, you must choose which hardware you want to
 build for, either Intel CPUs or Intel KNLs.  You should also typically
-install the USER-OMP package, as it can be used in tandem with the
-USER-INTEL package to good effect, as explained on the "Speed
+"install the USER-OMP package"_#user-omp, as it can be used in tandem
+with the USER-INTEL package to good effect, as explained on the "Speed
 intel"_Speed_intel.html doc page.

 [CMake build]:

 -D INTEL_ARCH=value     # value = cpu (default) or knl :pre
+-D BUILD_OMP=yes        # also required to build with the USER-INTEL package :pre

-Requires an Intel compiler, Intel TBB and MKL and has to be built with "-D BUILD_OMP=on".
+Requires an Intel compiler as well as the Intel TBB and MKL libraries.

 [Traditional make]:

@ -821,22 +859,19 @@ successfully build on your system.

 USER-SMD package :h4,link(user-smd)

-To build with this package, you must download the Eigen library.
-Eigen is a template library, so you do not need to build it.
+To build with this package, you must download the Eigen3 library.
+Eigen3 is a template library, so you do not need to build it.

 [CMake build]:

-D EIGEN3_INCLUDE_DIR=path    # path to Eigen library :pre
+-D DOWNLOAD_EIGEN3=value      # download Eigen3, value = no (default) or yes
+-D EIGEN3_INCLUDE_DIR=path    # path to Eigen library (only needed if a custom location) :pre

-TODO: there is no download option for the Eigen lib?
-
-CMake will not download the Eigen library.  But once you have done
-that, a CMake build of LAMMPS with "-D PKG_USER-SMD=yes" should work.
-Set EIGEN3_INCLUDE_DIR if CMake cannot find the Eigen library.
+Set EIGEN3_INCLUDE_DIR if CMake cannot find the Eigen3 library.

 [Traditional make]:

-You can download the Eigen library manually if you prefer; follow the
+You can download the Eigen3 library manually if you prefer; follow the
 instructions in lib/smd/README.  You can also do it in one step from
 the lammps/src dir, using a command like these, which simply invoke
 the lib/smd/Install.py script with the specified args:
--- a/doc/src/Build_package.txt
+++ b/doc/src/Build_package.txt
@ -130,16 +130,16 @@ the Git or SVN repositories, no packages are pre-installed.

 [CMake shortcuts for installing many packages]:

-Instead of specifying all the CMake options via the command-line, CMake allows
-initializing the variable cache using script files. These are regular CMake
-files which can manipulate and set variables, and can also contain control flow
-constructs.
+Instead of specifying all the CMake options via the command-line,
+CMake allows initializing the variable cache using script files. These
+are regular CMake files which can manipulate and set variables, and
+can also contain control flow constructs.

-LAMMPS includes several of these files to define configuration "presets",
-similar to the options that exist for the Make based system. Using these files
-you can enable/disable portions of the available packages in LAMMPS. If you need a
-custom preset you can take one of them as a starting point and customize it to your
-needs.
+LAMMPS includes several of these files to define configuration
+"presets", similar to the options that exist for the Make based
+system. Using these files you can enable/disable portions of the
+available packages in LAMMPS. If you need a custom preset you can take
+one of them as a starting point and customize it to your needs.

 cmake -C ../cmake/presets/all_on.cmake \[OPTIONS\] ../cmake | enable all packages
 cmake -C ../cmake/presets/all_off.cmake \[OPTIONS\] ../cmake | disable all packages
@ -149,8 +149,9 @@ cmake -C ../cmake/presets/std_nolib.cmake \[OPTIONS\] ../cmake | enable standard
 cmake -C ../cmake/presets/nolib.cmake \[OPTIONS\] ../cmake | disable all packages that do not require extra libraries
 cmake -C ../cmake/presets/manual_selection.cmake \[OPTIONS\] ../cmake | example of how to create a manual selection of packages :tb(s=|,a=l)

-NOTE: Running cmake this way manipulates the variable cache in your current
-build directory. You can combine presets and options with multiple cmake runs.
+NOTE: Running cmake this way manipulates the variable cache in your
+current build directory. You can combine presets and options with
+multiple cmake runs.

 [Example:]

--- a/doc/src/Build_settings.txt
+++ b/doc/src/Build_settings.txt
@ -21,7 +21,6 @@ explain how to do this for building both with CMake and make.
 "Workaround for long long integers"_#longlong
 "Error handling exceptions"_#exceptions when using LAMMPS as a library :all(b)

-:line
 :line
 
 FFT library :h3,link(fft)
@ -38,6 +37,10 @@ LAMMPS can use them if they are available on your system.
 -D FFT_SINGLE=value       # yes or no (default), no = double precision
 -D FFT_PACK=value         # array (default) or pointer or memcpy :pre

+NOTE: The values for the FFT variable must be in upper-case.
+This is an exception to the rule that all CMake variables can
+be specified with lower-case values.
+
 Usually these settings are all that is needed.  If CMake cannot find
 the FFT library, you can set these variables:

@ -50,10 +53,11 @@ the FFT library, you can set these variables:

 [Makefile.machine settings]:

-FFT_INC = -DFFT_FFTW3         # -DFFT_FFTW3, -DFFT_FFTW2, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISSFFT
+FFT_INC = -DFFT_FFTW3         # -DFFT_FFTW3, -DFFT_FFTW2, -DFFT_FFTW (same as -DFFT_FFTW3), -DFFT_MKL, or -DFFT_KISS
                              # default is KISS if not specified
 FFT_INC = -DFFT_SINGLE        # do not specify for double precision
 FFT_INC = -DFFT_PACK_ARRAY    # or -DFFT_PACK_POINTER or -DFFT_PACK_MEMCPY :pre
+                              # default is FFT_PACK_ARRAY if not specified

 FFT_INC =    	-I/usr/local/include
 FFT_PATH =      -L/usr/local/lib
@ -84,9 +88,10 @@ pppm"_kspace_style.html command.  The "Run output"_doc page gives more
 details.

 FFTW is a fast, portable FFT library that should also work on any
-platform and can be faster than KISS FFT.  You can download it from
-"www.fftw.org"_http://www.fftw.org.  Both the (obsolete) legacy version
-2.1.X and the newer 3.X versions are supported.  
+platform and can be faster than the KISS FFT library.  You can
+download it from "www.fftw.org"_http://www.fftw.org.  Both the
+(obsolete) legacy version 2.1.X and the newer 3.X versions are
+supported.

 NOTE: FFTW2 has not been updated since 1999 and has been declared
 obsolete by its developers.
@ -148,7 +153,7 @@ adequate.
 [Makefile.machine setting]:

 LMP_INC = -DLAMMPS_SMALLBIG    # or -DLAMMPS_BIGBIG or -DLAMMPS_SMALLSMALL :pre
-
+                               # default is LAMMPS_SMALLBIG if not specified
 [CMake and make info]:

 The default "smallbig" setting allows for simulations with:
@ -298,10 +303,10 @@ aligned on 64-byte boundaries.

 [CMake variable]:

-D LAMMPS_MEMALIGN=value            # 8, 16, 32, 64 (default) :pre
+-D LAMMPS_MEMALIGN=value            # 0, 8, 16, 32, 64 (default) :pre

 Use a LAMMPS_MEMALIGN value of 0 to disable using posix_memalign()
-and revert to using the malloc() C-library function instead. When
+and revert to using the malloc() C-library function instead.  When
 compiling LAMMPS for Windows systems, malloc() will always be used
 and this setting ignored.

--- a/doc/src/Commands.txt
+++ b/doc/src/Commands.txt
@ -16,6 +16,7 @@ commands in it are used to define a LAMMPS simulation.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Commands_input
   Commands_parse
@ -23,6 +24,7 @@ commands in it are used to define a LAMMPS simulation.
   Commands_category

 .. toctree::
+   :maxdepth: 1

   Commands_all
   Commands_fix
--- a/doc/src/Errors.txt
+++ b/doc/src/Errors.txt
@ -19,6 +19,7 @@ additional details for many of them.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Errors_common
   Errors_bugs
--- a/doc/src/Install.txt
+++ b/doc/src/Install.txt
@ -20,6 +20,7 @@ need the source code.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Install_linux
   Install_mac
--- a/doc/src/Install_linux.txt
+++ b/doc/src/Install_linux.txt
@ -15,7 +15,6 @@ Binaries are available for many different versions of Linux:
 "Pre-built Ubuntu Linux executables"_#ubuntu
 "Pre-built Gentoo Linux executable"_#gentoo :all(b)

-:line
 :line

 Pre-built binary RPMs for Fedora/RedHat/CentOS/openSUSE :h4,link(rpm)
--- a/doc/src/Intro.txt
+++ b/doc/src/Intro.txt
@ -15,6 +15,7 @@ These pages provide a brief introduction to LAMMPS.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Intro_overview
   Manual_version
--- a/doc/src/Intro_authors.txt
+++ b/doc/src/Intro_authors.txt
@ -58,7 +58,6 @@ Terry Stouch (Lexicon Pharmaceuticals, formerly at Bristol Myers Squibb)
 Steve Lustig (Dupont)
 Jim Belak and Roy Pollock (LLNL) :ul

-:line
 :line

 Here is a timeline for when various individuals contributed to a new
@ -239,7 +238,7 @@ Aug11 : angle_style cosine/shift and cosine/shift/exp : Carsten Svaneborg
 Aug11 : dihedral_style cosine/shift/exp : Carsten Svaneborg
 Aug11 : pair_style dipole/sf : Mario Orsi
 Aug11 : fix addtorque and compute temp/rotate : Laurent Joly (U Lyon)
-Aug11 : FFT support via FFTW3, MKL, ACML, KISSFFT libraries : \
+Aug11 : FFT support via FFTW3, MKL, ACML, KISS FFT libraries : \
  Axel Kohlmeyer (Temple U)
 Jun11 : pair_style adp : Chris Weinberger (Sandia), Stephen Foiles (Sandia), \
  Chandra Veer Singh (Cornell)
--- a/doc/src/Manual.txt
+++ b/doc/src/Manual.txt
@ -84,7 +84,7 @@ every LAMMPS command.
   Modify
   Python
   Errors
-   Build_manual
+   Manual_build

 .. toctree::
   :caption: Index
--- a/doc/src/Manual_build.txt
+++ b/doc/src/Manual_build.txt
@ -122,4 +122,3 @@ software installed. "http://calibre-ebook.com/"_http://calibre-ebook.com/
 You first create the ePUB file with 'make epub' and then do:

 ebook-convert LAMMPS.epub LAMMPS.mobi :pre
-
--- a/doc/src/Modify.txt
+++ b/doc/src/Modify.txt
@ -24,11 +24,13 @@ contribute"_Modify_contribute.html doc page.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Modify_overview
   Modify_contribute

 .. toctree::
+   :maxdepth: 1

   Modify_atom
   Modify_pair
@ -38,6 +40,7 @@ contribute"_Modify_contribute.html doc page.
   Modify_command

 .. toctree::
+   :maxdepth: 1

   Modify_dump
   Modify_kspace
@ -46,6 +49,7 @@ contribute"_Modify_contribute.html doc page.
   Modify_body

 .. toctree::
+   :maxdepth: 1

   Modify_thermo
   Modify_variable
--- a/doc/src/Packages.txt
+++ b/doc/src/Packages.txt
@ -23,6 +23,7 @@ LAMMPS build process.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Packages_standard
   Packages_user
--- a/doc/src/Packages_details.txt
+++ b/doc/src/Packages_details.txt
@ -99,7 +99,6 @@ as contained in the file name.
 "USER-UEF"_#PKG-USER-UEF,
 "USER-VTK"_#PKG-USER-VTK :tb(c=6,ea=c)

-:line
 :line

 ASPHERE package :link(PKG-ASPHERE),h4
@ -1007,7 +1006,6 @@ lib/voronoi/README
 "compute voronoi/atom"_compute_voronoi_atom.html
 examples/voronoi :ul

-:line
 :line

 USER-ATC package :link(PKG-USER-ATC),h4
--- a/doc/src/Python.txt
+++ b/doc/src/Python.txt
@ -16,10 +16,12 @@ used together.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Python_overview

 .. toctree::
+   :maxdepth: 1

   Python_run
   Python_shlib
@ -31,6 +33,7 @@ used together.
   Python_examples

 .. toctree::
+   :maxdepth: 1

   Python_call

--- a/doc/src/Run.txt
+++ b/doc/src/Run.txt
@ -19,6 +19,7 @@ they can contain.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Run_basics
   Run_options
--- a/doc/src/Run_options.txt
+++ b/doc/src/Run_options.txt
@ -34,7 +34,6 @@ For example, the lmp_mpi executable might be launched as follows:
 mpirun -np 16 lmp_mpi -v f tmp.out -l my.log -sc none -i in.alloy
 mpirun -np 16 lmp_mpi -var f tmp.out -log my.log -screen none -in in.alloy :pre

-:line
 :line

 [-echo style] :link(echo)
--- a/doc/src/Speed.txt
+++ b/doc/src/Speed.txt
@ -31,15 +31,18 @@ hardware platforms.
 <!-- RST

 .. toctree::
+   :maxdepth: 1

   Speed_bench
   Speed_measure

 .. toctree::
+   :maxdepth: 1

   Speed_tips

 .. toctree::
+   :maxdepth: 1

   Speed_packages
   Speed_compare
--- a/doc/src/Speed_gpu.txt
+++ b/doc/src/Speed_gpu.txt
@ -43,89 +43,22 @@ same functionality can eventually be supported on a variety of GPU
 hardware. :l
 :ule

-Here is a quick overview of how to enable and use the GPU package:
-
-build the library in lib/gpu for your GPU hardware with the desired precision settings
-install the GPU package and build LAMMPS as usual
-use the mpirun command to set the number of MPI tasks/node which determines the number of MPI tasks/GPU
-specify the # of GPUs per node
-use GPU styles in your input script :ul
-
-The latter two steps can be done using the "-pk gpu" and "-sf gpu"
-"command-line switches"_Run_options.html respectively.  Or the effect
-of the "-pk" or "-sf" switches can be duplicated by adding the
-"package gpu"_package.html or "suffix gpu"_suffix.html commands
-respectively to your input script.
-
 [Required hardware/software:]

 To use this package, you currently need to have an NVIDIA GPU and
 install the NVIDIA CUDA software on your system:

-Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information
-Go to http://www.nvidia.com/object/cuda_get.html
-Install a driver and toolkit appropriate for your system (SDK is not necessary)
-Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties :ul
+Check if you have an NVIDIA GPU: cat /proc/driver/nvidia/gpus/0/information
+Go to http://www.nvidia.com/object/cuda_get.html
+Install a driver and toolkit appropriate for your system (SDK is not necessary)
+Run lammps/lib/gpu/nvc_get_devices (after building the GPU library, see below) to list supported devices and properties :ul

 [Building LAMMPS with the GPU package:]

-This requires two steps (a,b): build the GPU library, then build
-LAMMPS with the GPU package.  You can do both these steps in one line
-as described on the "Packages details"_Packages_details.html#GPU doc
-page.
-
-Or you can follow these two (a,b) steps:
-
-(a) Build the GPU library
-
-The GPU library is in lammps/lib/gpu.  Select a Makefile.machine (in
-lib/gpu) appropriate for your system.  You should pay special
-attention to 3 settings in this makefile.
-
-CUDA_HOME = needs to be where NVIDIA CUDA software is installed on your system
-CUDA_ARCH = needs to be appropriate to your GPUs
-CUDA_PREC = precision (double, mixed, single) you desire :ul
-
-See lib/gpu/Makefile.linux.double for examples of the ARCH settings
-for different GPU choices, e.g. Fermi vs Kepler.  It also lists the
-possible precision settings:
-
-CUDA_PREC = -D_SINGLE_SINGLE  # single precision for all calculations
-CUDA_PREC = -D_DOUBLE_DOUBLE  # double precision for all calculations
-CUDA_PREC = -D_SINGLE_DOUBLE  # accumulation of forces, etc, in double :pre
-
-The last setting is the mixed mode referred to above.  Note that your
-GPU must support double precision to use either the 2nd or 3rd of
-these settings.
-
-To build the library, type:
-
-make -f Makefile.machine :pre
-
-If successful, it will produce the files libgpu.a and Makefile.lammps.
-
-The latter file has 3 settings that need to be appropriate for the
-paths and settings for the CUDA system software on your machine.
-Makefile.lammps is a copy of the file specified by the EXTRAMAKE
-setting in Makefile.machine.  You can change EXTRAMAKE or create your
-own Makefile.lammps.machine if needed.
-
-Note that to change the precision of the GPU library, you need to
-re-build the entire library.  Do a "clean" first, e.g. "make -f
-Makefile.linux clean", followed by the make command above.
-
-(b) Build LAMMPS with the GPU package
-
-cd lammps/src
-make yes-gpu
-make machine :pre
-
-No additional compile/link flags are needed in Makefile.machine.
-
-Note that if you change the GPU library precision (discussed above)
-and rebuild the GPU library, then you also need to re-install the GPU
-package and re-build LAMMPS, so that all affected files are
-re-compiled and linked to the new GPU library.
+See the "Build extras"_Build_extras.html#gpu doc page for
+instructions.

 [Run with the GPU package from the command line:]

--- a/doc/src/Speed_intel.txt
+++ b/doc/src/Speed_intel.txt
@ -203,16 +203,12 @@ cat /proc/cpuinfo :pre

 [Building LAMMPS with the USER-INTEL package:]

-NOTE: See the src/USER-INTEL/README file for additional flags that
-might be needed for best performance on Intel server processors
-code-named "Skylake".
+See the "Build extras"_Build_extras.html#user-intel doc page for
+instructions.  Some additional details are covered here.

-The USER-INTEL package must be installed into the source directory:
-
-make yes-user-intel :pre
-
-Several example Makefiles for building with the Intel compiler are
-included with LAMMPS in the src/MAKE/OPTIONS/ directory:
+For building with make, several example Makefiles for building with
+the Intel compiler are included with LAMMPS in the src/MAKE/OPTIONS/
+directory:

 Makefile.intel_cpu_intelmpi # Intel Compiler, Intel MPI, No Offload
 Makefile.knl                # Intel Compiler, Intel MPI, No Offload
@ -221,20 +217,16 @@ Makefile.intel_cpu_openpmi  # Intel Compiler, OpenMPI, No Offload
 Makefile.intel_coprocessor  # Intel Compiler, Intel MPI, Offload :pre

 Makefile.knl is identical to Makefile.intel_cpu_intelmpi except that
-it explicitly specifies that vectorization should be for Intel
-Xeon Phi x200 processors making it easier to cross-compile. For
-users with recent installations of Intel Parallel Studio, the
-process can be as simple as:
+it explicitly specifies that vectorization should be for Intel Xeon
+Phi x200 processors making it easier to cross-compile. For users with
+recent installations of Intel Parallel Studio, the process can be as
+simple as:

 make yes-user-intel
 source /opt/intel/parallel_studio_xe_2016.3.067/psxevars.sh
 # or psxevars.csh for C-shell
 make intel_cpu_intelmpi :pre

-Alternatively this can be done as a single command with suitable make
-command invocations, as described on the "Packages
-details"_Packages_details.html#USER-INTEL doc page.
-
 Note that if you build with support for a Phi coprocessor, the same
 binary can be used on nodes with or without coprocessors installed.
 However, if you do not have coprocessors on your system, building
@ -253,6 +245,10 @@ required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
 recommended CCFLAG options for best performance are "-O2 -fno-alias
 -ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div".

+NOTE: See the src/USER-INTEL/README file for additional flags that
+might be needed for best performance on Intel server processors
+code-named "Skylake".
+
 NOTE: The vectorization and math capabilities can differ depending on
 the CPU. For Intel compilers, the "-x" flag specifies the type of
 processor for which to optimize. "-xHost" specifies that the compiler
--- a/doc/src/Speed_kokkos.txt
+++ b/doc/src/Speed_kokkos.txt
@ -37,101 +37,29 @@ task). These are Serial (MPI-only for CPUs and Intel Phi), OpenMP
 GPUs). You choose the mode at build time to produce an executable
 compatible with specific hardware.

-[Building LAMMPS with the KOKKOS package:]
-
 NOTE: Kokkos support within LAMMPS must be built with a C++11 compatible
 compiler. This means GCC version 4.7.2 or later, Intel 14.0.4 or later, or
 Clang 3.5.2 or later is required.

-The recommended method of building the KOKKOS package is to start with
-the provided Kokkos Makefiles in /src/MAKE/OPTIONS/. You may need to
-modify the KOKKOS_ARCH variable in the Makefile to match your specific
-hardware. For example:
-
-for Sandy Bridge CPUs, set KOKKOS_ARCH=SNB
-for Broadwell CPUs, set KOKKOS_ARCH=BWD
-for K80 GPUs, set KOKKOS_ARCH=Kepler37
-for P100 GPUs and Power8 CPUs, set KOKKOS_ARCH=Pascal60,Power8 :ul
-
-See the [Advanced Kokkos Options] section below for a listing of all
-KOKKOS_ARCH options.
-
-[Compile for CPU-only (MPI only, no threading):]
-
-use a C++11 compatible compiler and set KOKKOS_ARCH variable in
-/src/MAKE/OPTIONS/Makefile.kokkos_mpi_only as described above. Then do the
-following:
-
-cd lammps/src
-make yes-kokkos
-make kokkos_mpi_only :pre
-
-[Compile for CPU-only (MPI plus OpenMP threading):]
-
-NOTE: To build with Kokkos support for OpenMP threading, your compiler
-must support the OpenMP interface. You should have one or more
-multi-core CPUs so that multiple threads can be launched by each MPI
-task running on a CPU.
-
-Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
-/src/MAKE/OPTIONS/Makefile.kokkos_omp as described above.  Then do the
-following:
-
-cd lammps/src
-make yes-kokkos
-make kokkos_omp :pre
-
-[Compile for Intel KNL Xeon Phi (Intel Compiler, OpenMPI):]
-
-use a C++11 compatible compiler and do the following:
-
-cd lammps/src
-make yes-kokkos
-make kokkos_phi :pre
-
-[Compile for CPUs and GPUs (with OpenMPI or MPICH):]
-
 NOTE: To build with Kokkos support for NVIDIA GPUs, NVIDIA CUDA
 software version 7.5 or later must be installed on your system. See
 the discussion for the "GPU package"_Speed_gpu.html for details of how
 to check and do this.

 NOTE: Kokkos with CUDA currently implicitly assumes, that the MPI
-library is CUDA-aware and has support for GPU-direct. This is not always
-the case, especially when using pre-compiled MPI libraries provided by
-a Linux distribution. This is not a problem when using only a single
-GPU and a single MPI rank on a desktop. When running with multiple
-MPI ranks, you may see segmentation faults without GPU-direct support.
-These can be avoided by adding the flags
-"-pk kokkos gpu/direct off"_Run_options.html
-to the LAMMPS command line or by using the command
-"package kokkos gpu/direct off"_package.html in the input file.
+library is CUDA-aware and has support for GPU-direct. This is not
+always the case, especially when using pre-compiled MPI libraries
+provided by a Linux distribution. This is not a problem when using
+only a single GPU and a single MPI rank on a desktop. When running
+with multiple MPI ranks, you may see segmentation faults without
+GPU-direct support.  These can be avoided by adding the flags "-pk
+kokkos gpu/direct off"_Run_options.html to the LAMMPS command line or
+by using the command "package kokkos gpu/direct off"_package.html in
+the input file.

-Use a C++11 compatible compiler and set KOKKOS_ARCH variable in
-/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi for both GPU and CPU as
-described above.  Then do the following:
+[Building LAMMPS with the KOKKOS package:]

-cd lammps/src
-make yes-kokkos
-make kokkos_cuda_mpi :pre
-
-[Alternative Methods of Compiling:]
-
-Alternatively, the KOKKOS package can be built by specifying Kokkos variables
-on the make command line. For example:
-
-make mpi KOKKOS_DEVICES=OpenMP KOKKOS_ARCH=SNB     # set the KOKKOS_DEVICES and KOKKOS_ARCH variable explicitly
-make kokkos_cuda_mpi KOKKOS_ARCH=Pascal60,Power8   # set the KOKKOS_ARCH variable explicitly :pre
-
-Setting the KOKKOS_DEVICES and KOKKOS_ARCH variables on the make
-command line requires a GNU-compatible make command. Try "gmake" if
-your system's standard make complains.
-
-NOTE: If you build using make line variables and re-build LAMMPS twice
-with different KOKKOS options and the *same* target, then you *must*
-perform a "make clean-all" or "make clean-machine" before each
-build. This is to force all the KOKKOS-dependent files to be
-re-compiled with the new options.
+See the "Build extras"_Build_extras.html#kokkos doc page for instructions.

 [Running LAMMPS with the KOKKOS package:]

@ -411,50 +339,18 @@ hardware.
 [Advanced Kokkos options:]

 There are other allowed options when building with the KOKKOS package.
-As above, they can be set either as variables on the make command line
-or in Makefile.machine. This is the full list of options, including
-those discussed above. Each takes a value shown below. The default
-value is listed, which is set in the /lib/kokkos/Makefile.kokkos file.
+As explained on the "Build extras"_Build_extras.html#kokkos doc page,
+they can be set either as variables on the make command line or in
+Makefile.machine, or they can be specified as CMake variables.  Each
+takes a value shown below.  The default value is listed, which is set
+in the lib/kokkos/Makefile.kokkos file.

-KOKKOS_DEVICES, values = {Serial}, {OpenMP}, {Pthreads}, {Cuda}, default = {OpenMP}
-KOKKOS_ARCH, values = {KNC}, {SNB}, {HSW}, {Kepler30}, {Kepler32}, {Kepler35}, {Kepler37}, {Maxwell50}, {Maxwell52}, {Maxwell53}, {Pascal60}, {Pascal61}, {ARMv80}, {ARMv81}, {ARMv81}, {ARMv8-ThunderX}, {BGQ}, {Power7}, {Power8}, {Power9}, {KNL}, {BDW}, {SKX}, default = {none}
 KOKKOS_DEBUG, values = {yes}, {no}, default = {no}
 KOKKOS_USE_TPLS, values = {hwloc}, {librt}, {experimental_memkind}, default = {none}
 KOKKOS_CXX_STANDARD, values = {c++11}, {c++1z}, default = {c++11}
 KOKKOS_OPTIONS, values = {aggressive_vectorization}, {disable_profiling}, default = {none}
 KOKKOS_CUDA_OPTIONS, values = {force_uvm}, {use_ldg}, {rdc}, {enable_lambda}, default = {enable_lambda} :ul

-KOKKOS_DEVICES sets the parallelization method used for Kokkos code
-(within LAMMPS). KOKKOS_DEVICES=Serial means that no threading will be used.
-KOKKOS_DEVICES=OpenMP means that OpenMP threading will be
-used. KOKKOS_DEVICES=Pthreads means that pthreads will be used.
-KOKKOS_DEVICES=Cuda means an NVIDIA GPU running CUDA will be used.
-
-KOKKOS_ARCH enables compiler switches needed when compiling for a
-specific hardware:
-
-ARMv80 = ARMv8.0 Compatible CPU
-ARMv81 = ARMv8.1 Compatible CPU
-ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU
-SNB = Intel Sandy/Ivy Bridge CPUs
-HSW = Intel Haswell CPUs
-BDW = Intel Broadwell Xeon E-class CPUs
-SKX = Intel Sky Lake Xeon E-class HPC CPUs (AVX512)
-KNC = Intel Knights Corner Xeon Phi
-KNL = Intel Knights Landing Xeon Phi
-Kepler30 = NVIDIA Kepler generation CC 3.0
-Kepler32 = NVIDIA Kepler generation CC 3.2
-Kepler35 = NVIDIA Kepler generation CC 3.5
-Kepler37 = NVIDIA Kepler generation CC 3.7
-Maxwell50 = NVIDIA Maxwell generation CC 5.0
-Maxwell52 = NVIDIA Maxwell generation CC 5.2
-Maxwell53 = NVIDIA Maxwell generation CC 5.3
-Pascal60 = NVIDIA Pascal generation CC 6.0
-Pascal61 = NVIDIA Pascal generation CC 6.1
-BGQ = IBM Blue Gene/Q CPUs
-Power8 = IBM POWER8 CPUs
-Power9 = IBM POWER9 CPUs :ul
-
 KOKKOS_USE_TPLS=hwloc binds threads to hardware cores, so they do not
 migrate during a simulation. KOKKOS_USE_TPLS=hwloc should always be
 used if running with KOKKOS_DEVICES=Pthreads for pthreads. It is not
--- a/doc/src/Speed_omp.txt
+++ b/doc/src/Speed_omp.txt
@ -16,18 +16,6 @@ improper), several Kspace styles, and a few fix styles.  It uses
 the OpenMP interface for multi-threading, but can also be compiled
 without OpenMP support, providing optimized serial styles in that case.

-Here is a quick overview of how to use the USER-OMP package, assuming
-one or more 16-core nodes.  More details follow.
-
-make yes-user-omp
-make omp                                   # Makefile.omp already has OpenMP settings for GNU compilers
-make mpi                                   # or build with USER-OMP package without OpenMP :pre
-
-env OMP_NUM_THREADS=16 lmp_omp -sf omp -in in.script           # 1 MPI task, 16 threads according to OMP_NUM_THREADS
-lmp_mpi -sf omp -in in.script                                  # 1 MPI task, no threads, optimized kernels
-mpirun -np 4 lmp_omp -sf omp -pk omp 4 -in in.script           # 4 MPI tasks, 4 threads/task
-mpirun -np 32 -ppn 4 lmp_omp -sf omp -pk omp 4 -in in.script   # 8 nodes, 4 MPI tasks/node, 4 threads/task :pre
-
 [Required hardware/software:]

 To enable multi-threading, your compiler must support the OpenMP interface.
@ -36,18 +24,18 @@ launched by each MPI task on the local node (using shared memory).

 [Building LAMMPS with the USER-OMP package:]

-The lines above illustrate how to include/build with the USER-OMP
-package in two steps, using the "make" command.  Or how to do it with
-one command as described on the "Packages
-details"_Packages_details.html#USER-OMP doc page.
-
-Note that the CCFLAGS and LINKFLAGS settings in Makefile.machine must
-include "-fopenmp" for the GNU compilers.  If you use an Intel compiler,
-the corresponding flag is "-qopenmp" and the CCFLAGS setting must also
-include "-restrict".
+See the "Build extras"_Build_extras.html#user-omp doc page for
+instructions.

 [Run with the USER-OMP package from the command line:]

+These examples assume one or more 16-core nodes.
+
+env OMP_NUM_THREADS=16 lmp_omp -sf omp -in in.script           # 1 MPI task, 16 threads according to OMP_NUM_THREADS
+lmp_mpi -sf omp -in in.script                                  # 1 MPI task, no threads, optimized kernels
+mpirun -np 4 lmp_omp -sf omp -pk omp 4 -in in.script           # 4 MPI tasks, 4 threads/task
+mpirun -np 32 -ppn 4 lmp_omp -sf omp -pk omp 4 -in in.script   # 8 nodes, 4 MPI tasks/node, 4 threads/task :pre
+
 The mpirun or mpiexec command sets the total number of MPI tasks used
 by LAMMPS (one or multiple per compute node) and the number of MPI
 tasks used per node.  E.g. the mpirun command in MPICH does this via
--- a/doc/src/Speed_opt.txt
+++ b/doc/src/Speed_opt.txt
@ -15,34 +15,21 @@ Technologies).  It contains a handful of pair styles whose compute()
 methods were rewritten in C++ templated form to reduce the overhead
 due to if tests and other conditional code.

-Here is a quick overview of how to use the OPT package.  More details
-follow.
-
-make yes-opt
-make mpi                               # build with the OPT package :pre
-
-lmp_mpi -sf opt -in in.script                # run in serial
-mpirun -np 4 lmp_mpi -sf opt -in in.script   # run in parallel :pre
-
 [Required hardware/software:]

 None.

 [Building LAMMPS with the OPT package:]

-The lines above illustrate how to build LAMMPS with the OPT package in
-two steps, using the "make" command.  Or how to do it with one command
-as described on the "Packages details"_Packages_details.html#OPT doc
-page.
-
-Note that if you use an Intel compiler to build with the OPT package,
-the CCFLAGS setting in your Makefile.machine must include "-restrict".
+See the "Build extras"_Build_extras.html#opt doc page for instructions.

 [Run with the OPT package from the command line:]

-As in the lines above, use the "-sf opt" "command-line
-switch"_Run_options.html, which will automatically append "opt" to
-styles that support it.
+lmp_mpi -sf opt -in in.script                # run in serial
+mpirun -np 4 lmp_mpi -sf opt -in in.script   # run in parallel :pre
+
+Use the "-sf opt" "command-line switch"_Run_options.html, which will
+automatically append "opt" to styles that support it.

 [Or run with the OPT package by editing an input script:]

--- a/doc/src/Tools.txt
+++ b/doc/src/Tools.txt
@ -74,7 +74,6 @@ own sub-directories with their own Makefiles and/or README files.
 "vim"_#vim
 "xmgrace"_#xmgrace :ul

-:line
 :line

 amber2lmp tool :h3,link(amber)
--- a/doc/src/compute_chunk_atom.txt
+++ b/doc/src/compute_chunk_atom.txt
@ -134,7 +134,6 @@ timesteps it specifies, while it accumulates per-chunk averages.

 The details are described below.

-:line
 :line

 The different chunk styles operate as follows.  For each style, how it
@ -294,7 +293,6 @@ invoke other computes, fixes, or variables when they are evaluated, so
 this is a very general means of generating per-atom quantities to
 treat as a chunk ID.

-:line
 :line

 Normally, {Nchunk} = the number of chunks, is re-calculated every time
@ -322,7 +320,6 @@ the same compute chunk/atom compute.  However, the time windows they
 induce for holding {Nchunk} constant must be identical, else an error
 will be generated.

-:line
 :line

 The various optional keywords operate as follows.  Note that some of
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@ -133,7 +133,6 @@ dump_modify option below is valid for the {atom} style, it is also
 valid for the {atom/mpiio} style, and similarly for the other styles
 which allow for use of MPI-IO.

-:line
 :line

 These keywords apply to various dump styles, including the "dump
@ -629,7 +628,6 @@ the coordinate would be if it had not been wrapped back into the
 periodic box.  Note that these coordinates may thus be far outside the
 box size stored with the snapshot.

-:line
 :line

 These keywords apply only to the "dump image"_dump_image.html and
@ -894,7 +892,6 @@ frame rate higher than 24 is not recommended, as it will result in
 simply dropping the rendered images. It is more efficient to dump
 images less frequently.

-:line
 :line

 [Restrictions:] none
--- a/doc/src/fix_box_relax.txt
+++ b/doc/src/fix_box_relax.txt
@ -126,8 +126,6 @@ minimizer from the new adjusted box size/shape, since that creates a
 new objective function valid for the new box size/shape.  Repeat as
 necessary until the box size/shape has reached its new equilibrium.

-:line
-:line
 :line

 The {couple} keyword allows two or three of the diagonal components of
--- a/lib/gpu/Install.py
+++ b/lib/gpu/Install.py
@ -26,12 +26,13 @@ optionally copies Makefile.auto to a new Makefile.osuffix
  -h = set CUDA_HOME variable in Makefile.auto to hdir
       hdir = path to NVIDIA Cuda software, e.g. /usr/local/cuda
  -a = set CUDA_ARCH variable in Makefile.auto to arch
-       use arch = 20 for Tesla C2050/C2070 (Fermi) (deprecated as of CUDA 8.0)
+       use arch = 20 for Fermi (C2050/C2070, deprecated as of CUDA 8.0)
                     or GeForce GTX 580 or similar
-       use arch = 30 for Tesla K10 (Kepler)
-       use arch = 35 for Tesla K40 (Kepler) or GeForce GTX Titan or similar
-       use arch = 37 for Tesla dual K80 (Kepler)
-       use arch = 60 for Tesla P100 (Pascal)
+       use arch = 30 for Kepler (K10)
+       use arch = 35 for Kepler (K40) or GeForce GTX Titan or similar
+       use arch = 37 for Kepler (dual K80)
+       use arch = 60 for Pascal (P100)
+       use arch = 70 for Volta
  -p = set CUDA_PRECISION variable in Makefile.auto to precision
       use precision = double or mixed or single
  -e = set EXTRAMAKE variable in Makefile.auto to Makefile.lammps.esuffix
--- a/src/KSPACE/fft3d.cpp
+++ b/src/KSPACE/fft3d.cpp
@ -14,7 +14,7 @@
 /* ----------------------------------------------------------------------
   Contributing authors: Jim Shepherd (GA Tech) added SGI SCSL support
                         Axel Kohlmeyer (Temple U) added support for
-                         FFTW3, KISSFFT, Dfti/MKL, and ACML.
+                         FFTW3, KISS FFT, Dfti/MKL, and ACML.
                         Phil Blood (PSC) added single precision FFT.
                         Paul Coffman (IBM) added MPI collectives remap
 ------------------------------------------------------------------------- */
@ -26,7 +26,7 @@
 #include "fft3d.h"
 #include "remap.h"

-#ifdef FFT_KISSFFT
+#ifdef FFT_KISS
 /* include kissfft implementation */
 #include "kissfft.h"
 #endif
--- a/src/KSPACE/fft3d.h
+++ b/src/KSPACE/fft3d.h
@ -24,8 +24,8 @@ typedef float FFT_SCALAR;
 typedef double FFT_SCALAR;
 #endif

-
 // set default fftw library. switch to FFT_FFTW3 when convenient.
+
 #ifdef FFT_FFTW
 #define FFT_FFTW3
 #endif
@ -57,8 +57,9 @@ typedef fftwf_complex FFT_DATA;
 #else

 /* use a stripped down version of kiss fft as default fft */
-#ifndef FFT_KISSFFT
-#define FFT_KISSFFT
+
+#ifndef FFT_KISS
+#define FFT_KISS
 #endif
 #define kiss_fft_scalar float
 typedef struct {
@ -97,8 +98,8 @@ typedef fftw_complex FFT_DATA;
 #else

 /* use a stripped down version of kiss fft as default fft */
-#ifndef FFT_KISSFFT
-#define FFT_KISSFFT
+#ifndef FFT_KISS
+#define FFT_KISS
 #endif
 #define kiss_fft_scalar double
 typedef struct {
@ -152,7 +153,7 @@ struct fft_plan_3d {
  FFTW_API(plan) plan_mid_backward;
  FFTW_API(plan) plan_slow_forward;
  FFTW_API(plan) plan_slow_backward;
-#elif defined(FFT_KISSFFT)
+#elif defined(FFT_KISS)
  kiss_fft_cfg cfg_fast_forward;
  kiss_fft_cfg cfg_fast_backward;
  kiss_fft_cfg cfg_mid_forward;
--- a/src/KSPACE/kissfft.h
+++ b/src/KSPACE/kissfft.h
@ -13,6 +13,7 @@

   changes 2008-2011 by Axel Kohlmeyer <akohlmey@gmail.com>
 */
+
 #ifndef LMP_FFT_KISSFFT
 #define LMP_FFT_KISSFFT

--- a/src/pack.h
+++ b/src/pack.h
@ -22,9 +22,8 @@ struct pack_plan_3d {
  int nqty;                  // # of values/element
 };

-
-#if !defined(PACK_POINTER) && !defined(PACK_MEMCPY)
-#define PACK_ARRAY
+#if !defined(FFT_PACK_POINTER) && !defined(FFT_PACK_MEMCPY)
+#define FFT_PACK_ARRAY
 #endif

 #ifndef PACK_DATA
@ -47,7 +46,7 @@ struct pack_plan_3d {
   pack/unpack with array indices
 ------------------------------------------------------------------------- */

-#ifdef PACK_ARRAY
+#ifdef FFT_PACK_ARRAY

 /* ----------------------------------------------------------------------
   pack from data -> buf
@ -274,7 +273,7 @@ static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_pl
   pack/unpack with pointers
 ------------------------------------------------------------------------- */

-#ifdef PACK_POINTER
+#ifdef FFT_PACK_POINTER

 /* ----------------------------------------------------------------------
   pack from data -> buf
@ -523,7 +522,7 @@ static void unpack_3d_permute2_n(PACK_DATA *buf, PACK_DATA *data, struct pack_pl
     just use PACK_POINTER versions
 ------------------------------------------------------------------------- */

-#ifdef PACK_MEMCPY
+#ifdef FFT_PACK_MEMCPY

 /* ----------------------------------------------------------------------
   pack from data -> buf