Compare commits

...

59 Commits

Author SHA1 Message Date
cf79751f4f Merge pull request #1207 from akohlmey/next-patch-release
Patch release 15 November 2018
2018-11-15 19:33:52 -05:00
e4dee3de17 Merge pull request #1206 from akohlmey/collected-small-changes
Collected small changes for next release
2018-11-15 17:29:26 -05:00
6e225d90fc fix some minor bugs with data file writing and remove dead code and silence compiler warnings 2018-11-15 16:50:56 -05:00
1fc3b4618c remove dead code and silence compiler warnings 2018-11-15 16:50:56 -05:00
eae9d27f6d OpenMP support from the compiler is not a requirement for USER-OMP. Without OpenMP, it is like the OPT package but for many more styles, so it is still useful and should be supported. 2018-11-15 16:50:56 -05:00
db29ec7eee complete workflow document 2018-11-15 14:58:02 -05:00
090778c42b Merge pull request #1204 from lammps/doc-plumed
Linkage mode improvements and documentation updates for USER-PLUMED package
2018-11-15 13:48:58 -05:00
db935dba5e Merge pull request #1201 from junghans/cmake_doc
cmake: update internal doc about how cmake finds executables
2018-11-15 13:48:17 -05:00
e160376365 incomplete first draft. committed for checking the markup in github. 2018-11-15 12:45:15 -05:00
d5f222464b Update README.md 2018-11-15 09:48:46 -07:00
4d9e2a014b add detailed build instructions and discussion of linkage modes for PLUMED library and USER-PLUMED package 2018-11-15 11:35:04 -05:00
8a4983e4bc reformatting and simplification of fix plumed docs 2018-11-15 10:53:38 -05:00
82d6aa9add interlink fixes colvars, plumed, and smd 2018-11-15 10:52:41 -05:00
4231ab3d57 correct some links 2018-11-15 10:52:03 -05:00
25914ea3f3 patch 15Nov2018 2018-11-15 10:17:25 -05:00
003bb28471 make @gtribello code owner of the USER-PLUMED package 2018-11-14 22:17:25 -05:00
a557644939 support all three plumed linkage modes with CMake as well. For downloaded and previously installed plumed lib 2018-11-14 22:13:18 -05:00
04520e627d add code and scripts to support all three plumed linkage modes with fix plumed for conventional build 2018-11-14 21:26:36 -05:00
952e52982e add comment to indicate code intended for backward compatibility only 2018-11-14 05:37:59 -05:00
a942d8b3ba use memset() for clearing of arrays 2018-11-14 05:30:23 -05:00
7a22b8aa62 check only in currently added data file atoms for dihedral overflow 2018-11-14 05:29:26 -05:00
4c1fbc359a use tagint when unpacking atom tags from communication buffers 2018-11-14 05:28:19 -05:00
2c644c5f2e Merge pull request #1197 from akohlmey/collected-small-fixes
Collection of small changes and bugfixes for the next release
2018-11-13 15:18:09 -05:00
b1186a971e Merge pull request #1202 from lammps/hyper
Add Hyper-dynamics to REPLICA package
2018-11-13 15:17:30 -05:00
2dbd575a4b Merge pull request #1203 from stanmoore1/kk_update
Update Kokkos library in LAMMPS to v2.7.24
2018-11-13 15:15:09 -05:00
4805e1df22 doc page additions for USER-PLUMED package 2018-11-13 08:29:07 -07:00
380f0e4971 remove some debugging code 2018-11-13 08:06:40 -07:00
a026ce9669 correct broken links detected by make mobi 2018-11-12 21:38:26 -05:00
7e779d16de correct broken links in manual reported by 'make html' 2018-11-12 21:33:37 -05:00
b776f0f29f remove dead code and silence warnings about unused parameters 2018-11-12 21:11:55 -05:00
443644025f silence compiler warnings 2018-11-12 20:50:14 -05:00
c4c90a96ec avoid void return from non-void function 2018-11-12 20:49:01 -05:00
5cb2463204 c++ style include files do not have a .h extension 2018-11-12 20:33:30 -05:00
5a4e44b75a remove accidentally duplicated code 2018-11-12 20:27:21 -05:00
0ca02b6f41 added new commands to doc pages, fixed a few missing entries as well 2018-11-12 17:23:15 -07:00
2b96dfd6cc Remove deprecated Kokkos code 2018-11-12 15:49:31 -07:00
c22c6e4d34 Add LAMMPS changes to Kokkos Makefile 2018-11-12 15:30:14 -07:00
b2d67bcbb5 Remove tpls dir 2018-11-12 15:18:06 -07:00
b3f08b38a2 Update Kokkos library in LAMMPS to v2.7.24 2018-11-12 15:16:26 -07:00
8e9d4f5bce modify bond style hybrid, so it can handle bond style quartic as a sub-style 2018-11-12 16:06:55 -05:00
fe07ad279d added NULL declarations to constructor, removed debug code 2018-11-12 12:32:54 -07:00
5062c43aea rename example outputs 2018-11-12 12:32:53 -07:00
90caf0019c fix doc page errors 2018-11-12 12:32:53 -07:00
3b7ebbb8df new hyper examples 2018-11-12 12:32:53 -07:00
d7a479d2f6 hyper example dir 2018-11-12 12:32:52 -07:00
0c8ce199af more updates to hyper docs 2018-11-12 12:32:52 -07:00
4a6f088c0b updates to hyper doc pages 2018-11-12 12:32:52 -07:00
56598fcd0b changes to prd command doc page 2018-11-12 12:32:52 -07:00
265c11dca9 more edits to hyper docs 2018-11-12 12:32:52 -07:00
d6631266ce doc files in wrong dir 2018-11-12 12:32:52 -07:00
fbd610b8a9 global/local hyperdynamics src and doc files 2018-11-12 12:32:52 -07:00
86d1304176 cmake: update doc about executables 2018-11-10 18:58:53 -07:00
f68d77c7af correct formatting 2018-11-09 08:03:58 -05:00
7a4f534676 replace non-ASCII character 2018-11-09 08:03:41 -05:00
729201ab93 fix typo reported in #1199 2018-11-09 08:03:19 -05:00
ab8215a669 remove dead code 2018-11-09 01:09:31 -05:00
fe04147ee0 fix typo 2018-11-09 01:09:22 -05:00
62b1159673 update presets for USER-PLUMED package. fix typo. 2018-11-09 01:08:57 -05:00
adeb0c2b54 replace faulty preprocessor logic
fixes #1196
2018-11-09 01:08:57 -05:00
420 changed files with 21554 additions and 2289 deletions

.github/CODEOWNERS vendored (1 line changed)

@ -29,6 +29,7 @@ src/USER-MEAMC/* @martok
src/USER-MOFFF/* @hheenen
src/USER-MOLFILE/* @akohlmey
src/USER-NETCDF/* @pastewka
src/USER-PLUMED/* @gtribello
src/USER-PHONON/* @lingtikong
src/USER-PTM/* @pmla
src/USER-OMP/* @akohlmey


@ -304,7 +304,7 @@ pkg_depends(USER-SCAFACOS MPI)
find_package(OpenMP QUIET)
option(BUILD_OMP "Build with OpenMP support" ${OpenMP_FOUND})
if(BUILD_OMP OR PKG_USER-OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
if(BUILD_OMP OR PKG_KOKKOS OR PKG_USER-INTEL)
find_package(OpenMP REQUIRED)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
@ -349,7 +349,7 @@ if(PKG_KSPACE)
endif()
endif()
if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE)
if(PKG_MSCG OR PKG_USER-ATC OR PKG_USER-AWPMD OR PKG_USER-QUIP OR PKG_LATTE OR PKG_USER-PLUMED)
find_package(LAPACK)
find_package(BLAS)
if(NOT LAPACK_FOUND OR NOT BLAS_FOUND)
@ -531,6 +531,12 @@ endif()
if(PKG_USER-PLUMED)
find_package(GSL REQUIRED)
set(PLUMED_MODE "static" CACHE STRING "Linkage mode for Plumed2 library")
set(PLUMED_MODE_VALUES static shared runtime)
set_property(CACHE PLUMED_MODE PROPERTY STRINGS ${PLUMED_MODE_VALUES})
validate_option(PLUMED_MODE PLUMED_MODE_VALUES)
string(TOUPPER ${PLUMED_MODE} PLUMED_MODE)
option(DOWNLOAD_PLUMED "Download Plumed (instead of using the system's one)" OFF)
if(DOWNLOAD_PLUMED)
include(ExternalProject)
@ -543,13 +549,29 @@ if(PKG_USER-PLUMED)
ExternalProject_get_property(plumed_build INSTALL_DIR)
set(PLUMED_INSTALL_DIR ${INSTALL_DIR})
list(APPEND LAMMPS_DEPS plumed_build)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/plumed/obj/kernel.o
${PLUMED_INSTALL_DIR}/lib/plumed/obj/PlumedStatic.o ${GSL_LIBRARIES} ${CMAKE_DL_LIBS})
if(PLUMED_MODE STREQUAL "STATIC")
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/plumed/obj/kernel.o
"${PLUMED_INSTALL_DIR}/lib/plumed/obj/PlumedStatic.o" ${GSL_LIBRARIES} ${CMAKE_DL_LIBS} ${LAPACK_LIBRARIES})
elseif(PLUMED_MODE STREQUAL "SHARED")
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumed.so ${CMAKE_DL_LIBS})
elseif(PLUMED_MODE STREQUAL "RUNTIME")
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_INSTALL_DIR}/lib/libplumedKernel.so)
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_INSTALL_DIR}/lib/libplumedWrapper.a -rdynamic ${CMAKE_DL_LIBS})
endif()
set(PLUMED_INCLUDE_DIRS "${PLUMED_INSTALL_DIR}/include")
else()
find_package(PkgConfig REQUIRED)
pkg_check_modules(PLUMED plumed REQUIRED)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
if(PLUMED_MODE STREQUAL "STATIC")
add_definitions(-D__PLUMED_WRAPPER_CXX=1)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.static)
elseif(PLUMED_MODE STREQUAL "SHARED")
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.shared)
elseif(PLUMED_MODE STREQUAL "RUNTIME")
add_definitions(-D__PLUMED_HAS_DLOPEN=1 -D__PLUMED_DEFAULT_KERNEL=${PLUMED_LIBDIR}/libplumedKernel.so)
include(${PLUMED_LIBDIR}/plumed/src/lib/Plumed.cmake.runtime)
endif()
list(APPEND LAMMPS_LINK_LIBS ${PLUMED_LOAD})
endif()
include_directories(${PLUMED_INCLUDE_DIRS})


@ -1492,6 +1492,11 @@ target API.
</dl>
</td>
</tr>
<tr>
<td><code>BIN2C</code> (CUDA only)</td>
<td>Path to the bin2c executable; the first one found in your $PATH will be picked up automatically.</td>
<td>(automatic)</td>
</tr>
</tbody>
</table>
@ -1647,9 +1652,8 @@ requires `gzip` to be in your `PATH`
</tr>
<tr>
<td><code>GZIP_EXECUTABLE</code></td>
<td></td>
<td>
</td>
<td>Path to the gzip executable; the first one found in your $PATH will be picked up automatically.</td>
<td>(automatic)</td>
</tr>
</tbody>
</table>
@ -1679,9 +1683,8 @@ requires `ffmpeg` to be in your `PATH`
</tr>
<tr>
<td><code>FFMPEG_EXECUTABLE</code></td>
<td></td>
<td>
</td>
<td>Path to the ffmpeg executable; the first one found in your $PATH will be picked up automatically.</td>
<td>(automatic)</td>
</tr>
</tbody>
</table>


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -56,7 +56,8 @@ set(PKG_USER-MOFFF OFF CACHE BOOL "" FORCE)
set(PKG_USER-MOLFILE OFF CACHE BOOL "" FORCE)
set(PKG_USER-NETCDF OFF CACHE BOOL "" FORCE)
set(PKG_USER-OMP OFF CACHE BOOL "" FORCE)
set(PKG_USER-PHOFFOFF OFF CACHE BOOL "" FORCE)
set(PKG_USER-PHONON OFF CACHE BOOL "" FORCE)
set(PKG_USER-PLUMED OFF CACHE BOOL "" FORCE)
set(PKG_USER-QMMM OFF CACHE BOOL "" FORCE)
set(PKG_USER-QTB OFF CACHE BOOL "" FORCE)
set(PKG_USER-QUIP OFF CACHE BOOL "" FORCE)


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -13,7 +13,7 @@ set(USER_PACKAGES USER-ATC USER-AWPMD USER-BOCS USER-CGDNA USER-CGSDK USER-COLVA
set(PACKAGES_WITH_LIB COMPRESS GPU KIM KOKKOS LATTE MEAM MPIIO MSCG POEMS PYTHON REAX VORONOI
USER-ATC USER-AWPMD USER-COLVARS USER-H5MD USER-LB USER-MOLFILE
USER-NETCDF USER-QMMM USER-QUIP USER-SMD USER-VTK)
USER-NETCDF USER-PLUMED USER-QMMM USER-QUIP USER-SMD USER-VTK)
set(ALL_PACKAGES ${STANDARD_PACKAGES} ${USER_PACKAGES})


@ -0,0 +1,184 @@
# Outline of the GitHub Development Workflow
The purpose of this document is to provide a point of reference for the
core LAMMPS developers and other LAMMPS contributors to understand the
choices the LAMMPS developers have agreed on. Git and GitHub provide the
tools, but do not set policies, so it is up to the developers to come to
an agreement as to how to define and interpret policies. This document
is likely to change as our experiences and needs change and we try to
adapt accordingly. Last change 2018-11-15.
## Table of Contents
* [GitHub Merge Management](#github-merge-management)
* [Pull Requests](#pull-requests)
* [Pull Request Assignments](#pull-request-assignments)
* [Pull Request Reviews](#pull-request-reviews)
* [Pull Request Discussions](#pull-request-discussions)
* [Checklist for Pull Requests](#checklist-for-pull-requests)
* [GitHub Issues](#github-issues)
* [Milestones and Release Planning](#milestones-and-release-planning)
## GitHub Merge Management
In the interest of consistency, ONLY ONE of the core LAMMPS developers
should be doing the merging. This is currently
[@akohlmey](https://github.com/akohlmey) (Axel Kohlmeyer).
If this assignment needs to be changed, it shall be done right after a
stable release.
## Pull Requests
ALL changes to the LAMMPS code and documentation, however trivial, MUST
be submitted as a pull request to GitHub. All changes to the "master"
branch must be made exclusively through merging pull requests. The
"unstable" and "stable" branches, respectively are only to be updated
upon patch or stable releases with fast-forward merges based on the
associated tags. Pull requests may also be submitted to (long-running)
feature branches created by LAMMPS developers inside the LAMMPS project,
if needed. Those are not subject to the merge and review restrictions
discussed in this document, though, but are managed as needed on a
case-by-case basis.
### Pull Request Assignments
Pull requests can be "chaperoned" by one of the LAMMPS core developers.
This is indicated by who the pull request is assigned to. LAMMPS core
developers can self-assign or they can decide to assign a pull request
to a different LAMMPS developer. Being assigned to a pull request means
that this pull request may need some work, and the assignee is tasked
with determining what that work might be, if any, and may either implement
the required changes or ask the submitter of the pull request to implement
them. Even though all LAMMPS developers may have write access to pull
requests (if enabled by the submitter, which is the default), only the
submitter or the assignee of a pull request should make changes to it. During this
period the "work_in_progress" label shall be applied to the pull
request. The assignee gets to decide what happens to the pull request
next, e.g. whether it should be assigned to a different developer for
additional checks and changes, or is recommended to be merged. Removing
the "work_in_progress" label and assigning the pull request to the
developer tasked with merging signals that a pull request is ready to be
merged.
### Pull Request Reviews
People can be assigned to review a pull request in two ways:
* They can be assigned manually to review a pull request
by the submitter or a LAMMPS developer
* They can be automatically assigned, because a developer matches
a file pattern in the `.github/CODEOWNERS` file, which associates
developers with the code they contributed and maintain.
Reviewers are requested to state their appraisal of the proposed changes
and either approve or request changes. People may unassign themselves
from a review if they do not feel competent to judge the proposed changes. At
least one review from a LAMMPS developer with write access is required
before merging, in addition to the automated compilation tests. The
feature that makes reviews from code owners "hard" reviews (i.e. all of
them must approve before merging is allowed) is currently disabled,
and it is at the discretion of the merge maintainer to assess when
a sufficient degree of approval has been reached. Reviews may be
(automatically) dismissed when the reviewed code has been changed,
and then approval is required a second time.
### Pull Request Discussions
All discussions about a pull request should be kept as much as possible
on the pull request discussion page on GitHub, so that other developers
can later review the entire discussion after the fact and understand the
rationale behind choices made. Exceptions to this policy are technical
discussions that are centered on tools or policies themselves
(git, GitHub, C++) rather than on the content of the pull request.
### Checklist for Pull Requests
Here are some items to check:
* source and text files should not have CR/LF line endings (use dos2unix to remove)
* every new command or style should have documentation. The names of
source files (c++ and manual) should follow the name of the style.
(example: `src/fix_nve.cpp`, `src/fix_nve.h` for `fix nve` command,
implementing the class `FixNVE`, documented in `doc/src/fix_nve.txt`)
* all new style names should be lowercase; there must be no dashes,
blanks, or underscores separating words, only forward slashes.
* new style docs should be added to the "overview" files in
`doc/src/Commands_*.txt`, `doc/src/{fixes,computes,pairs,bonds,...}.txt`
and `doc/src/lammps.book`
* new files in packages should be added to `src/.gitignore`
* removed or renamed files in packages should be added to `src/Purge.list`
* C++ source files should use C++ style include files for accessing
C-library APIs, e.g. `#include <cstdlib>` instead of `#include <stdlib.h>`.
They should also use angle brackets instead of double quotes. Full list:
* assert.h -> cassert
* ctype.h -> cctype
* errno.h -> cerrno
* float.h -> cfloat
* limits.h -> climits
* math.h -> cmath
* complex.h -> complex
* setjmp.h -> csetjmp
* signal.h -> csignal
* stddef.h -> cstddef
* stdint.h -> cstdint
* stdio.h -> cstdio
* stdlib.h -> cstdlib
* string.h -> cstring
* time.h -> ctime
Do not replace (as they are C++-11): `inttypes.h` and `stdint.h`.
* Code should follow the C++-98 standard. C++-11 is only accepted
in individual special purpose packages
* indentation is two spaces per level
* there should be no tabs and no trailing whitespace
* header files, especially of new styles, should not include any
other headers, except the header with the base class or cstdio.
Forward declarations should be used instead when possible.
* iostreams should be avoided. LAMMPS uses stdio from the C-library.
* use of STL in headers and class definitions should be avoided.
* static class members should be avoided at all cost.
* anything storing atom IDs should be using `tagint` and not `int`.
This can be flagged by the compiler only for pointers and only when
compiling LAMMPS with `-DLAMMPS_BIGBIG`.
* when including both `lmptype.h` (and using defines or macros from it)
and `mpi.h`, `lmptype.h` must be included first.
## GitHub Issues
The GitHub issue tracker is the location where the LAMMPS developers
and other contributors or LAMMPS users can report issues or bugs with
the LAMMPS code or request new features to be added. Feature requests
are usually indicated by a `[Feature Request]` marker in the subject.
Issues are assigned to a person if this person is working on the
feature or working to resolve the issue. Issues that have nobody working
on them at the moment have the label `volunteer needed` attached.
When an issue, say `#125`, is resolved by a specific pull request,
the comment for the pull request shall contain the text `closes #125`
or `fixes #125`, so that the issue is automatically closed when
the pull request is merged.
## Milestones and Release Planning
LAMMPS uses a continuous release development model with incremental
changes, i.e. significant effort is made, including automated pre-merge
testing, to ensure that the code in the "master" branch does not get broken.
More extensive testing (including regression testing) is performed after
code is merged to the "master" branch. There are patch releases of
LAMMPS every 1-3 weeks, at a point when the LAMMPS developers feel that
a sufficient amount of changes has accumulated and the post-merge testing
has been successful. These patch releases are marked with a
`patch_<version date>` tag and the "unstable" branch follows only these
versions (and thus is always supposed to be of production quality,
unlike "master", which may be temporarily broken in the case of larger
change sets or unexpected incompatibilities or side effects).
About 3-4 times each year, there are going to be "stable" releases
of LAMMPS. These have seen additional, manual testing and review of
results from testing with instrumented code and static code analysis.
Also, the last 2-3 patch releases before a stable release are
"release candidate" versions, which only contain bugfixes and
documentation updates. For release planning and the information of
code contributors, issues and pull requests being actively worked on
are assigned a "milestone", which corresponds to the next stable
release or the stable release after that, with a tentative release
date.


@ -137,9 +137,9 @@ simply loading the appropriate module before building LAMMPS.
-D CMAKE_C_COMPILER=name # name of C compiler
-D CMAKE_Fortran_COMPILER=name # name of Fortran compiler :pre
-D CMAKE_CXX_FlAGS=string # flags to use with C++ compiler
-D CMAKE_C_FlAGS=string # flags to use with C compiler
-D CMAKE_Fortran_FlAGS=string # flags to use with Fortran compiler :pre
-D CMAKE_CXX_FLAGS=string # flags to use with C++ compiler
-D CMAKE_C_FLAGS=string # flags to use with C compiler
-D CMAKE_Fortran_FLAGS=string # flags to use with Fortran compiler :pre
By default CMake will use a compiler it finds and it will add
optimization flags appropriate to that compiler and any "accelerator


@ -41,11 +41,11 @@ This is the list of packages that may require additional steps.
"USER-ATC"_#user-atc,
"USER-AWPMD"_#user-awpmd,
"USER-COLVARS"_#user-colvars,
"USER-PLUMED" _#user-plumed,
"USER-H5MD"_#user-h5md,
"USER-INTEL"_#user-intel,
"USER-MOLFILE"_#user-molfile,
"USER-NETCDF"_#user-netcdf,
"USER-PLUMED"_#user-plumed,
"USER-OMP"_#user-omp,
"USER-QMMM"_#user-qmmm,
"USER-QUIP"_#user-quip,
@ -715,57 +715,98 @@ a corresponding Makefile.lammps.machine file.
USER-PLUMED package :h4,link(user-plumed)
Before building LAMMPS with this package, you must first build PLUMED.
PLUMED can be built as part of the LAMMPS build or installed separately
from LAMMPS using the generic "plumed installation instructions"_plumedinstall.
:link(plumedinstall,http://plumed.github.io/doc-master/user-doc/html/_installation.html)
PLUMED can be linked into MD codes in three different modes: static,
shared, and runtime. With the "static" mode, all required PLUMED code
is linked statically into the MD code. The MD code is then fully
independent of the PLUMED installation, but you also have to
rebuild/relink the MD code to update the PLUMED code inside it. With
"shared" linkage mode, the MD code is linked to a shared library
containing the PLUMED code, preferably after it was installed in a
globally accessible location. This way the same installed PLUMED code
can be shared across multiple MD packages and can be updated, as
long as the shared PLUMED library is ABI-compatible. The third linkage
mode is "runtime", which allows switching the PLUMED kernel at runtime
between different variants by setting the PLUMED_KERNEL environment
variable, which has to point to the location of the libplumedKernel.so
dynamic shared object, which is then loaded at runtime. This is
particularly convenient for doing PLUMED development and comparing
multiple PLUMED versions without having to recompile the hosting MD
code. All three linkage modes are supported by LAMMPS on selected
operating systems (e.g. Linux) using either the CMake or the traditional
make build. The "static" mode should be the most portable; the "runtime"
mode support in LAMMPS makes the most assumptions about the operating
system and compiler environment. If one mode does not work, try a
different one, or switch to a different build system, or consider
a global PLUMED installation or downloading it during building LAMMPS.
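As an illustration of the "runtime" mode, the kernel to load is selected
through the environment variable mentioned above before launching LAMMPS.
This is only a sketch; the path, binary name, and input script below are
placeholders that must be adapted to your installation:
export PLUMED_KERNEL=/path/to/plumed2/lib/libplumedKernel.so  # location of the PLUMED kernel to load at runtime
./lmp_mpi -in in.plumed                                       # placeholder LAMMPS binary and input script :pre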
[CMake build]:
-D DOWNLOAD_PLUMED=value # download PLUMED for build, value = no (default) or yes
-D PLUMED_MODE=value # Linkage mode for PLUMED, value = static (default), shared, or runtime :pre
If DOWNLOAD_PLUMED is set to "yes", the PLUMED library will be
downloaded (the version is hardcoded to a vetted version of
PLUMED, usually a recent stable release) and built inside the
CMake build directory. If DOWNLOAD_PLUMED is set to "no" (the default),
CMake will try to detect an installed version of PLUMED and link to
that. For this to work, the PLUMED library has to be installed into a
location where the pkg-config tool can find it or the PKG_CONFIG_PATH
environment variable has to be set up accordingly.
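For example, if PLUMED was installed into a non-standard prefix (the path
below is only an example), you can point pkg-config at it and verify the
detection before running CMake:
export PKG_CONFIG_PATH=$HOME/.local/lib/pkgconfig:$PKG_CONFIG_PATH  # adjust to your PLUMED install prefix
pkg-config --exists plumed && echo "PLUMED found by pkg-config" :pre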
The PLUMED_MODE setting determines the linkage mode of the PLUMED
library. Allowed values are "static" (default), "shared", or "runtime".
For a discussion of PLUMED linkage modes, please see above. When
enabling DOWNLOAD_PLUMED, the static linkage mode is recommended.
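Putting these options together, a CMake configuration could look like the
following sketch (run from a build directory next to the cmake folder;
directory names are examples only):
mkdir build; cd build
cmake -D PKG_USER-PLUMED=yes -D DOWNLOAD_PLUMED=yes -D PLUMED_MODE=static ../cmake
make :pre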
[Traditional make]:
Before building LAMMPS with this package, you must first build
PLUMED. We recommending building PLUMED separately to LAMMPS using
the instructions that can be found at http://plumed.github.io/doc-master/user-doc/html/_installation.html.
Before compiling LAMMPS you can then install the fix plumed command
and compile LAMMPS in the usual manner:
Before installing the USER-PLUMED package, first the PLUMED library
needs to be configured so that LAMMPS can find the right settings when
compiling and linking the LAMMPS executable itself. You can either
download and build PLUMED inside the LAMMPS plumed library folder or use
a previously installed PLUMED library and point LAMMPS to its
location. You also have to choose the linkage mode: "static" (default),
"shared" or "runtime". For a discussion of PLUMED linkage modes, please
see above.
Download/compilation/configuration of the plumed library can be done
from the src folder through the following make args:
make lib-plumed # print help message
make lib-plumed args="-b" # download and build PLUMED in lib/plumed/plumed2
make lib-plumed args="-p $HOME/.local" # use existing PLUMED installation in $HOME/.local
make lib-plumed args="-p /usr/local -m shared" # use existing PLUMED installation in
# /usr/local and use shared linkage mode
:pre
Note that two symbolic (soft) links, "includelink" and "liblink", are
created in lib/plumed to point to the location of the PLUMED build to
use, and also a new file lib/plumed/Makefile.lammps is created with
settings suitable for LAMMPS to compile and link PLUMED in the desired
linkage mode. After this step is completed, you can install the
USER-PLUMED package and compile LAMMPS in the usual manner:
make yes-user-plumed
make machine :pre
Once this compilation completes you should be able to run LAMMPS in the usual
way. When running LAMMPS with an input script that contains a fix
plumed command LAMMPS will try to call the PLUMED runtime library. PLUMED
must therefore be available in your path if LAMMPS is compiled in this way.
Once this compilation completes you should be able to run LAMMPS in the
usual way. For shared linkage mode, libplumed.so must be found by the
LAMMPS executable, which on many operating systems means you have to
set the LD_LIBRARY_PATH environment variable accordingly.
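For example, if libplumed.so was installed under /usr/local (the prefix,
binary name, and input script below are placeholders only):
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH  # folder containing libplumed.so
./lmp_machine -in in.plumed                             # placeholder LAMMPS binary and input script :pre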
On some machines it is not possible to call runtime libraries in the way described
above. When compiling on these machines it is thus better to statically link
PLUMED when compiling LAMMPS. To do this you must either download a PLUMED
tarball from http://www.plumed.org/get-it or clone it using
git clone https://github.com/plumed/plumed2.git. If you download the tarball
unpack it in the /lib/plumed directory. Similarly if you clone
it clone it to the /lib/plumed directory as if there is a version of PLUMED within
this directory LAMMPS will always try to statically link the version of PLUMED
that this directory contains instead of dynamically linking the library.
Support for the different linkage modes in LAMMPS varies across
operating systems; using static linkage is expected to be the most
portable, and it is thus set as the default.
Once you have downloaded PLUMED into /lib/plumed you must again build the code
here by following the instructions that can be found at
http://plumed.github.io/doc-master/user-doc/html/_installation.html.
You can statically link PLUMED manually and if you want to access the full
range of PLUMED functionalities this is what you should do. If you only want the
basic range of functionalities, however, (i.e. no user contributed modules) then
you can download and compile PLUMED in one step from the lammps/src dir, using a
command like like those below:
make lib-plumed # print help message
make lib-plumed args="-b" # download and build the latest stable version of PLUMED
These commands will simply invoke the lib/plumed/Install.py script with
args specified. Furthermore, once the script has completed you should
have a compiled version of PLUMED. With this built you can install/un-install
PLUMED and build LAMMPS in the usual manner:
make yes-user-plumed
make machine :pre
make no-user-plumed
make machine :pre
If you want to change the linkage mode, you have to re-run "make
lib-plumed" with the desired settings [and] do a reinstall if the
USER-PLUMED package with "make yes-user-plumed" to update the required
makefile settings with the changes in the lib/plumed folder.
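For example, to switch an existing build to runtime linkage against a
PLUMED installation under /usr/local (an illustrative path only), you
would redo these steps:
make lib-plumed args="-p /usr/local -m runtime"   # reconfigure lib/plumed for runtime linkage
make yes-user-plumed                              # refresh the package files in src
make machine :pre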
:line


@ -56,6 +56,7 @@ packages:
"USER-INTEL"_Build_extras.html#user-intel,
"USER-MOLFILE"_Build_extras.html#user-molfile,
"USER-NETCDF"_Build_extras.html#user-netcdf,
"USER-PLUMED"_Build_extras.html#user-plumed,
"USER-OMP"_Build_extras.html#user-omp,
"USER-QMMM"_Build_extras.html#user-qmmm,
"USER-QUIP"_Build_extras.html#user-quip,


@ -59,6 +59,7 @@ An alphabetic list of all LAMMPS commands.
"fix_modify"_fix_modify.html,
"group"_group.html,
"group2ndx"_group2ndx.html,
"hyper"_hyper.html,
"if"_if.html,
"info"_info.html,
"improper_coeff"_improper_coeff.html,


@ -78,6 +78,8 @@ OPT.
"grem"_fix_grem.html,
"halt"_fix_halt.html,
"heat"_fix_heat.html,
"hyper/global"_fix_hyper_global.html,
"hyper/local"_fix_hyper_local.html,
"imd"_fix_imd.html,
"indent"_fix_indent.html,
"ipi"_fix_ipi.html,
@ -108,7 +110,7 @@ OPT.
"nph/asphere (o)"_fix_nph_asphere.html,
"nph/body"_fix_nph_body.html,
"nph/eff"_fix_nh_eff.html,
"nph/sphere (ko)"_fix_nph_sphere.html,
"nph/sphere (o)"_fix_nph_sphere.html,
"nphug (o)"_fix_nphug.html,
"npt (iko)"_fix_nh.html,
"npt/asphere (o)"_fix_npt_asphere.html,
@ -128,7 +130,7 @@ OPT.
"nve/line"_fix_nve_line.html,
"nve/manifold/rattle"_fix_nve_manifold_rattle.html,
"nve/noforce"_fix_nve_noforce.html,
"nve/sphere (o)"_fix_nve_sphere.html,
"nve/sphere (ko)"_fix_nve_sphere.html,
"nve/spin"_fix_nve_spin.html,
"nve/tri"_fix_nve_tri.html,
"nvk"_fix_nvk.html,
@ -147,6 +149,7 @@ OPT.
"phonon"_fix_phonon.html,
"pimd"_fix_pimd.html,
"planeforce"_fix_planeforce.html,
"plumed"_fix_plumed.html,
"poems"_fix_poems.html,
"pour"_fix_pour.html,
"precession/spin"_fix_precession_spin.html,


@ -1,7 +1,7 @@
<!-- HTML_ONLY -->
<HEAD>
<TITLE>LAMMPS Users Manual</TITLE>
<META NAME="docnumber" CONTENT="9 Nov 2018 version">
<META NAME="docnumber" CONTENT="15 Nov 2018 version">
<META NAME="author" CONTENT="http://lammps.sandia.gov - Sandia National Laboratories">
<META NAME="copyright" CONTENT="Copyright (2003) Sandia Corporation. This software and manual is distributed under the GNU General Public License.">
</HEAD>
@ -21,7 +21,7 @@
:line
LAMMPS Documentation :c,h1
9 Nov 2018 version :c,h2
15 Nov 2018 version :c,h2
"What is a LAMMPS version?"_Manual_version.html


@ -89,6 +89,7 @@ as contained in the file name.
"USER-NETCDF"_#PKG-USER-NETCDF,
"USER-OMP"_#PKG-USER-OMP,
"USER-PHONON"_#PKG-USER-PHONON,
"USER-PLUMED"_#PKG-USER-PLUMED,
"USER-PTM"_#PKG-USER-PTM,
"USER-QMMM"_#PKG-USER-QMMM,
"USER-QTB"_#PKG-USER-QTB,
@ -1187,7 +1188,7 @@ the NAMD MD code, but with portability in mind. Axel Kohlmeyer
[Install:]
This package has "specific installation
instructions"_Build_extras.html#gpu on the "Build
instructions"_Build_extras.html#user-colvars on the "Build
extras"_Build_extras.html doc page.
[Supporting info:]
@ -1201,18 +1202,20 @@ examples/USER/colvars :ul
:line
USER-PLUMED package :link(USER-PLUMED),h4
USER-PLUMED package :link(PKG-USER-PLUMED),h4
[Contents:]
The fix plumed command allows you to use the plugin for molecular
dynamics PLUMED to analyse and bias your LAMMPS trajectory on the fly.
In practise PLUMED is called from within the lammps input script by using
the "fix plumed _fix_plumed.html command.
The fix plumed command allows you to use the PLUMED free energy plugin
for molecular dynamics to analyse and bias your LAMMPS trajectory on
the fly. The PLUMED library is called from within the LAMMPS input
script by using the "fix plumed"_fix_plumed.html command.
[Authors:] The PLUMED library is written and maintained by
Massimilliano Bonomi, Giovanni Bussi, Carlo Camiloni and
Gareth Tribello.
[Authors:] The "PLUMED library"_#PLUMED is written and maintained by
Massimiliano Bonomi, Giovanni Bussi, Carlo Camilloni and Gareth
Tribello.
:link(PLUMED,http://www.plumed.org)
[Install:]
@ -1224,7 +1227,7 @@ extras"_Build_extras.html doc page.
src/USER-PLUMED/README
lib/plumed/README
"fix plumed "_fix_plumed.html
"fix plumed"_fix_plumed.html
examples/USER/plumed :ul
:line


@ -62,17 +62,20 @@ Package, Description, Doc page, Example, Library
"USER-NETCDF"_Packages_details.html#PKG-USER-NETCDF, dump output via NetCDF,"dump netcdf"_dump_netcdf.html, n/a, ext
"USER-OMP"_Packages_details.html#PKG-USER-OMP, OpenMP-enabled styles,"Speed omp"_Speed_omp.html, "Benchmarks"_http://lammps.sandia.gov/bench.html, no
"USER-PHONON"_Packages_details.html#PKG-USER-PHONON, phonon dynamical matrix,"fix phonon"_fix_phonon.html, USER/phonon, no
"USER-PLUMED"_Packages_details.html#PKG-USER-PLUMED, "PLUMED"_#PLUMED free energy library,"fix plumed"_fix_plumed.html, USER/plumed, ext
"USER-PTM"_Packages_details.html#PKG-USER-PTM, Polyhedral Template Matching,"compute ptm/atom"_compute_ptm_atom.html, n/a, no
"USER-QMMM"_Packages_details.html#PKG-USER-QMMM, QM/MM coupling,"fix qmmm"_fix_qmmm.html, USER/qmmm, ext
"USER-QTB"_Packages_details.html#PKG-USER-QTB, quantum nuclear effects,"fix qtb"_fix_qtb.html "fix qbmsst"_fix_qbmsst.html, qtb, no
"USER-QUIP"_Packages_details.html#PKG-USER-QUIP, QUIP/libatoms interface,"pair_style quip"_pair_quip.html, USER/quip, ext
"USER-REAXC"_Packages_details.html#PKG-USER-REAXC, ReaxFF potential (C/C++) ,"pair_style reaxc"_pair_reaxc.html, reax, no
"USER-SCAFACOS"_Packages_details.html#PKG-USER-SCAFACOS, wrapper on ScaFaCoS solver,"kspace_style scafacos"_kspace_style.html, USER/scafacos, ext
"USER-SDPD"_Packages_details.html#PKG-USER-SDPD, smoothed dissipative particle dynamics,"pair_style sdpd/taitwater/isothermal"_pair_sdpd_taitwater_isothermal, USER/sdpd, no
"USER-SDPD"_Packages_details.html#PKG-USER-SDPD, smoothed dissipative particle dynamics,"pair_style sdpd/taitwater/isothermal"_pair_sdpd_taitwater_isothermal.html, USER/sdpd, no
"USER-SMD"_Packages_details.html#PKG-USER-SMD, smoothed Mach dynamics,"SMD User Guide"_PDF/SMD_LAMMPS_userguide.pdf, USER/smd, ext
"USER-SMTBQ"_Packages_details.html#PKG-USER-SMTBQ, second moment tight binding QEq potential,"pair_style smtbq"_pair_smtbq.html, USER/smtbq, no
"USER-SPH"_Packages_details.html#PKG-USER-SPH, smoothed particle hydrodynamics,"SPH User Guide"_PDF/SPH_LAMMPS_userguide.pdf, USER/sph, no
"USER-TALLY"_Packages_details.html#PKG-USER-TALLY, pairwise tally computes,"compute XXX/tally"_compute_tally.html, USER/tally, no
"USER-UEF"_Packages_details.html#PKG-USER-UEF, extensional flow,"fix nvt/uef"_fix_nh_uef.html, USER/uef, no
"USER-VTK"_Packages_details.html#PKG-USER-VTK, dump output via VTK, "compute vtk"_dump_vtk.html, n/a, ext :tb(ea=c,ca1=l)
:link(MOFplus,https://www.mofplus.org/content/show/MOF-FF)
:link(PLUMED,http://www.plumed.org)


@ -44,6 +44,7 @@ Commands :h1
fix_modify
group
group2ndx
hyper
if
improper_coeff
improper_style


@ -176,6 +176,7 @@ compute"_Commands_compute.html doc page are followed by one or more of
(g,i,k,o,t) to indicate which accelerated styles exist.
"ackland/atom"_compute_ackland_atom.html -
"adf"_compute_adf.html - angular distribution function
"aggregate/atom"_compute_cluster_atom.html - aggregate ID for each atom
"angle"_compute_angle.html -
"angle/local"_compute_angle_local.html -


@ -117,5 +117,5 @@ package"_Build_package.html doc page for more info.
:line
:link(Larsen)
[(Larsen)] Larsen, Schmidt, Schiøtz, Modelling Simul Mater Sci Eng, 24, 055007 (2016).
[(Larsen)] Larsen, Schmidt, Schiotz, Modelling Simul Mater Sci Eng, 24, 055007 (2016).


@ -221,6 +221,8 @@ accelerated styles exist.
"grem"_fix_grem.html -
"halt"_fix_halt.html - terminate a dynamics run or minimization
"heat"_fix_heat.html - add/subtract momentum-conserving heat
"hyper/global"_fix_hyper_global.html - global hyperdynamics
"hyper/local"_fix_hyper_local.html - local hyperdynamics
"imd"_fix_imd.html -
"indent"_fix_indent.html - impose force due to an indenter
"ipi"_fix_ipi.html -
@ -238,6 +240,7 @@ accelerated styles exist.
"manifoldforce"_fix_manifoldforce.html -
"meso"_fix_meso.html -
"meso"_fix_meso_move.html - move mesoscopic SPH/SDPD particles in a prescribed fashion
"meso/move"_fix_meso_move.html -
"meso/stationary"_fix_meso_stationary.html -
"momentum"_fix_momentum.html - zero the linear and/or angular momentum of a group of atoms
"move"_fix_move.html - move atoms in a prescribed fashion
@ -293,6 +296,7 @@ accelerated styles exist.
"phonon"_fix_phonon.html -
"pimd"_fix_pimd.html -
"planeforce"_fix_planeforce.html - constrain atoms to move in a plane
"plumed"_fix_plumed.html - wrapper on PLUMED free energy library
"poems"_fix_poems.html - constrain clusters of atoms to move as coupled rigid bodies
"pour"_fix_pour.html - pour new atoms/molecules into a granular simulation domain
"precession/spin"_fix_precession_spin.html -


@ -41,7 +41,7 @@ react = mandatory argument indicating new reaction specification :l
fraction = initiate reaction with this probability if otherwise eligible
seed = random number seed (positive integer)
{stabilize_steps} value = timesteps
timesteps = number of timesteps to apply internally created nve/limit.html
timesteps = number of timesteps to apply internally created nve/limit fix :pre
{update_edges} value = {none} or {charges} :l
none = do not update topology near the edges of reaction templates
charges = update atomic charges of all atoms in reaction templates


@ -116,7 +116,8 @@ not a limitation of functionality.
[Related commands:]
"fix smd"_fix_smd.html
"fix smd"_fix_smd.html, "fix spring"_fix_spring.html,
"fix plumed"_fix_plumed.html
[Default:]
@ -126,4 +127,4 @@ and tstat = NULL.
:line
:link(Fiorin)
[(Fiorin)] Fiorin , Klein, Henin, Mol. Phys., DOI:10.1080/00268976.2013.813594
[(Fiorin)] Fiorin, Klein, Henin, Mol. Phys., DOI:10.1080/00268976.2013.813594


@ -0,0 +1,260 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Section_commands.html#comm)
:line
fix hyper/global command :h3
[Syntax:]
fix ID group-ID hyper/global cutbond qfactor Vmax Tequil :pre
ID, group-ID are documented in "fix"_fix.html command
hyper/global = style name of this fix command
cutbond = max distance at which a pair of atoms is considered bonded (distance units)
qfactor = max strain at which bias potential goes to 0.0 (unitless)
Vmax = height of bias potential (energy units)
Tequil = equilibration temperature (temperature units) :ul
[Examples:]
fix 1 all hyper/global 1.0 0.3 0.8 300.0 :pre
[Description:]
This fix is meant to be used with the "hyper"_hyper.html command to
perform a bond-boost global hyperdynamics (GHD) simulation. The role
of this fix is to select a single pair of atoms in the system at
each timestep to add a global bias potential to, which will alter the
dynamics of the system in a manner that effectively accelerates time.
This is in contrast to the "fix hyper/local"_fix_hyper_local.html
command, which can be used to perform a local hyperdynamics (LHD)
simulation, by adding a local bias potential to multiple pairs of
atoms at each timestep. GHD can time accelerate a small simulation
with up to a few hundred atoms. For larger systems, LHD is needed to
achieve good time acceleration.
For a system that undergoes rare transition events, where one or more
atoms move over an energy barrier to a new potential energy basin, the
effect of the bias potential is to induce more rapid transitions.
This can lead to a dramatic speed-up in the rate at which events
occur, without altering their relative frequencies, thus leading to
an overall increase in the elapsed real time of the simulation as
compared to running for the same number of timesteps with normal MD.
See the "hyper"_hyper.html doc page for a more general discussion of
hyperdynamics and citations that explain both GHD and LHD.
The equations and logic used by this fix and described here to perform
GHD follow the description given in "(Voter2013)"_#Voter2013ghd. The
bond-boost form of a bias potential for HD is due to Miron and
Fichthorn as described in "(Miron)"_#Mironghd. In LAMMPS we use a
simplified version of bond-boost GHD where a single bond in the system
is biased at any one timestep.
Bonds are defined between each pair of I,J atoms whose R0ij distance
is less than {cutbond}, when the system is in a quenched (minimum
energy) state. Note that these are not "bonds" in a covalent
sense. A bond is simply any pair of atoms that meet the distance
criterion. {Cutbond} is an argument to this fix; it is discussed
below. A bond is only formed if one or both of the I,J atoms are in
the specified group.
The current strain of bond IJ (when running dynamics) is defined as
Eij = (Rij - R0ij) / R0ij :pre
where Rij is the current distance between atoms I,J, and R0ij is the
equilibrium distance in the quenched state.
The bias energy Vij of any bond IJ is defined as
Vij = Vmax * (1 - (Eij/q)^2) for abs(Eij) < qfactor
= 0 otherwise :pre
where the prefactor {Vmax} and the cutoff {qfactor} are arguments to
this fix; they are discussed below. This functional form is an
inverse parabola centered at 0.0 with height Vmax and which goes to
0.0 at +/- qfactor.
Let Emax = the maximum of abs(Eij) for all IJ bonds in the system on a
given timestep. On that step, Vij is added as a bias potential to
only the single bond with strain Emax, call it Vij(max). Note that
Vij(max) will be 0.0 if Emax >= qfactor on that timestep. Also note
that Vij(max) is added to the normal interatomic potential that is
computed between all atoms in the system at every step.
The derivative of Vij(max) with respect to the position of each atom
in the Emax bond gives a bias force Fij(max) acting on the bond as
Fij(max) = - dVij(max)/dEij = 2 Vmax Eij / qfactor^2 for abs(Eij) < qfactor
= 0 otherwise :pre
which can be decomposed into an equal and opposite force acting on
only the two I,J atoms in the Emax bond.
The time boost factor for the system is given each timestep I by
Bi = exp(beta * Vij(max)) :pre
where beta = 1/kTequil, and {Tequil} is the temperature of the system
and an argument to this fix. Note that Bi >= 1 at every step.
NOTE: To run GHD, the input script must also use the "fix
langevin"_fix_langevin.html command to thermostat the atoms at the
same {Tequil} as specified by this fix, so that the system is running
constant-temperature (NVT) dynamics. LAMMPS does not check that this
is done.
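As a sketch of such a setup (the fix IDs, group, damping parameter, random
seed, and all numerical values below are placeholders, not recommendations),
a GHD run at a {Tequil} of 500 K could combine the fixes like this:
fix mdnve all nve
fix thermostat all langevin 500.0 500.0 1.0 358214
fix HG all hyper/global 3.2 0.3 0.4 500.0 :pre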
The elapsed time t_hyper for a GHD simulation running for {N}
timesteps is simply
t_hyper = Sum (i = 1 to N) Bi * dt :pre
where dt is the timestep size defined by the "timestep"_timestep.html
command. The effective time acceleration due to GHD is thus t_hyper /
N*dt, where N*dt is elapsed time for a normal MD run of N timesteps.
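As a rough illustration of the magnitudes involved (the numbers are
invented for this example): in metal units at a {Tequil} of 500 K, beta =
1/kT is roughly 23.2 1/eV, so a timestep with a bias of Vij(max) = 0.4 eV
contributes
Bi = exp(23.2 * 0.4) ~ 1.1e4 :pre
i.e. that single timestep advances t_hyper by roughly ten thousand times
the nominal timestep dt.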
Note that in GHD, the boost factor varies from timestep to timestep.
Likewise, which bond has Emax strain and thus which pair of atoms the
bias potential is added to, will also vary from timestep to timestep.
This is in contrast to local hyperdynamics (LHD) where the boost
factor is an input parameter; see the "fix
hyper/local"_fix_hyper_local.html doc page for details.
:line
Here is additional information on the input parameters for GHD.
The {cutbond} argument is the cutoff distance for defining bonds
between pairs of nearby atoms. A pair of I,J atoms in their
equilibrium, minimum-energy configuration, which are separated by a
distance Rij < {cutbond}, are flagged as a bonded pair. Setting
{cutbond} to be ~25% larger than the nearest-neighbor distance in a
crystalline lattice is a typical choice for solids, so that bonds
exist only between nearest neighbor pairs.
The {qfactor} argument is the limiting strain at which the bias
potential goes to 0.0. It is dimensionless, so a value of 0.3 means a
bond distance can be up to 30% larger or 30% smaller than the
equilibrium (quenched) R0ij distance and the two atoms in the bond
could still experience a non-zero bias force.
If {qfactor} is set too large, then transitions from one energy basin
to another are affected because the bias potential is non-zero at the
transition state (e.g. saddle point). If {qfactor} is set too small
then little boost is achieved because the Eij strain of some bond in
the system will (nearly) always exceed {qfactor}. A value of 0.3 for
{qfactor} is typically reasonable.
The {Vmax} argument is the prefactor on the bias potential. Ideally,
it should be set to a value slightly less than the smallest barrier
height for an event to occur. Otherwise the applied bias potential
may be large enough (when added to the interatomic potential) to
produce a local energy basin with a maximum in the center. This can
produce artificial energy minima in the same basin that trap an atom.
Or if {Vmax} is even larger, it may induce an atom(s) to rapidly
transition to another energy basin. Both cases are "bad dynamics"
which violate the assumptions of GHD that guarantee an accelerated
time-accurate trajectory of the system.
Note that if {Vmax} is set too small, the GHD simulation will run
correctly. There will just be fewer events because the hyper time
(t_hyper equation above) will be shorter.
NOTE: If you have no physical intuition as to the smallest barrier
height in your system, a reasonable strategy to determine the largest
{Vmax} you can use for a GHD model is to run a sequence of
simulations with smaller and smaller {Vmax} values, until the event
rate does not change.
The {Tequil} argument is the temperature at which the system is
simulated; see the comment above about the "fix
langevin"_fix_langevin.html thermostatting. It is also part of the
beta term in the exponential factor that determines how much boost is
achieved as a function of the bias potential.
In general, the lower the value of {Tequil} and the higher the value
of {Vmax}, the more boost will be achievable by the GHD algorithm.
:line
[Restart, fix_modify, output, run start/stop, minimize info:]
No information about this fix is written to "binary restart
files"_restart.html.
The "fix_modify"_fix_modify.html {energy} option is supported by this
fix to add the energy of the bias potential to the system's
potential energy as part of "thermodynamic output"_thermo_style.html.
This fix computes a global scalar and global vector of length 11, which
can be accessed by various "output commands"_Howto_output.html. The
scalar is the magnitude of the bias potential (energy units) applied on
the current timestep. The vector stores the following quantities:
1 = boost factor on this step (unitless)
2 = max strain Eij of any bond on this step (unitless)
3 = ID of first atom in the max-strain bond
4 = ID of second atom in the max-strain bond
5 = average # of bonds/atom on this step :ul
6 = fraction of timesteps with bias = 0.0 during this run
7 = max drift distance of any atom during this run (distance units)
8 = max bond length during this run (distance units) :ul
9 = cumulative hyper time since fix was defined (time units)
10 = cumulative count of event timesteps since fix was defined
11 = cumulative count of atoms in events since fix was defined :ul
The first 5 quantities are for the current timestep. Quantities 6-8
are for the current hyper run. Quantities 9-11 are cumulative across
multiple runs (since the fix was defined in the input script).
For value 7, drift is the distance an atom moves between timesteps
when the bond list is reset, i.e. between events. Atoms involved in
an event will typically move the greatest distance since others are
typically oscillating around their lattice site.
For value 10, events are checked for by the "hyper"_hyper.html command
once every {Nevent} timesteps. This value is the count of those
timesteps on which one (or more) events were detected. It is NOT the
number of distinct events, since more than one event may occur in the
same {Nevent} time window.
For value 11, each time the "hyper"_hyper.html command checks for an
event, it invokes a compute to flag zero or more atoms as
participating in one or more events. E.g. atoms that have displaced
more than some distance from the previous quench state. Value 11 is
the cumulative count of the number of atoms participating in any of
the events that were found.
The scalar and vector values calculated by this fix are all
"intensive".
No parameter of this fix can be used with the {start/stop} keywords of
the "run"_run.html command. This fix is not invoked during "energy
minimization"_minimize.html.
[Restrictions:]
This command can only be used if LAMMPS was built with the REPLICA
package. See the "Build package"_Build_package.html doc page for more
info.
[Related commands:]
"hyper"_hyper.html, "fix hyper/local"_fix_hyper_local.html
[Default:] None
:line
:link(Voter2013ghd)
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
144110 (2013).
:link(Mironghd)
[(Miron)] R. A. Miron and K. A. Fichthorn, J Chem Phys, 119, 6210 (2003).

doc/src/fix_hyper_local.txt (new file, 404 lines)

@ -0,0 +1,404 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Section_commands.html#comm)
:line
fix hyper/local command :h3
[Syntax:]
fix ID group-ID hyper/local cutbond qfactor Vmax Tequil Dcut alpha Btarget :pre
ID, group-ID are documented in "fix"_fix.html command :ulb,l
hyper/local = style name of this fix command :l
cutbond = max distance at which a pair of atoms is considered bonded (distance units) :l
qfactor = max strain at which bias potential goes to 0.0 (unitless) :l
Vmax = estimated height of bias potential (energy units) :l
Tequil = equilibration temperature (temperature units) :l
Dcut = minimum distance between boosted bonds (distance units) :l
alpha = boostostat relaxation time (time units) :l
Btarget = desired time boost factor (unitless) :l
zero or more keyword/value pairs may be appended :l
keyword = {lostbond} or {check/bias} or {check/coeff}
{lostbond} value = error/warn/ignore
{check/bias} values = Nevery error/warn/ignore
{check/coeff} values = Nevery error/warn/ignore :pre
:ule
[Examples:]
fix 1 all hyper/local 1.0 0.3 0.8 300.0 :pre
[Description:]
This fix is meant to be used with the "hyper"_hyper.html command to
perform a bond-boost local hyperdynamics (LHD) simulation. The role
of this fix is to select multiple pairs of atoms in the system at
each timestep to add a local bias potential to, which will alter the
dynamics of the system in a manner that effectively accelerates time.
This is in contrast to the "fix hyper/global"_fix_hyper_global.html
command, which can be used to perform a global hyperdynamics (GHD)
simulation, by adding a global bias potential to a single pair of
atoms at each timestep. GHD can time accelerate a small simulation
with up to a few hundred atoms. For larger systems, LHD is needed to
achieve good time acceleration.
For a system that undergoes rare transition events, where one or more
atoms move over an energy barrier to a new potential energy basin, the
effect of the bias potential is to induce more rapid transitions.
This can lead to a dramatic speed-up in the rate at which events
occur, without altering their relative frequencies, thus leading to
an overall increase in the elapsed real time of the simulation as
compared to running for the same number of timesteps with normal MD.
See the "hyper"_hyper.html doc page for a more general discussion of
hyperdynamics and citations that explain both GHD and LHD.
The equations and logic used by this fix and described here to perform
LHD follow the description given in "(Voter2013)"_#Voter2013lhd. The
bond-boost form of a bias potential for HD is due to Miron and
Fichthorn as described in "(Miron)"_#Mironlhd.
To understand this description, you should first read the description
of the GHD algorithm on the "fix hyper/global"_fix_hyper_global.html
doc page. This description of LHD builds on the GHD description.
The definitions of bonds, Eij, and Emax are the same for GHD and LHD.
The formulas for Vij(max) and Fij(max) are also the same except for a
pre-factor Cij, explained below.
The bias energy Vij applied to a bond IJ with maximum strain is
Vij(max) = Cij * Vmax * (1 - (Eij/q)^2) for abs(Eij) < qfactor
= 0 otherwise :pre
The derivative of Vij(max) with respect to the position of each atom
in the IJ bond gives a bias force Fij(max) acting on the bond as
Fij(max) = - dVij(max)/dEij = 2 Cij Vmax Eij / qfactor^2 for abs(Eij) < qfactor
= 0 otherwise :pre
which can be decomposed into an equal and opposite force acting on
only the two I,J atoms in the IJ bond.
The key difference is that in GHD a bias energy and force is added (on
a particular timestep) to only one bond (pair of atoms) in the system,
which is the bond with maximum strain Emax.
In LHD, a bias energy and force can be added to multiple bonds
separated by the specified {Dcut} distance or more. A bond IJ is
biased if it is the maximum strain bond within its local
"neighborhood", which is defined as the bond IJ plus any neighbor
bonds within a distance {Dcut} from IJ. The "distance" between bond
IJ and bond KL is the minimum distance between any of the IK, IL, JK,
JL pairs of atoms.
For a large system, multiple bonds will typically meet this
requirement, and thus a bias potential Vij(max) will be applied to
many bonds on the same timestep.
In LHD, all bonds store a Cij prefactor which appears in the Vij(max)
and Fij(max) equations above. Note that the Cij factor scales the
strength of the bias energy and forces whenever bond IJ is the maximum
strain bond in its neighborhood.
Cij is initialized to 1.0 when a bond between the I,J atoms is first
defined. The specified {Btarget} factor is then used to adjust the
Cij prefactors for each bond every timestep in the following manner.
An instantaneous boost factor Bij is computed each timestep
for each bond, as
Bij = exp(beta * Vkl(max)) :pre
where Vkl(max) is the bias energy of the maxstrain bond KL within bond
IJ's neighborhood, beta = 1/kTequil, and {Tequil} is the temperature
of the system and an argument to this fix.
NOTE: To run LHD, the input script must also use the "fix
langevin"_fix_langevin.html command to thermostat the atoms at the
same {Tequil} as specified by this fix, so that the system is running
constant-temperature (NVT) dynamics. LAMMPS does not check that this
is done.
Note that if IJ = KL, then bond IJ is a biased bond on that timestep,
otherwise it is not. But regardless, the boost factor Bij can be
thought of as an estimate of the time boost currently being applied within a
local region centered on bond IJ. For LHD, we want this to be the
specified {Btarget} value everywhere in the simulation domain.
To accomplish this, if Bij < Btarget, the Cij prefactor for bond IJ is
incremented on the current timestep by an amount proportional to the
inverse of the specified {alpha} and the difference (Bij - Btarget).
Conversely if Bij > Btarget, Cij is decremented by the same amount.
This procedure is termed "boostostatting" in
"(Voter2013)"_#Voter2013lhd. It drives all of the individual Cij to
values such that when Vij(max) is applied as a bias to bond IJ, the
resulting boost factor Bij will be close to {Btarget} on average.
Thus the LHD time acceleration factor for the overall system is
effectively {Btarget}.
Note that in LHD, the boost factor {Btarget} is specified by the user.
This is in contrast to global hyperdynamics (GHD) where the boost
factor varies each timestep and is computed as a function of {Vmax},
Emax, and {Tequil}; see the "fix hyper/global"_fix_hyper_global.html
doc page for details.
:line
Here is additional information on the input parameters for LHD.
Note that the {cutbond}, {qfactor}, and {Tequil} arguments have the
same meaning as for GHD. The {Vmax} argument is slightly different.
The {Dcut}, {alpha}, and {Btarget} parameters are unique to LHD.
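For reference, the local hyperdynamics example input bundled in
examples/hyper thermostats the mobile atoms at {Tequil} = 400 K and
defines the fix with the following argument values:
fix 2 mobile langevin 400.0 400.0 1.0 826626413 zero yes
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 4000.0 :pre
i.e. {cutbond} = 3.2, {qfactor} = 0.3, {Vmax} = 0.4, {Tequil} = 400.0,
{Dcut} = 10.0, {alpha} = 200.0, and {Btarget} = 4000.0.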
The {cutbond} argument is the cutoff distance for defining bonds
between pairs of nearby atoms. A pair of I,J atoms in their
equilibrium, minimum-energy configuration, which are separated by a
distance Rij < {cutbond}, are flagged as a bonded pair. Setting
{cutbond} to be ~25% larger than the nearest-neighbor distance in a
crystalline lattice is a typical choice for solids, so that bonds
exist only between nearest neighbor pairs.
The {qfactor} argument is the limiting strain at which the bias
potential goes to 0.0. It is dimensionless, so a value of 0.3 means a
bond distance can be up to 30% larger or 30% smaller than the
equilibrium (quenched) R0ij distance and the two atoms in the bond
could still experience a non-zero bias force.
If {qfactor} is set too large, then transitions from one energy basin
to another are affected because the bias potential is non-zero at the
transition state (e.g. saddle point). If {qfactor} is set too small,
then little boost can be achieved because the Eij strain of some bond in
the system will (nearly) always exceed {qfactor}. A value of 0.3 for
{qfactor} is typically reasonable.
The {Vmax} argument is a fixed prefactor on the bias potential. There
is also a dynamic prefactor Cij, driven by the choice of {Btarget}
as discussed above. The product of these should be a value less than
the smallest barrier height for an event to occur. Otherwise the
applied bias potential may be large enough (when added to the
interatomic potential) to produce a local energy basin with a maximum
in the center. This can produce artificial energy minima in the same
basin that trap an atom. Or if Cij*{Vmax} is even larger, it may
induce one or more atoms to rapidly transition to another energy basin. Both
cases are "bad dynamics" which violate the assumptions of LHD that
guarantee an accelerated time-accurate trajectory of the system.
NOTE: It may seem that {Vmax} can be set to any value, and Cij will
compensate to reduce the overall prefactor if necessary. However the
Cij are initialized to 1.0 and the boostostatting procedure typically
operates slowly enough that there can be a time period of bad dynamics
if {Vmax} is set too large. A better strategy is to set {Vmax} to the
smallest barrier height for an event (the same as for GHD), so that
the Cij remain near unity.
The {Tequil} argument is the temperature at which the system is
simulated; see the comment above about the "fix
langevin"_fix_langevin.html thermostatting. It is also part of the
beta term in the exponential factor that determines how much boost is
achieved as a function of the bias potential. See the discussion of
the {Btarget} argument below.
As discussed above, the {Dcut} argument is the distance required
between two locally maxstrain bonds for them to both be selected as
biased bonds on the same timestep. Computationally, the larger {Dcut}
is, the more work (computation and communication) must be done each
timestep within the LHD algorithm. Likewise, the larger {Dcut} is, the
fewer bonds can be biased simultaneously, which may mean the specified
{Btarget} time acceleration cannot be achieved.
Physically, {Dcut} should be large enough that simultaneously biasing
two pairs of atoms separated by that distance will not influence the
dynamics of either pair, e.g. something like 2x the cutoff of the interatomic
potential. In practice a {Dcut} value of ~10 Angstroms seems to work
well for many solid-state systems.
NOTE: You must also ensure that ghost atom communication is performed
for a distance of at least {Dcut} + {cutevent} where {cutevent} = the
distance one or more atoms move (between quenched states) to be
considered an "event". It is an argument to the "compute
event/displace" command used to detect events. By default the ghost
communication distance is set by the pair_style cutoff, which will
typically be < {Dcut}. The "comm_modify cutoff"_comm_modify.html
command can be used to set the ghost cutoff explicitly, e.g.
comm_modify cutoff 12.0 :pre
This fix does not know the {cutevent} parameter, but uses half the
bond length as an estimate to warn if the ghost cutoff is not long
enough.
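For example, the local hyperdynamics input in examples/hyper uses
{Dcut} = 10.0 and a "compute
event/displace"_compute_event_displace.html cutoff of 1.1, and sets
the ghost cutoff to 12.0, which exceeds {Dcut} + {cutevent} with some
margin:
variable Dcut index 10.0
variable cutevent index 1.1
variable ghostcut index 12.0
comm_modify cutoff ${ghostcut} :pre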
As described above, the {alpha} argument is a pre-factor in the
boostostat update equation for each bond's Cij prefactor. {Alpha} is
specified in time units, similar to other thermostat or barostat
damping parameters. It is roughly the physical time it will take the
boostostat to adjust a Cij value that is too high (or too low) to
a correct one. An {alpha} setting of a few ps is typically good for
solid-state systems. Note that the {alpha} argument here is the
inverse of the alpha parameter discussed in
"(Voter2013)"_#Voter2013lhd.
The {Btarget} argument is the desired time boost factor (a value > 1)
that all the atoms in the system will experience. The elapsed time
t_hyper for an LHD simulation running for {N} timesteps is simply
t_hyper = Btarget * N*dt :pre
where dt is the timestep size defined by the "timestep"_timestep.html
command. The effective time acceleration due to LHD is thus t_hyper /
N*dt = Btarget, where N*dt is elapsed time for a normal MD run
of N timesteps.
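For example, the local hyperdynamics input in examples/hyper uses
{Btarget} = 4000 with {N} = 1500 steps and a 0.005 ps timestep, so
that t_hyper = 4000 * 1500 * 0.005 = 30000 ps = 30 ns, compared to
only 7.5 ps of direct MD for the same number of steps:
variable thyper equal 4000.0*1500*0.005    # Btarget * N * dt
print "t_hyper = ${thyper} ps" :pre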
You cannot choose an arbitrarily large setting for {Btarget}. The
maximum value you should choose is
Btarget = exp(beta * Vsmall) :pre
where Vsmall is the smallest event barrier height in your system, beta
= 1/kTequil, and {Tequil} is the specified temperature of the system
(both by this fix and the Langevin thermostat).
Note that if {Btarget} is set smaller than this, the LHD simulation
will run correctly. There will just be fewer events because the hyper
time (t_hyper equation above) will be shorter.
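As a hypothetical worked example: if the smallest barrier in your
system were ~0.4 eV (the {Vmax} value used in the bundled examples)
and {Tequil} = 400 K, the upper limit on {Btarget} would be
exp(0.4/kTequil), or roughly 1e5, so the {Btarget} = 4000 used in the
examples/hyper input is comfortably below that limit:
variable beta equal 1.0/(8.617e-5*400.0)   # 1/kTequil in 1/eV
variable Bmax equal exp(v_beta*0.4)        # assumed Vsmall = 0.4 eV
print "upper limit on Btarget = ${Bmax}" :pre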
NOTE: If you have no physical intuition as to the smallest barrier
height in your system, a reasonable strategy to determine the largest
{Btarget} you can use for an LHD model, is to run a sequence of
simulations with smaller and smaller {Btarget} values, until the event
rate does not change.
:line
[Restart, fix_modify, output, run start/stop, minimize info:]
No information about this fix is written to "binary restart
files"_restart.html.
The "fix_modify"_fix_modify.html {energy} option is supported by this
fix to add the energy of the bias potential to the system's
potential energy as part of "thermodynamic output"_thermo_style.html.
This fix computes a global scalar and global vector of length 23,
which can be accessed by various "output
commands"_Howto_output.html. The scalar is the magnitude of
the bias potential (energy units) applied on the current timestep,
summed over all biased bonds. The vector stores the following
quantities:
1 = # of biased bonds on this step
2 = max strain Eij of any bond on this step (unitless)
3 = average bias potential for all biased bonds on this step (energy units)
4 = average # of bonds/atom on this step
5 = average neighbor bonds/bond on this step within {Dcut} :ul
6 = fraction of steps and bonds with no bias during this run
7 = max drift distance of any atom during this run (distance units)
8 = max bond length during this run (distance units)
9 = average # of biased bonds/step during this run
10 = average bias potential for all biased bonds during this run (energy units)
11 = max bias potential for any biased bond during this run (energy units)
12 = min bias potential for any biased bond during this run (energy units)
13 = max distance from my sub-box of any ghost atom with maxstrain < qfactor during this run (distance units)
14 = max distance outside my box of any ghost atom with any maxstrain during this run (distance units)
15 = count of ghost neighbor atoms not found on reneighbor steps during this run
16 = count of lost bond partners during this run
17 = average bias coeff for lost bond partners during this run
18 = count of bias overlaps found during this run
19 = count of non-matching bias coefficients found during this run :ul
20 = cumulative hyper time since fix created (time units)
21 = cumulative count of event timesteps since fix created
22 = cumulative count of atoms in events since fix created
23 = cumulative # of new bonds since fix created :ul
The first quantities (1-5) are for the current timestep. Quantities
6-19 are for the current hyper run. They are reset each time a new
hyper run is performed. Quantities 20-23 are cumulative across
multiple runs (since the fix was defined in the input script).
For value 6, the numerator is a count of all biased bonds on every
timestep whose bias energy = 0.0 due to Eij >= {qfactor}. The
denominator is the count of all biased bonds on all timesteps.
For value 7, drift is the distance an atom moves between timesteps
when the bond list is reset, i.e. between events. Atoms involved in
an event will typically move the greatest distance since others are
typically oscillating around their lattice site.
For values 13 and 14, the maxstrain of a ghost atom is the maxstrain
of any bond it is part of, and it is checked for ghost atoms within
the bond neighbor cutoff.
Values 15-19 are mostly useful for debugging and diagnostic purposes.
For values 15-17, it is possible that a ghost atom owned by another
processor will move far enough (e.g. as part of an event-in-progress)
that it will no longer be within the communication cutoff distance for
acquiring ghost atoms. Likewise it may be a ghost atom bond partner
that cannot be found because it has moved too far. These values count
those occurrences. Because they typically involve atoms that are part
of events, they do not usually indicate bad dynamics. Value 17 is the
average bias coefficient for bonds where a partner atom was lost.
For value 18, no two bonds should be biased if they are within a
{Dcut} distance of each other. This value should be zero, indicating
that no pair of bonds "overlap", meaning they are closer than {Dcut}
from each other.
For value 19, the same bias coefficient is stored by both atoms in an
IJ bond. This value should be zero, indicating that for all bonds,
each atom in the bond stores a bias coefficient with the same
value.
Value 20 is simply the specified {Btarget} factor times the number of
timesteps times the timestep size.
For value 21, events are checked for by the "hyper"_hyper.html command
once every {Nevent} timesteps. This value is the count of those
timesteps on which one (or more) events was detected. It is NOT the
number of distinct events, since more than one event may occur in the
same {Nevent} time window.
For value 22, each time the "hyper"_hyper.html command checks for an
event, it invokes a compute to flag zero or more atoms as
participating in one or more events. E.g. atoms that have displaced
more than some distance from the previous quench state. Value 22 is
the cumulative count of atoms participating in any of
the events that were found.
Value 23 tallies the number of new bonds created by the bond reset
operation. Bonds between a specific I,J pair of atoms may persist for
the entire hyperdynamics simulation if neither I nor J is involved in
an event.
The scalar and vector values calculated by this fix are all
"intensive".
No parameter of this fix can be used with the {start/stop} keywords of
the "run"_run.html command. This fix is not invoked during "energy
minimization"_minimize.html.
[Restrictions:]
This fix is part of the REPLICA package. It is only enabled if LAMMPS
was built with that package. See the "Build package"_Build_package.html
doc page for more info.
[Related commands:]
"hyper"_hyper.html, "fix hyper/global"_fix_hyper_global.html
[Default:] None
:line
:link(Voter2013lhd)
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
144110 (2013).
:link(Mironlhd)
[(Miron)] R. A. Miron and K. A. Fichthorn, J Chem Phys, 119, 6210 (2003).

View File

@ -25,33 +25,32 @@ fix pl all plumed all plumed plumedfile plumed.dat outfile p.log
[Description:]
This fix instructs LAMMPS to call the PLUMED library, which allows one
to perform various forms of trajectory analysis on the fly and to also
use methods such as umbrella sampling and metadynamics to enhance the
sampling of phase space.
This fix instructs LAMMPS to call the "PLUMED"_plumedhome library, which
allows one to perform various forms of trajectory analysis on the fly
and to also use methods such as umbrella sampling and metadynamics to
enhance the sampling of phase space.
The documentation included here only describes the fix plumed command.
This command is LAMMPS specific whereas most of the functionality
implemented in PLUMED will work with a range of MD codes and also when
PLUMED is used as a stand alone code. The full documentation for PLUMED
is available at "this website"_http://www.plumed.org/documentation
The documentation included here only describes the fix plumed command
itself. This command is LAMMPS specific, whereas most of the
functionality implemented in PLUMED will work with a range of
MD codes, and also when PLUMED is used as a stand-alone code for analysis.
The full "documentation for PLUMED"_plumeddocs is available online and
included in the PLUMED source code. The PLUMED library development is
hosted at
"https://github.com/plumed/plumed2"_https://github.com/plumed/plumed2
A detailed discussion of the code can be found in "(PLUMED)"_#PLUMED.
The PLUMED library is developed at
"https://github.com/plumed/plumed2"_https://github.com/plumed/plumed2 A
detailed discussion of the code can be found in "(PLUMED)"_#PLUMED.
There are some example scripts for using this package with LAMMPS in the
There is an example input for using this package with LAMMPS in the
examples/USER/plumed directory.
:line
The command to call PLUMED above is reasonably self explanatory. Within
the input file for lammps the user is required to specify the input file
for PLUMED and a file on which to output the PLUMED log. The user must
specify both of these arguments every time PLUMED is to be used.
Furthermore, the fix plumed command should appear in the LAMMPS input
file after the relevant input paramters (e.g. the timestep) have been
set.
The command to make LAMMPS call PLUMED during a run requires two keyword/value
pairs pointing to the PLUMED input file and an output file for the
PLUMED log. The user must specify these arguments every time PLUMED is
to be used. Furthermore, the fix plumed command should appear in the
LAMMPS input file [after] relevant input parameters (e.g. the timestep)
have been set.
The {group-ID} entry is ignored. LAMMPS will always pass all the atoms
to PLUMED and there can only be one instance of the plumed fix at a
@ -64,10 +63,10 @@ functionality by only allowing only one plumed fix in the LAMMPS input.
The {plumedfile} keyword allows the user to specify the name of the
PLUMED input file. Instructions as to what should be included in a
plumed input file can be found in the "documentation for
PLUMED"_http://www.plumed.org/documentation.
PLUMED"_plumeddocs
The {outfile} keyword allows the user to specify the name of a file on
which to output the PLUMED log. This log file normally just parots the
which to output the PLUMED log. This log file normally just parrots the
information that is contained in the input file. The names of the files
on which the results from the various analyses that have been performed
using PLUMED will be specified by the user in the PLUMED input file.
@ -76,12 +75,13 @@ using PLUMED will be specified by the user in the PLUMED input file.
When performing a restart of a calculation that involves PLUMED you must
include a RESTART command in the PLUMED input file as detailed in the
"PLUMED documentation"_http://www.plumed.org/documentation. When the
restart command is found in the PLUMED input PLUMED will append to the
files that were generated in the run that was performed previously.
Furthermore, any history dependent bias potentials that were accumulated
in previous calculations will be read in when the restart command is
included in the PLUMED input.
"PLUMED documentation"_plumeddocs. When the restart command is found in
the PLUMED input PLUMED will append to the files that were generated in
the run that was performed previously. No part of the PLUMED restart
data is included in the LAMMPS restart files. Furthermore, any history
dependent bias potentials that were accumulated in previous calculations
will be read in when the RESTART command is included in the PLUMED
input.
The "fix_modify"_fix_modify.html {energy} option is not supported by
this fix.
@ -97,10 +97,7 @@ This fix is part of the USER-PLUMED package. It is only enabled if
LAMMPS was built with that package. See the "Build
package"_Build_package.html doc page for more info.
There can only be one plumed fix active at a time. Since the interface
communicates only the minimum amount of information and since the PLUMED
module itself can handle an arbitrary number of analysis and biasing
methods, this is not a limitation of functionality.
There can only be one plumed fix active at a time.
[Related commands:]
@ -115,3 +112,6 @@ The default options are plumedfile = NULL and outfile = NULL
:link(PLUMED)
[(PLUMED)] G.A. Tribello, M. Bonomi, D. Branduardi, C. Camilloni and G. Bussi, Comp. Phys. Comm 185, 604 (2014)
:link(plumeddocs,http://www.plumed.org/documentation)
:link(plumedhome,http://www.plumed.org/)

View File

@ -137,7 +137,8 @@ package"_Build_package.html doc page for more info.
"fix drag"_fix_drag.html, "fix spring"_fix_spring.html,
"fix spring/self"_fix_spring_self.html,
"fix spring/rg"_fix_spring_rg.html
"fix spring/rg"_fix_spring_rg.html,
"fix colvars"_fix_colvars.html, "fix plumed"_fix_plumed.html
[Default:] none

View File

@ -57,6 +57,8 @@ Fixes :h1
fix_grem
fix_halt
fix_heat
fix_hyper_global
fix_hyper_local
fix_imd
fix_indent
fix_ipi

192
doc/src/hyper.txt Normal file
View File

@ -0,0 +1,192 @@
"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
:link(lws,http://lammps.sandia.gov)
:link(ld,Manual.html)
:link(lc,Section_commands.html#comm)
:line
hyper command :h3
[Syntax:]
hyper N Nevent fix-ID compute-ID keyword values ... :pre
N = # of timesteps to run :ulb,l
Nevent = check for events every this many steps :l
fix-ID = ID of a fix that applies a global or local bias potential, can be NULL :l
compute-ID = ID of a compute that identifies when an event has occurred :l
zero or more keyword/value pairs may be appended :l
keyword = {min} or {dump} or {rebond} :l
{min} values = etol ftol maxiter maxeval
etol = stopping tolerance for energy, used in quenching
ftol = stopping tolerance for force, used in quenching
maxiter = max iterations of minimize, used in quenching
maxeval = max number of force/energy evaluations, used in quenching
{dump} value = dump-ID
dump-ID = ID of dump to trigger whenever an event takes place
{rebond} value = Nrebond
Nrebond = frequency at which to reset bonds, even if no event has occurred
:pre
:ule
[Examples:]
compute event all event/displace 1.0
fix HG mobile hyper/global 3.0 0.3 0.4 800.0
hyper 5000 100 HG event min 1.0e-6 1.0e-6 100 100 dump 1 dump 5 :pre
[Description:]
Run a bond-boost hyperdynamics (HD) simulation where time is
accelerated by application of a bias potential to one or more pairs of
nearby atoms in the system. This command can be used to run both
global and local hyperdynamics. In global HD a single bond within the
system is biased on each timestep. In local HD multiple bonds
(separated by a sufficient distance) can be biased simultaneously at
each timestep. In the bond-boost hyperdynamics context, a "bond" is
not a covalent bond between a pair of atoms in a molecule. Rather it
is simply a pair of nearby atoms as discussed below.
Both global and local HD are described in "(Voter2013)"_#Voter2013 by
Art Voter and collaborators. Similar to parallel replica dynamics
(PRD), global and local HD are methods for performing accelerated
dynamics that are suitable for infrequent-event systems that obey
first-order kinetics. A good overview of accelerated dynamics methods
for such systems is given in "(Voter2002)"_#Voter2002hd from the same
group. To quote from the review paper: "The dynamical evolution is
characterized by vibrational excursions within a potential basin,
punctuated by occasional transitions between basins." The transition
probability is characterized by p(t) = k*exp(-kt) where k is the rate
constant. Running multiple replicas gives an effective enhancement in
the timescale spanned by the multiple simulations, while waiting for
an event to occur.
Both HD and PRD produce a time-accurate trajectory that effectively
extends the timescale over which a system can be simulated, but they
do it differently. HD uses a single replica of the system and
accelerates time by biasing the interaction potential in a manner such
that each timestep is effectively longer. PRD creates Nr replicas of
the system and runs dynamics on each independently with a normal
unbiased potential until an event occurs in one of the replicas. The
time between events is reduced by a factor of Nr replicas. For both
methods, per CPU second, more physical time elapses and more events
occur. See the "prd"_prd.html doc page for more info about PRD.
An HD run has several stages, which are repeated each time an event
occurs, as explained below. The logic for an HD run is as follows:
quench
create initial list of bonds :pre
while (time remains):
run dynamics for Nevent steps
quench
check for an event
if event occurred: reset list of bonds
restore pre-quench state :pre
The list of bonds is the list of pairs of atoms that are within a
short cutoff distance of each other after the system energy is
minimized (quenched). This list is created and reset by a "fix
hyper/global"_fix_hyper_global.html or "fix
hyper/local"_fix_hyper_local.html command specified as {fix-ID}. At
every dynamics timestep, the same fix selects one or more bonds to
apply a bias potential to.
IMPORTANT NOTE: The style of fix associated with the specified
{fix-ID} determines whether you are running the global versus local
hyperdynamics algorithm.
Dynamics (with the bias potential) is run continuously, stopping every
{Nevent} steps to check if a transition event has occurred. The
specified {N} for total steps must be a multiple of {Nevent}. The check
is performed by quenching the system and comparing the resulting atom
coordinates to the coordinates from the previous basin.
A quench is an energy minimization and is performed by whichever
algorithm has been defined by the "min_style"_min_style.html command.
Minimization parameters may be set via the
"min_modify"_min_modify.html command and by the {min} keyword of the
hyper command. The latter are the settings that would be used with
the "minimize"_minimize.html command. Note that typically, you do not
need to perform a highly-converged minimization to detect a transition
event, though you may need to in order to prevent a set of atoms in
the system from relaxing to a saddle point.
The event check is performed by a compute with the specified
{compute-ID}. Currently there is only one compute that works with the
hyper command, which is the "compute
event/displace"_compute_event_displace.html command. Other
event-checking computes may be added. "Compute
event/displace"_compute_event_displace.html checks whether any atom in
the compute group has moved further than a specified threshold
distance. If so, an event has occurred.
If this happens, the list of bonds is reset, since some bond pairs
are likely now too far apart, and new pairs are likely close enough
to be considered a bond. The pre-quenched state of the
system (coordinates and velocities) is restored, and dynamics continue.
At the end of the hyper run, a variety of statistics are output to the
screen and logfile. These include info relevant to both global and
local hyperdynamics, such as the number of events and the elapsed
hyper time (accelerated time), as well as info specific to one or
the other, depending on which style of fix was specified by {fix-ID}.
:line
The optional keywords operate as follows.
As explained above, the {min} keyword can be used to specify
parameters for the quench. Their meaning is the same
as for the "minimize"_minimize.html command
The {dump} keyword can be used to trigger a specific dump command with
the specified {dump-ID} to output a snapshot each time an event is
detected. It can be specified multiple times with different {dump-ID}
values, as in the example above. These snapshots will be for the
quenched state of the system on a timestep that is a multiple of
{Nevent}, i.e. a timestep after the event has occurred. Note that any
dump command in the input script will also output snapshots at
whatever timestep interval it defines via its {N} argument; see the
"dump"_dump.html command for details. This means if you only want a
particular dump to output snapshots when events are detected, you
should specify its {N} as a value larger than the length of the
hyperdynamics run.
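For example, the global hyperdynamics input in examples/hyper runs for
100000 steps but declares its dump with {N} = 1000000, so snapshots
are written only when triggered by the {dump} keyword of the hyper
command. A simplified version of those two lines (with a plain
atom-style dump substituted for the image dump used in the actual
example) looks like this:
dump 1 all atom 1000000 dump.hyper
hyper 100000 1000 HG event min 1.0e-6 1.0e-6 100 100 dump 1 :pre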
As in the code logic above, the bond list is normally only reset when
an event occurs. The {rebond} keyword will force a reset of the bond
list every {Nrebond} steps, even if an event has not occurred.
{Nrebond} must be a multiple of {Nevent}. This can be useful to check
if more frequent resets alter event statistics, perhaps because the
parameters chosen for defining what is a bond and what is an event are
producing bad dynamics in the presence of the bias potential.
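For example (hypothetical values), resetting the bond list every 10000
steps during a 100000-step run with {Nevent} = 1000, using the fix and
compute IDs from the Examples section above, could be requested as:
hyper 100000 1000 HG event rebond 10000 :pre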
:line
[Restrictions:]
This command can only be used if LAMMPS was built with the REPLICA
package. See the "Build package"_Build_package.html doc
page for more info.
[Related commands:]
"fix hyper/global"_fix_hyper_global.html, "fix
hyper/local"_fix_hyper_local.html, "compute
event/displace"_compute_event_displace.html, "prd"_prd.html
[Default:]
The option defaults are min = 0.1 0.1 40 50.
:line
:link(Voter2013)
[(Voter2013)] S. Y. Kim, D. Perez, A. F. Voter, J Chem Phys, 139,
144110 (2013).
:link(Voter2002hd)
[(Voter2002)] Voter, Montalenti, Germann, Annual Review of Materials
Research 32, 321 (2002).

View File

@ -160,6 +160,7 @@ dump_cfg_uef.html
echo.html
group.html
group2ndx.html
hyper.html
if.html
include.html
info.html
@ -277,6 +278,8 @@ fix_gravity.html
fix_grem.html
fix_halt.html
fix_heat.html
fix_hyper_global.html
fix_hyper_local.html
fix_imd.html
fix_indent.html
fix_ipi.html

View File

@ -48,11 +48,12 @@ replicas of a system. One or more replicas can be used. The total
number of steps {N} to run can be interpreted in one of two ways; see
discussion of the {time} keyword below.
PRD is described in "this paper"_#Voter1998 by Art Voter. It is a method
for performing accelerated dynamics that is suitable for
infrequent-event systems that obey first-order kinetics. A good
overview of accelerated dynamics methods for such systems in given in
"this review paper"_#Voter2002prd from the same group. To quote from the
PRD is described in "(Voter1998)"_#Voter1998 by Art Voter. Similar to
global or local hyperdynamics (HD), PRD is a method for performing
accelerated dynamics that is suitable for infrequent-event systems
that obey first-order kinetics. A good overview of accelerated
dynamics methods for such systems is given in this review paper
"(Voter2002)"_#Voter2002prd from Art's group. To quote from the
paper: "The dynamical evolution is characterized by vibrational
excursions within a potential basin, punctuated by occasional
transitions between basins." The transition probability is
@ -61,15 +62,26 @@ Running multiple replicas gives an effective enhancement in the
timescale spanned by the multiple simulations, while waiting for an
event to occur.
Each replica runs on a partition of one or more processors. Processor
partitions are defined at run-time using the "-partition command-line
switch"_Run_options.html. Note that if you have MPI installed, you
can run a multi-replica simulation with more replicas (partitions)
than you have physical processors, e.g you can run a 10-replica
simulation on one or two processors. However for PRD, this makes
little sense, since running a replica on virtual instead of physical
processors,offers no effective parallel speed-up in searching for
infrequent events. See the "Howto replica"_Howto_replica.html doc
Both PRD and HD produce a time-accurate trajectory that effectively
extends the timescale over which a system can be simulated, but they
do it differently. PRD creates Nr replicas of the system and runs
dynamics on each independently with a normal unbiased potential until
an event occurs in one of the replicas. The time between events is
reduced by a factor of Nr replicas. HD uses a single replica of the
system and accelerates time by biasing the interaction potential in a
manner such that each timestep is effectively longer. For both
methods, per CPU second, more physical time elapses and more events
occur. See the "hyper"_hyper.html doc page for more info about HD.
In PRD, each replica runs on a partition of one or more processors.
Processor partitions are defined at run-time using the "-partition
command-line switch"_Run_options.html. Note that if you have MPI
installed, you can run a multi-replica simulation with more replicas
(partitions) than you have physical processors, e.g. you can run a
10-replica simulation on one or two processors. However for PRD, this
makes little sense, since running a replica on virtual instead of
physical processors offers no effective parallel speed-up in searching
for infrequent events. See the "Howto replica"_Howto_replica.html doc
page for further discussion.
When a PRD simulation is performed, it is assumed that each replica is
@ -78,8 +90,8 @@ I.e. the simulation domain, the number of atoms, the interaction
potentials, etc should be the same for every replica.
A PRD run has several stages, which are repeated each time an "event"
occurs in one of the replicas, as defined below. The logic for a PRD
run is as follows:
occurs in one of the replicas, as explained below. The logic for a
PRD run is as follows:
while (time remains):
dephase for n_dephase*t_dephase steps
@ -129,7 +141,8 @@ Minimization parameters may be set via the
PRD command. The latter are the settings that would be used with the
"minimize"_minimize.html command. Note that typically, you do not
need to perform a highly-converged minimization to detect a transition
event.
event, though you may need to in order to prevent a set of atoms in
the system from relaxing to a saddle point.
The event check is performed by a compute with the specified
{compute-ID}. Currently there is only one compute that works with the
@ -307,7 +320,7 @@ deposit"_fix_deposit.html.
"min_modify"_min_modify.html, "min_style"_min_style.html,
"run_style"_run_style.html, "minimize"_minimize.html,
"velocity"_velocity.html, "temper"_temper.html, "neb"_neb.html,
"tad"_tad.html
"tad"_tad.html, "hyper"_hyper.html
[Default:]

View File

@ -78,6 +78,7 @@ friction: frictional contact of spherical asperities between 2d surfaces
gcmc: Grand Canonical Monte Carlo (GCMC) via the fix gcmc command
granregion: use of fix wall/region/gran as boundary on granular particles
hugoniostat: Hugoniostat shock dynamics
hyper: global and local hyperdynamics of diffusion on Pt surface
indent: spherical indenter into a 2d solid
kim: use of potentials in Knowledge Base for Interatomic Models (KIM)
latte: use of LATTE density-functional tight-binding quantum code

View File

@ -0,0 +1,184 @@
create_atoms 1 single 27.5 9.5 4
create_atoms 1 single 16 9 4
create_atoms 1 single 10 12 4
create_atoms 1 single 31 44 4
create_atoms 1 single 13 17 4
create_atoms 1 single 8.5 28.5 4
create_atoms 1 single 23 26 4
create_atoms 1 single 38 27 4
create_atoms 1 single 37.5 4.5 4
create_atoms 1 single 41.5 47.5 4
create_atoms 1 single 20.5 37.5 4
create_atoms 1 single 5 8 4
create_atoms 1 single 2.5 16.5 4
create_atoms 1 single 38.5 45.5 4
create_atoms 1 single 9 0 4
create_atoms 1 single 39 32 4
create_atoms 1 single 45.5 11.5 4
create_atoms 1 single 40 0 4
create_atoms 1 single 44.5 2.5 4
create_atoms 1 single 4.5 44.5 4
create_atoms 1 single 24.5 13.5 4
create_atoms 1 single 47.5 23.5 4
create_atoms 1 single 1 20 4
create_atoms 1 single 38.5 31.5 4
create_atoms 1 single 12.5 12.5 4
create_atoms 1 single 2 27 4
create_atoms 1 single 21 5 4
create_atoms 1 single 47 12 4
create_atoms 1 single 32.5 46.5 4
create_atoms 1 single 9.5 40.5 4
create_atoms 1 single 8.5 2.5 4
create_atoms 1 single 41.5 22.5 4
create_atoms 1 single 29 11 4
create_atoms 1 single 3.5 3.5 4
create_atoms 1 single 5 21 4
create_atoms 1 single 46.5 31.5 4
create_atoms 1 single 35 46 4
create_atoms 1 single 40.5 41.5 4
create_atoms 1 single 10 22 4
create_atoms 1 single 43.5 14.5 4
create_atoms 1 single 42 42 4
create_atoms 1 single 4 26 4
create_atoms 1 single 19 34 4
create_atoms 1 single 33 9 4
create_atoms 1 single 0.5 45.5 4
create_atoms 1 single 30.5 32.5 4
create_atoms 1 single 25.5 5.5 4
create_atoms 1 single 47.5 39.5 4
create_atoms 1 single 15 13 4
create_atoms 1 single 21 21 4
create_atoms 1 single 14 28 4
create_atoms 1 single 9 34 4
create_atoms 1 single 7 38 4
create_atoms 1 single 11 35 4
create_atoms 1 single 20.5 45.5 4
create_atoms 1 single 30.5 31.5 4
create_atoms 1 single 32.5 2.5 4
create_atoms 1 single 21.5 3.5 4
create_atoms 1 single 23 12 4
create_atoms 1 single 4.5 33.5 4
create_atoms 1 single 46 43 4
create_atoms 1 single 42.5 45.5 4
create_atoms 1 single 4.5 10.5 4
create_atoms 1 single 33.5 15.5 4
create_atoms 1 single 24 5 4
create_atoms 1 single 13 16 4
create_atoms 1 single 16.5 23.5 4
create_atoms 1 single 45.5 28.5 4
create_atoms 1 single 44.5 5.5 4
create_atoms 1 single 27.5 46.5 4
create_atoms 1 single 44.5 12.5 4
create_atoms 1 single 12 41 4
create_atoms 1 single 6 4 4
create_atoms 1 single 31.5 10.5 4
create_atoms 1 single 1 44 4
create_atoms 1 single 31 4 4
create_atoms 1 single 21 33 4
create_atoms 1 single 3 33 4
create_atoms 1 single 15 10 4
create_atoms 1 single 28.5 22.5 4
create_atoms 1 single 43 1 4
create_atoms 1 single 3.5 0.5 4
create_atoms 1 single 41 37 4
create_atoms 1 single 18.5 43.5 4
create_atoms 1 single 17 27 4
create_atoms 1 single 3 5 4
create_atoms 1 single 18.5 23.5 4
create_atoms 1 single 31.5 14.5 4
create_atoms 1 single 41 31 4
create_atoms 1 single 22 3 4
create_atoms 1 single 14.5 40.5 4
create_atoms 1 single 9 38 4
create_atoms 1 single 36 42 4
create_atoms 1 single 33 22 4
create_atoms 1 single 15.5 47.5 4
create_atoms 1 single 3 0 4
create_atoms 1 single 25.5 27.5 4
create_atoms 1 single 2.5 28.5 4
create_atoms 1 single 29.5 28.5 4
create_atoms 1 single 44.5 18.5 4
create_atoms 1 single 26 40 4
create_atoms 1 single 41 27 4
create_atoms 1 single 39.5 5.5 4
create_atoms 1 single 3 38 4
create_atoms 1 single 35 29 4
create_atoms 1 single 11 19 4
create_atoms 1 single 18 1 4
create_atoms 1 single 39.5 40.5 4
create_atoms 1 single 46 17 4
create_atoms 1 single 1.5 23.5 4
create_atoms 1 single 28.5 23.5 4
create_atoms 1 single 10 28 4
create_atoms 1 single 19 47 4
create_atoms 1 single 10.5 16.5 4
create_atoms 1 single 38 45 4
create_atoms 1 single 42.5 41.5 4
create_atoms 1 single 47.5 42.5 4
create_atoms 1 single 38 7 4
create_atoms 1 single 10 44 4
create_atoms 1 single 29.5 27.5 4
create_atoms 1 single 45 30 4
create_atoms 1 single 3 9 4
create_atoms 1 single 8.5 35.5 4
create_atoms 1 single 24 44 4
create_atoms 1 single 47 4 4
create_atoms 1 single 7.5 8.5 4
create_atoms 1 single 32.5 41.5 4
create_atoms 1 single 0.5 34.5 4
create_atoms 1 single 11 8 4
create_atoms 1 single 2 40 4
create_atoms 1 single 25 24 4
create_atoms 1 single 47.5 6.5 4
create_atoms 1 single 39.5 28.5 4
create_atoms 1 single 17 21 4
create_atoms 1 single 32 43 4
create_atoms 1 single 16.5 29.5 4
create_atoms 1 single 34 34 4
create_atoms 1 single 11.5 3.5 4
create_atoms 1 single 39 22 4
create_atoms 1 single 24.5 36.5 4
create_atoms 1 single 33 31 4
create_atoms 1 single 35.5 35.5 4
create_atoms 1 single 14.5 34.5 4
create_atoms 1 single 34 28 4
create_atoms 1 single 37 41 4
create_atoms 1 single 33 46 4
create_atoms 1 single 27.5 28.5 4
create_atoms 1 single 40.5 22.5 4
create_atoms 1 single 27.5 1.5 4
create_atoms 1 single 12 2 4
create_atoms 1 single 36 43 4
create_atoms 1 single 28.5 9.5 4
create_atoms 1 single 20.5 25.5 4
create_atoms 1 single 3 3 4
create_atoms 1 single 38 33 4
create_atoms 1 single 3 20 4
create_atoms 1 single 35 11 4
create_atoms 1 single 5 25 4
create_atoms 1 single 36.5 6.5 4
create_atoms 1 single 19.5 24.5 4
create_atoms 1 single 27 41 4
create_atoms 1 single 39.5 11.5 4
create_atoms 1 single 21.5 2.5 4
create_atoms 1 single 46.5 15.5 4
create_atoms 1 single 13 24 4
create_atoms 1 single 11 37 4
create_atoms 1 single 11.5 31.5 4
create_atoms 1 single 47 0 4
create_atoms 1 single 25.5 17.5 4
create_atoms 1 single 32 11 4
create_atoms 1 single 8 17 4
create_atoms 1 single 27.5 12.5 4
create_atoms 1 single 25 7 4
create_atoms 1 single 25.5 37.5 4
create_atoms 1 single 12 15 4
create_atoms 1 single 1 7 4
create_atoms 1 single 18.5 47.5 4
create_atoms 1 single 5 38 4
create_atoms 1 single 42 19 4
create_atoms 1 single 30.5 7.5 4
create_atoms 1 single 42.5 7.5 4
create_atoms 1 single 26.5 18.5 4
create_atoms 1 single 18.5 1.5 4
create_atoms 1 single 41.5 10.5 4

(4 new binary image files, 69-71 KiB each; content not shown)

View File

@ -0,0 +1,95 @@
# 3d EAM surface for global HD
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
# hop event on (100) surface is same distance
# exchange event is 2 atoms moving same distance
variable Tequil index 500.0
variable Vmax index 0.5
variable qfactor index 0.3
variable cutbond index 3.2
variable cutevent index 1.1
variable steps index 100000
variable nevent index 1000
variable zoom index 1.8
units metal
atom_style atomic
atom_modify map array
boundary p p p
lattice fcc 3.92
region box block 0 6 0 6 0 4
create_box 3 box
create_atoms 1 box
mass * 1.0
change_box all z final -0.1 5.0 boundary p p f
create_atoms 2 single 3.5 3.5 4
# define frozen substrate and mobile atoms
group adatom type 2
region base block INF INF INF INF 0 1.8
set region base type 3
group base type 3
group mobile type 1 2
# pair style
pair_style eam/alloy
pair_coeff * * ptvoterlammps.eam Pt Pt Pt
neighbor 0.5 bin
neigh_modify every 1 delay 5 check yes
fix 1 mobile nve
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 858872873 zero yes
timestep 0.005
compute tmobile mobile temp
thermo 100
thermo_modify temp tmobile
# thermal equilibration
run 1000
reset_timestep 0
# pin base so will not move during quenches
fix freeze base setforce 0.0 0.0 0.0
# event detection
compute event all event/displace ${cutevent}
# hyper/global
fix HG mobile hyper/global ${cutbond} ${qfactor} ${Vmax} ${Tequil}
# thermo output
thermo_style custom step temp pe f_HG f_HG[*]
thermo_modify lost ignore
thermo_modify temp tmobile
thermo ${nevent}
# dump output options
region substrate block INF INF INF INF 1.8 3.8
region adatoms block INF INF INF INF 3.8 INF
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
dump 1 all image 1000000 global.*.jpg v_acolor type &
zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
# run
hyper ${steps} ${nevent} HG event min 1.0e-6 1.0e-6 100 100 dump 1

View File

@ -0,0 +1,112 @@
# 3d EAM surface for local HD
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
# hop event on (100) surface is same distance
# exchange event is 2 atoms moving same distance
variable Tequil index 400.0
variable Vmax index 0.4
variable qfactor index 0.3
variable cutbond index 3.2
variable Dcut index 10.0
variable cutevent index 1.1
variable alpha index 200.0
variable boost index 4000.0
variable ghostcut index 12.0
variable steps index 1500
variable nevent index 100
variable nx index 8
variable ny index 8
variable zoom index 1.8
variable seed index 826626413
variable tol index 1.0e-15
variable add index 37K
units metal
atom_style atomic
atom_modify map array
boundary p p p
comm_modify cutoff ${ghostcut}
lattice fcc 3.92
region box block 0 6 0 6 0 4
create_box 2 box
create_atoms 1 box
mass * 1.0
change_box all z final -0.1 5.0 boundary p p f
# replicate in xy
replicate ${nx} ${ny} 1
# add adatoms
include adatoms.list.${add}
# define frozen substrate and mobile atoms
region base block INF INF INF INF 0 1.8
set region base type 2
group base type 2
group mobile type 1
# pair style
pair_style eam/alloy
pair_coeff * * ptvoterlammps.eam Pt Pt
neighbor 0.5 bin
neigh_modify every 1 delay 5 check yes
fix 1 mobile nve
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 ${seed} zero yes
timestep 0.005
compute tmobile mobile temp
thermo 100
thermo_modify temp tmobile
# thermal equilibration
run 1000
reset_timestep 0
# pin base so will not move during quenches
fix freeze base setforce 0.0 0.0 0.0
# event detection
compute event all event/displace ${cutevent}
# hyper/local
fix HL mobile hyper/local ${cutbond} ${qfactor} ${Vmax} ${Tequil} &
${Dcut} ${alpha} ${boost}
# thermo output
thermo_style custom step temp pe f_HL f_HL[*]
thermo_modify lost ignore
thermo_modify temp tmobile
thermo ${nevent}
# dump
region substrate block INF INF INF INF 1.8 3.8
region adatoms block INF INF INF INF 3.8 INF
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 &
zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
# run
hyper ${steps} ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1

(4 new binary image files, 477-482 KiB each; content not shown)

File diff suppressed because it is too large.

View File

@ -0,0 +1,993 @@
LAMMPS (10 Oct 2018)
# 3d EAM surface for local HD
# nearest neighbor distance = a * sqrt(2)/2 = 2.77 Angs for Pt with a = 3.92
# hop event on (100) surface is same distance
# exchange event is 2 atoms moving same distance
variable Tequil index 400.0
variable Vmax index 0.4
variable qfactor index 0.3
variable cutbond index 3.2
variable Dcut index 10.0
variable cutevent index 1.1
variable alpha index 200.0
variable boost index 4000.0
variable ghostcut index 12.0
variable steps index 1500
variable nevent index 100
variable nx index 8
variable ny index 8
variable zoom index 1.8
variable seed index 826626413
variable tol index 1.0e-15
variable add index 37K
units metal
atom_style atomic
atom_modify map array
boundary p p p
comm_modify cutoff ${ghostcut}
comm_modify cutoff 12.0
lattice fcc 3.92
Lattice spacing in x,y,z = 3.92 3.92 3.92
region box block 0 6 0 6 0 4
create_box 2 box
Created orthogonal box = (0 0 0) to (23.52 23.52 15.68)
2 by 4 by 2 MPI processor grid
create_atoms 1 box
Created 576 atoms
Time spent = 0.00108504 secs
mass * 1.0
change_box all z final -0.1 5.0 boundary p p f
orthogonal box = (0 0 -0.392) to (23.52 23.52 19.6)
# replicate in xy
replicate ${nx} ${ny} 1
replicate 8 ${ny} 1
replicate 8 8 1
orthogonal box = (0 0 -0.392) to (188.16 188.16 19.6)
4 by 4 by 1 MPI processor grid
36864 atoms
Time spent = 0.0028758 secs
# add adatoms
include adatoms.list.${add}
include adatoms.list.37K
create_atoms 1 single 27.5 9.5 4
Created 1 atoms
Time spent = 0.000183105 secs
create_atoms 1 single 16 9 4
Created 1 atoms
Time spent = 0.000178099 secs
create_atoms 1 single 10 12 4
Created 1 atoms
Time spent = 0.000179768 secs
create_atoms 1 single 31 44 4
Created 1 atoms
Time spent = 0.000184059 secs
create_atoms 1 single 13 17 4
Created 1 atoms
Time spent = 0.000173807 secs
create_atoms 1 single 8.5 28.5 4
Created 1 atoms
Time spent = 0.000167847 secs
create_atoms 1 single 23 26 4
Created 1 atoms
Time spent = 0.000179052 secs
create_atoms 1 single 38 27 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 37.5 4.5 4
Created 1 atoms
Time spent = 0.000166178 secs
create_atoms 1 single 41.5 47.5 4
Created 1 atoms
Time spent = 0.000172138 secs
create_atoms 1 single 20.5 37.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 5 8 4
Created 1 atoms
Time spent = 0.00018096 secs
create_atoms 1 single 2.5 16.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 38.5 45.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 9 0 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 39 32 4
Created 1 atoms
Time spent = 0.000170946 secs
create_atoms 1 single 45.5 11.5 4
Created 1 atoms
Time spent = 0.00018096 secs
create_atoms 1 single 40 0 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 44.5 2.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 4.5 44.5 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 24.5 13.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 47.5 23.5 4
Created 1 atoms
Time spent = 0.00018096 secs
create_atoms 1 single 1 20 4
Created 1 atoms
Time spent = 0.000166893 secs
create_atoms 1 single 38.5 31.5 4
Created 1 atoms
Time spent = 0.000168085 secs
create_atoms 1 single 12.5 12.5 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 2 27 4
Created 1 atoms
Time spent = 0.000188828 secs
create_atoms 1 single 21 5 4
Created 1 atoms
Time spent = 0.000174999 secs
create_atoms 1 single 47 12 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 32.5 46.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 9.5 40.5 4
Created 1 atoms
Time spent = 0.000166893 secs
create_atoms 1 single 8.5 2.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 41.5 22.5 4
Created 1 atoms
Time spent = 0.000174046 secs
create_atoms 1 single 29 11 4
Created 1 atoms
Time spent = 0.000166893 secs
create_atoms 1 single 3.5 3.5 4
Created 1 atoms
Time spent = 0.000165224 secs
create_atoms 1 single 5 21 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 46.5 31.5 4
Created 1 atoms
Time spent = 0.000166178 secs
create_atoms 1 single 35 46 4
Created 1 atoms
Time spent = 0.000183105 secs
create_atoms 1 single 40.5 41.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 10 22 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 43.5 14.5 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 42 42 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 4 26 4
Created 1 atoms
Time spent = 0.000174999 secs
create_atoms 1 single 19 34 4
Created 1 atoms
Time spent = 0.000163078 secs
create_atoms 1 single 33 9 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 0.5 45.5 4
Created 1 atoms
Time spent = 0.000163078 secs
create_atoms 1 single 30.5 32.5 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 25.5 5.5 4
Created 1 atoms
Time spent = 0.000178099 secs
create_atoms 1 single 47.5 39.5 4
Created 1 atoms
Time spent = 0.000165939 secs
create_atoms 1 single 15 13 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 21 21 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 14 28 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 9 34 4
Created 1 atoms
Time spent = 0.000174999 secs
create_atoms 1 single 7 38 4
Created 1 atoms
Time spent = 0.000175953 secs
create_atoms 1 single 11 35 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 20.5 45.5 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 30.5 31.5 4
Created 1 atoms
Time spent = 0.000159979 secs
create_atoms 1 single 32.5 2.5 4
Created 1 atoms
Time spent = 0.000166178 secs
create_atoms 1 single 21.5 3.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 23 12 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 4.5 33.5 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 46 43 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 42.5 45.5 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 4.5 10.5 4
Created 1 atoms
Time spent = 0.000158072 secs
create_atoms 1 single 33.5 15.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 24 5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 13 16 4
Created 1 atoms
Time spent = 0.000158072 secs
create_atoms 1 single 16.5 23.5 4
Created 1 atoms
Time spent = 0.000156164 secs
create_atoms 1 single 45.5 28.5 4
Created 1 atoms
Time spent = 0.000247002 secs
create_atoms 1 single 44.5 5.5 4
Created 1 atoms
Time spent = 0.000156164 secs
create_atoms 1 single 27.5 46.5 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 44.5 12.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 12 41 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 6 4 4
Created 1 atoms
Time spent = 0.0001688 secs
create_atoms 1 single 31.5 10.5 4
Created 1 atoms
Time spent = 0.00015521 secs
create_atoms 1 single 1 44 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 31 4 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 21 33 4
Created 1 atoms
Time spent = 0.000156879 secs
create_atoms 1 single 3 33 4
Created 1 atoms
Time spent = 0.000164032 secs
create_atoms 1 single 15 10 4
Created 1 atoms
Time spent = 0.0001719 secs
create_atoms 1 single 28.5 22.5 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 43 1 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 3.5 0.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 41 37 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 18.5 43.5 4
Created 1 atoms
Time spent = 0.000213146 secs
create_atoms 1 single 17 27 4
Created 1 atoms
Time spent = 0.000159979 secs
create_atoms 1 single 3 5 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 18.5 23.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 31.5 14.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 41 31 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 22 3 4
Created 1 atoms
Time spent = 0.00015521 secs
create_atoms 1 single 14.5 40.5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 9 38 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 36 42 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 33 22 4
Created 1 atoms
Time spent = 0.000163078 secs
create_atoms 1 single 15.5 47.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 3 0 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 25.5 27.5 4
Created 1 atoms
Time spent = 0.000176907 secs
create_atoms 1 single 2.5 28.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 29.5 28.5 4
Created 1 atoms
Time spent = 0.000162125 secs
create_atoms 1 single 44.5 18.5 4
Created 1 atoms
Time spent = 0.000152826 secs
create_atoms 1 single 26 40 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 41 27 4
Created 1 atoms
Time spent = 0.000158072 secs
create_atoms 1 single 39.5 5.5 4
Created 1 atoms
Time spent = 0.000155926 secs
create_atoms 1 single 3 38 4
Created 1 atoms
Time spent = 0.000152826 secs
create_atoms 1 single 35 29 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 11 19 4
Created 1 atoms
Time spent = 0.000164986 secs
create_atoms 1 single 18 1 4
Created 1 atoms
Time spent = 0.000146866 secs
create_atoms 1 single 39.5 40.5 4
Created 1 atoms
Time spent = 0.000146866 secs
create_atoms 1 single 46 17 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 1.5 23.5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 28.5 23.5 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 10 28 4
Created 1 atoms
Time spent = 0.000159979 secs
create_atoms 1 single 19 47 4
Created 1 atoms
Time spent = 0.000148058 secs
create_atoms 1 single 10.5 16.5 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 38 45 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 42.5 41.5 4
Created 1 atoms
Time spent = 0.000161886 secs
create_atoms 1 single 47.5 42.5 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 38 7 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 10 44 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 29.5 27.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 45 30 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 3 9 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 8.5 35.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 24 44 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 47 4 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 7.5 8.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 32.5 41.5 4
Created 1 atoms
Time spent = 0.000157833 secs
create_atoms 1 single 0.5 34.5 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 11 8 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 2 40 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 25 24 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 47.5 6.5 4
Created 1 atoms
Time spent = 0.000147104 secs
create_atoms 1 single 39.5 28.5 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 17 21 4
Created 1 atoms
Time spent = 0.000164032 secs
create_atoms 1 single 32 43 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 16.5 29.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 34 34 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 11.5 3.5 4
Created 1 atoms
Time spent = 0.000154018 secs
create_atoms 1 single 39 22 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 24.5 36.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 33 31 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 35.5 35.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 14.5 34.5 4
Created 1 atoms
Time spent = 0.000146866 secs
create_atoms 1 single 34 28 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 37 41 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 33 46 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 27.5 28.5 4
Created 1 atoms
Time spent = 0.000145197 secs
create_atoms 1 single 40.5 22.5 4
Created 1 atoms
Time spent = 0.000150919 secs
create_atoms 1 single 27.5 1.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 12 2 4
Created 1 atoms
Time spent = 0.000151873 secs
create_atoms 1 single 36 43 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 28.5 9.5 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 20.5 25.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 3 3 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 38 33 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 3 20 4
Created 1 atoms
Time spent = 0.000154972 secs
create_atoms 1 single 35 11 4
Created 1 atoms
Time spent = 0.000145912 secs
create_atoms 1 single 5 25 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 36.5 6.5 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 19.5 24.5 4
Created 1 atoms
Time spent = 0.000236988 secs
create_atoms 1 single 27 41 4
Created 1 atoms
Time spent = 0.000169992 secs
create_atoms 1 single 39.5 11.5 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 21.5 2.5 4
Created 1 atoms
Time spent = 0.000136852 secs
create_atoms 1 single 46.5 15.5 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 13 24 4
Created 1 atoms
Time spent = 0.000137091 secs
create_atoms 1 single 11 37 4
Created 1 atoms
Time spent = 0.000144005 secs
create_atoms 1 single 11.5 31.5 4
Created 1 atoms
Time spent = 0.000144958 secs
create_atoms 1 single 47 0 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 25.5 17.5 4
Created 1 atoms
Time spent = 0.00014019 secs
create_atoms 1 single 32 11 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 8 17 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 27.5 12.5 4
Created 1 atoms
Time spent = 0.000137806 secs
create_atoms 1 single 25 7 4
Created 1 atoms
Time spent = 0.000146151 secs
create_atoms 1 single 25.5 37.5 4
Created 1 atoms
Time spent = 0.000139952 secs
create_atoms 1 single 12 15 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 1 7 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 18.5 47.5 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 5 38 4
Created 1 atoms
Time spent = 0.000136852 secs
create_atoms 1 single 42 19 4
Created 1 atoms
Time spent = 0.000149012 secs
create_atoms 1 single 30.5 7.5 4
Created 1 atoms
Time spent = 0.000138044 secs
create_atoms 1 single 42.5 7.5 4
Created 1 atoms
Time spent = 0.000138998 secs
create_atoms 1 single 26.5 18.5 4
Created 1 atoms
Time spent = 0.000153065 secs
create_atoms 1 single 18.5 1.5 4
Created 1 atoms
Time spent = 0.000137091 secs
create_atoms 1 single 41.5 10.5 4
Created 1 atoms
Time spent = 0.000140905 secs
# define frozen substrate and mobile atoms
region base block INF INF INF INF 0 1.8
set region base type 2
18432 settings made for type
group base type 2
18432 atoms in group base
group mobile type 1
18616 atoms in group mobile
# pair style
pair_style eam/alloy
pair_coeff * * ptvoterlammps.eam Pt Pt
neighbor 0.5 bin
neigh_modify every 1 delay 5 check yes
fix 1 mobile nve
fix 2 mobile langevin ${Tequil} ${Tequil} 1.0 ${seed} zero yes
fix 2 mobile langevin 400.0 ${Tequil} 1.0 ${seed} zero yes
fix 2 mobile langevin 400.0 400.0 1.0 ${seed} zero yes
fix 2 mobile langevin 400.0 400.0 1.0 826626413 zero yes
timestep 0.005
compute tmobile mobile temp
thermo 100
thermo_modify temp tmobile
WARNING: Temperature for thermo pressure is not for group all (../thermo.cpp:488)
# thermal equilibration
run 1000
Neighbor list info ...
update every 1 steps, delay 5 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.07583
ghost atom cutoff = 12
binsize = 3.03792, bins = 62 62 7
1 neighbor lists, perpetual/occasional/extra = 1 0 0
(1) pair eam/alloy, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d/newton
bin: standard
Per MPI rank memory allocation (min/avg/max) = 3.359 | 3.359 | 3.36 Mbytes
Step Temp E_pair E_mol TotEng Press
0 0 -206220.22 0 -206220.22 -52155.664
100 188.18127 -206044.43 0 -205591.63 -25068.83
200 274.34464 -205860.78 0 -205200.66 -40191.797
300 325.66286 -205750.01 0 -204966.4 -31510.222
400 352.48242 -205675.42 0 -204827.28 -35058.064
500 370.88571 -205619.66 0 -204727.25 -32735.022
600 388.62129 -205592.87 0 -204657.78 -33904.556
700 389.54874 -205579.73 0 -204642.4 -32769.852
800 395.56074 -205576.82 0 -204625.03 -33755.948
900 398.03458 -205564.48 0 -204606.74 -32777.103
1000 401.24089 -205562.85 0 -204597.4 -33785.341
Loop time of 4.3687 on 16 procs for 1000 steps with 37048 atoms
Performance: 98.885 ns/day, 0.243 hours/ns, 228.901 timesteps/s
98.4% CPU use with 16 MPI tasks x no OpenMP threads
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 3.2988 | 3.3828 | 3.4667 | 2.3 | 77.43
Neigh | 0.20856 | 0.23127 | 0.24382 | 1.9 | 5.29
Comm | 0.33313 | 0.45075 | 0.55485 | 9.2 | 10.32
Output | 0.00042987 | 0.00044042 | 0.00049591 | 0.0 | 0.01
Modify | 0.18811 | 0.28363 | 0.36798 | 9.7 | 6.49
Other | | 0.01983 | | | 0.45
Nlocal: 2315.5 ave 2332 max 2297 min
Histogram: 2 0 0 3 4 0 2 1 2 2
Nghost: 3186.31 ave 3205 max 3170 min
Histogram: 2 1 3 0 2 3 2 1 0 2
Neighs: 55590.9 ave 56174 max 55103 min
Histogram: 2 2 1 1 4 1 3 0 0 2
Total # of neighbors = 889454
Ave neighs/atom = 24.0082
Neighbor list builds = 105
Dangerous builds = 0
reset_timestep 0
# pin base so will not move during quenches
fix freeze base setforce 0.0 0.0 0.0
# event detection
compute event all event/displace ${cutevent}
compute event all event/displace 1.1
# hyper/local
fix HL mobile hyper/local ${cutbond} ${qfactor} ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 ${qfactor} ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 ${Vmax} ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 ${Tequil} ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 ${Dcut} ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 ${alpha} ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 ${boost}
fix HL mobile hyper/local 3.2 0.3 0.4 400.0 10.0 200.0 4000.0
# thermo output
thermo_style custom step temp pe f_HL f_HL[*]
WARNING: New thermo_style command, previous thermo_modify settings will be lost (../output.cpp:705)
thermo_modify lost ignore
thermo_modify temp tmobile
WARNING: Temperature for thermo pressure is not for group all (../thermo.cpp:488)
thermo ${nevent}
thermo 100
# dump
region substrate block INF INF INF INF 1.8 3.8
region adatoms block INF INF INF INF 3.8 INF
variable acolor atom rmask(base)+2*rmask(substrate)+3*rmask(adatoms)
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 zoom ${zoom} adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump 1 all image 10000000 local.*.jpg v_acolor type size 1024 1024 zoom 1.8 adiam 2.5 view 0.0 0.0 up 0 1 0 axes yes 0.9 0.01
dump_modify 1 pad 6 amap 1 3 sa 1 3 blue red green
# run
hyper ${steps} ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
hyper 1500 ${nevent} HL event min ${tol} ${tol} 1000 1000 dump 1
hyper 1500 100 HL event min ${tol} ${tol} 1000 1000 dump 1
hyper 1500 100 HL event min 1.0e-15 ${tol} 1000 1000 dump 1
hyper 1500 100 HL event min 1.0e-15 1.0e-15 1000 1000 dump 1
WARNING: Resetting reneighboring criteria during hyper (../hyper.cpp:133)
Neighbor list info ...
update every 1 steps, delay 0 steps, check yes
max neighbors/atom: 2000, page size: 100000
master list distance cutoff = 6.07583
ghost atom cutoff = 12
binsize = 3.03792, bins = 62 62 7
2 neighbor lists, perpetual/occasional/extra = 1 1 0
(1) pair eam/alloy, perpetual
attributes: half, newton on
pair build: half/bin/atomonly/newton
stencil: half/bin/3d/newton
bin: standard
(2) fix hyper/local, occasional
attributes: full, newton on, cut 10
pair build: full/bin/atomonly
stencil: full/bin/3d
bin: standard
Per MPI rank memory allocation (min/avg/max) = 7.566 | 7.567 | 7.567 Mbytes
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
0 401.24089 -205562.85 0 0 0 1 0 0 0 0 0 0 0 0 4e+19 0 0 0 0 0 0 0 0 0 0 0
77 401.24089 -206534.96 0 0 0 1 0 0 0 0 0 0 0 0 4e+19 0 0 0 0 0 0 0 1540 0 0 0
Loop time of 0.540347 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
0 401.24089 -205562.85 23.271302 74 0.18753621 1 6.0138739 703.62325 0 0.55802338 3.5350432 0 0 0 4e+19 10.115141 10.115141 0 0 0 0 0 0 0 0 0
100 399.15639 -205546.21 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
Loop time of 0.579085 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
100 399.15639 -205546.21 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
184 399.15639 -206534.96 22.904368 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 49.934783 0.21714886 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 3680 0 0 0
Loop time of 0.556056 on 16 procs for 84 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
100 399.15639 -205546.21 22.903938 90 0.32935524 0.39929142 6.0138739 703.62325 0.026229865 0.91517139 3.9968927 91.88 0.3995539 0.4009724 0.39695676 10.262823 10.262823 0 0 0 0 0 2000 0 0 0
200 403.01717 -205543.17 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
Loop time of 0.581214 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
200 403.01717 -205543.17 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
275 403.01717 -206534.96 20.844359 90 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 66.145455 0.29040418 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 5500 0 0 0
Loop time of 0.481812 on 16 procs for 75 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
200 403.01717 -205543.17 21.115577 91 0.3291605 0.39888693 6.0138739 703.62325 0.039527213 0.94418421 4.0368484 90.95 0.39930574 0.4019706 0.39554353 10.262823 10.262823 0 0 0 0 0 4000 0 0 0
300 399.01963 -205541.46 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
Loop time of 0.5757 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
300 399.01963 -205541.46 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
377 399.01963 -206534.96 19.137336 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 73.225464 0.31760598 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 7540 0 0 0
Loop time of 0.514907 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
300 399.01963 -205541.46 19.137003 85 0.32442182 0.39862755 6.0138739 703.62325 0.046873868 0.94776891 4.0368484 92.02 0.39912484 0.40296919 0.39497622 10.288936 10.288936 0 0 0 0 0 6000 0 0 0
400 398.15351 -205544.87 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
Loop time of 0.577371 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
400 398.15351 -205544.87 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
471 398.15351 -206534.96 20.470844 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 78.163482 0.33881076 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 9420 0 0 0
Loop time of 0.465473 on 16 procs for 71 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
400 398.15351 -205544.87 20.470689 93 0.34589451 0.39828754 6.0138739 703.62325 0.049952465 0.94776891 4.0779385 92.0375 0.39894967 0.40395328 0.3932824 10.307052 10.307052 0 0 0 0 0 8000 0 0 0
500 400.29399 -205544.98 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0907861 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
Loop time of 0.579188 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
500 400.29399 -205544.98 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0907861 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
577 400.29399 -206534.96 17.051242 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0910651 4.0779385 79.710572 0.3455768 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 11540 0 0 0
Loop time of 0.502193 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
500 400.29399 -205544.98 17.051107 83 0.42140172 0.39805251 6.0138739 703.62325 0.056986933 1.0910651 4.0779385 91.986 0.39879563 0.40493836 0.39165573 10.307052 10.307052 0 0 0 0 0 10000 0 0 0
600 400.96099 -205544.56 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
Loop time of 0.694955 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
600 400.96099 -205544.56 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
680 400.96099 -206534.96 20.904479 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 81.188235 0.35174818 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 13600 0 0 0
Loop time of 0.529041 on 16 procs for 80 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
600 400.96099 -205544.56 20.904088 91 0.41219484 0.39780769 6.0138739 703.62325 0.061331691 1.1358732 4.0779385 92.013333 0.39864794 0.40593806 0.39067432 10.307052 10.307052 0 0 0 0 0 12000 0 0 0
700 397.78618 -205534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 1.1853748 4.1995704 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 0 0 0
Loop time of 0.590093 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
700 397.78618 -205534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 1.2139704 4.1995704 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 0 0 0
790 397.78618 -206534.96 20.361513 95 0.54466603 0.39757442 6.0138739 703.62325 0.061146951 2.2107138 4.1995704 81.625316 0.35310868 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 15800 0 0 0
Loop time of 0.594281 on 16 procs for 90 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
700 397.78618 -205534.96 20.236364 94 0.51088027 0.39757442 6.0138739 703.62325 0.061146951 2.2107138 4.205089 92.12 0.39850836 0.40693553 0.38981834 10.307052 10.307052 0 0 0 0 0 14000 1 2 6
800 399.66919 -205547.44 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.2107138 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 1 2 6
Loop time of 0.583824 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
800 399.66919 -205547.44 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.2107138 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 1 2 6
872 399.66919 -206535.54 21.285461 94 0.56079766 0.39739855 6.0138739 703.62325 0.06556778 2.3177682 4.3041291 84.739679 0.36548679 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 17440 1 2 6
Loop time of 0.46886 on 16 procs for 72 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
800 399.66919 -205547.44 21.2852 94 0.44964213 0.39739855 6.0138739 703.62325 0.06556778 2.3177682 4.3041291 92.36625 0.3983806 0.40793368 0.38875 10.307052 10.385797 0 0 0 0 0 16000 2 4 13
900 401.5853 -205544.22 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
Loop time of 0.585137 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
900 401.5853 -205544.22 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
975 401.5853 -206535.54 19.308189 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 84.939487 0.36762438 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 19500 2 4 13
Loop time of 0.502012 on 16 procs for 75 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
900 401.5853 -205544.22 19.307938 94 0.47610389 0.39719191 6.0138739 703.62325 0.066991886 2.3177682 4.3041291 92.017778 0.39825974 0.40893337 0.3878576 10.307052 10.385797 0 0 0 0 0 18000 2 4 13
1000 395.06218 -205526.35 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
Loop time of 0.588597 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1000 395.06218 -205526.35 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
1083 395.06218 -206535.54 17.514295 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 85.421053 0.36763584 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 21660 2 4 13
Loop time of 0.543222 on 16 procs for 83 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1000 395.06218 -205526.35 17.514191 91 0.42044925 0.39716259 6.0138739 703.62325 0.067937867 2.3177682 4.3041291 92.511 0.39814962 0.40993184 0.3867545 10.307052 10.385797 0 0 0 0 0 20000 2 4 13
1100 400.04484 -205545.92 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 2 4 13
Loop time of 0.590075 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1100 400.04484 -205545.92 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 2 4 13
1177 400.04484 -206535.53 19.52012 89 0.58919981 0.39704631 6.0138739 703.62325 0.069136967 2.3177682 4.4265979 86.464741 0.37201529 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 23540 2 4 13
Loop time of 0.500839 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1100 400.04484 -205545.92 19.518413 89 0.429675 0.39705701 6.0137119 703.6043 0.069136967 2.3177682 4.4265979 92.517273 0.39805636 0.41093134 0.38574293 10.307052 10.385797 0 0 0 0 0 22000 3 6 19
1200 400.7462 -205543.2 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
Loop time of 0.583971 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1200 400.7462 -205543.2 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
1277 400.7462 -206535.53 21.169548 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 86.806578 0.37396584 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 25540 3 6 19
Loop time of 0.509118 on 16 procs for 77 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1200 400.7462 -205543.2 21.169281 91 0.32511134 0.39679665 6.0137119 703.6043 0.06750442 2.3177682 4.4265979 92.376667 0.39796198 0.41191655 0.3846039 10.307052 10.385797 0 0 0 0 0 24000 3 6 19
1300 398.53702 -205539.33 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
Loop time of 0.587306 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1300 398.53702 -205539.33 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
1375 398.53702 -206535.53 21.35815 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 87.455273 0.37616341 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 27500 3 6 19
Loop time of 0.483781 on 16 procs for 75 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1300 398.53702 -205539.33 21.35787 94 0.38773898 0.39659935 6.0137119 703.6043 0.067808168 2.3177682 4.4265979 92.500769 0.39786514 0.41289519 0.3846039 10.307052 10.385797 0 0 0 0 0 26000 3 6 19
1400 402.80537 -205549.3 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
Loop time of 0.586411 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1400 402.80537 -205549.3 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
1471 402.80537 -206535.53 19.481887 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 88.193746 0.37856948 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 29420 3 6 19
Loop time of 0.473799 on 16 procs for 71 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1400 402.80537 -205549.3 19.481632 95 0.32554201 0.39648737 6.0137119 703.6043 0.069550538 2.3177682 4.4265979 92.666429 0.39776836 0.41389491 0.38420043 10.307052 10.385797 0 0 0 0 0 28000 3 6 19
1500 402.0803 -205537.7 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 92.857333 0.39767858 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 30000 3 6 19
Loop time of 0.587342 on 16 procs for 100 steps with 37048 atoms
Step Temp PotEng f_HL f_HL[1] f_HL[2] f_HL[3] f_HL[4] f_HL[5] f_HL[6] f_HL[7] f_HL[8] f_HL[9] f_HL[10] f_HL[11] f_HL[12] f_HL[13] f_HL[14] f_HL[15] f_HL[16] f_HL[17] f_HL[18] f_HL[19] f_HL[20] f_HL[21] f_HL[22] f_HL[23]
1500 402.0803 -205537.7 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 92.857333 0.39767858 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 30000 3 6 19
1574 402.0803 -206535.53 20.903964 99 0.3340498 0.39635609 6.0137119 703.6043 0.070409086 2.3177682 4.4265979 88.491741 0.37898213 0.41489448 0.38372784 10.333041 10.385797 0 0 0 0 0 31480 3 6 19
Loop time of 0.493982 on 16 procs for 74 steps with 37048 atoms
Final hyper stats ...
Cumulative quantities for fix hyper:
hyper time = 30000
event timesteps = 3
# of atoms in events = 6
Quantities for this hyper run:
event timesteps = 3
# of atoms in events = 6
max length of any bond = 4.4266
max drift distance of any atom = 2.31777
fraction of steps & bonds with zero bias = 0.0704091
Current quantities:
ave bonds/atom = 6.01371
Cumulative quantities specific to fix hyper/local:
# of new bonds formed = 19
max bonds/atom = 13
Quantities for this hyper run specific to fix hyper/local:
ave boosted bonds/step = 92.8573
ave boost coeff of all bonds = 0.397679
max boost coeff of any bond = 0.414894
min boost coeff of any bond = 0.383728
max dist from my box of any non-maxstrain bond ghost atom = 10.333
max dist from my box of any bond ghost atom = 10.3858
count of ghost bond neighbors not found on reneighbor steps = 0
lost bond partners = 0
ave bias coeff for lost bond partners = 0
bias overlaps = 0
non-matching bias coeffs = 0
CPU time for bond builds = 0.044807
Current quantities specific to fix hyper/local:
neighbor bonds/bond = 703.604
ave boost coeff for all bonds = 0.396356
Loop time of 17.9972 on 16 procs for 1500 steps with 37048 atoms
Performance: 36.006 ns/day, 0.667 hours/ns, 83.346 timesteps/s
120.7% CPU use with 16 MPI tasks x no OpenMP threads
Hyper stats:
Dynamics time (%) = 8.87027 (49.2869)
Quench time (%) = 8.15972 (45.3388)
Other time (%) = 1.2212 (6.78552)
MPI task timing breakdown:
Section | min time | avg time | max time |%varavg| %total
---------------------------------------------------------------
Pair | 11.6 | 11.848 | 12.043 | 3.9 | 65.83
Neigh | 0.50025 | 0.52638 | 0.55163 | 2.1 | 2.92
Comm | 0.34528 | 0.49905 | 0.66742 | 13.3 | 2.77
Output | 0.0021305 | 0.0021461 | 0.0022686 | 0.1 | 0.01
Modify | 3.7498 | 3.9009 | 3.9786 | 2.8 | 21.67
Other | | 1.221 | | | 6.79
Nlocal: 2315.5 ave 2361 max 2267 min
Histogram: 1 1 0 4 2 1 3 3 0 1
Nghost: 3187.88 ave 3236 max 3141 min
Histogram: 1 0 3 2 2 1 4 1 1 1
Neighs: 53950.6 ave 54989 max 53049 min
Histogram: 2 0 3 2 1 2 4 1 0 1
FullNghs: 542951 ave 554654 max 533224 min
Histogram: 1 2 3 1 2 2 2 2 0 1
Total # of neighbors = 8687214
Ave neighs/atom = 234.485
Neighbor list builds = 165
Dangerous builds = 0
Total wall time: 0:00:22
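For reference, the final hyper stats are consistent with the run parameters echoed earlier in this log: fix hyper/local is given a target boost factor of 4000, and the hyper command runs 1500 steps with a 0.005 timestep. Because fix hyper/local accumulates time at the target boost rate, the reported hyper time works out to (assuming metal units, so times are in ps):

    hyper time = Nsteps * dt * Btarget = 1500 * 0.005 * 4000 = 30000

compared with 7.5 ps of direct MD for the same number of steps.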

File diff suppressed because it is too large

View File

@ -1,5 +1,68 @@
# Change Log
## [2.7.24](https://github.com/kokkos/kokkos/tree/2.7.24) (2018-11-04)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.7.00...2.7.24)
**Implemented enhancements:**
- DualView: Add non-templated functions for sync, need\_sync, view, modify [\#1858](https://github.com/kokkos/kokkos/issues/1858)
- DualView: Avoid needlessly allocates and initializes modify\_host and modify\_device flag views [\#1831](https://github.com/kokkos/kokkos/issues/1831)
- DualView: Incorrect deduction of "not device type" [\#1659](https://github.com/kokkos/kokkos/issues/1659)
- BuildSystem: Add KOKKOS\_ENABLE\_CXX14 and KOKKOS\_ENABLE\_CXX17 [\#1602](https://github.com/kokkos/kokkos/issues/1602)
- BuildSystem: Installed kokkos\_generated\_settings.cmake contains build directories instead of install directories [\#1838](https://github.com/kokkos/kokkos/issues/1838)
- BuildSystem: KOKKOS\_ARCH: add ticks to printout of improper arch setting [\#1649](https://github.com/kokkos/kokkos/issues/1649)
- BuildSystem: Make core/src/Makefile for Cuda use needed nvcc\_wrapper [\#1296](https://github.com/kokkos/kokkos/issues/1296)
- Build: Support PGI as host compiler for NVCC [\#1828](https://github.com/kokkos/kokkos/issues/1828)
- Build: Many Warnings Fixed e.g.[\#1786](https://github.com/kokkos/kokkos/issues/1786)
- Capability: OffsetView with non-zero begin index [\#567](https://github.com/kokkos/kokkos/issues/567)
- Capability: Reductions into device side view [\#1788](https://github.com/kokkos/kokkos/issues/1788)
- Capability: Add max\_size to Kokkos::Array [\#1760](https://github.com/kokkos/kokkos/issues/1760)
- Capability: View Assignment: LayoutStride -\> LayoutLeft and LayoutStride -\> LayoutRight [\#1594](https://github.com/kokkos/kokkos/issues/1594)
- Capability: Atomic function allow implicit conversion of update argument [\#1571](https://github.com/kokkos/kokkos/issues/1571)
- Capability: Add team\_size\_max with tagged functors [\#663](https://github.com/kokkos/kokkos/issues/663)
- Capability: Fix allignment of views from Kokkos\_ScratchSpace should use different alignment [\#1700](https://github.com/kokkos/kokkos/issues/1700)
- Capabilitiy: create\_mirror\_view\_and\_copy for DynRankView [\#1651](https://github.com/kokkos/kokkos/issues/1651)
- Capability: DeepCopy HBWSpace / HostSpace [\#548](https://github.com/kokkos/kokkos/issues/548)
- ROCm: support team vector scan [\#1645](https://github.com/kokkos/kokkos/issues/1645)
- ROCm: Merge from rocm-hackathon2 [\#1636](https://github.com/kokkos/kokkos/issues/1636)
- ROCm: Add ParallelScanWithTotal [\#1611](https://github.com/kokkos/kokkos/issues/1611)
- ROCm: Implement MDRange in ROCm [\#1314](https://github.com/kokkos/kokkos/issues/1314)
- ROCm: Implement Reducers for Nested Parallelism Levels [\#963](https://github.com/kokkos/kokkos/issues/963)
- ROCm: Add asynchronous deep copy [\#959](https://github.com/kokkos/kokkos/issues/959)
- Tests: Memory pool test seems to allocate 8GB [\#1830](https://github.com/kokkos/kokkos/issues/1830)
- Tests: Add unit\_test for team\_broadcast [\#734](https://github.com/kokkos/kokkos/issues/734)
**Fixed bugs:**
- BuildSystem: Makefile.kokkos gets gcc-toolchain wrong if gcc is cached [\#1841](https://github.com/kokkos/kokkos/issues/1841)
- BuildSystem: kokkos\_generated\_settings.cmake placement is inconsistent [\#1771](https://github.com/kokkos/kokkos/issues/1771)
- BuildSystem: Invalid escape sequence \. in kokkos\_functions.cmake [\#1661](https://github.com/kokkos/kokkos/issues/1661)
- BuildSystem: Problem in Kokkos generated cmake file [\#1770](https://github.com/kokkos/kokkos/issues/1770)
- BuildSystem: invalid file names on windows [\#1671](https://github.com/kokkos/kokkos/issues/1671)
- Tests: reducers min/max\_loc test fails randomly due to multiple min values and thus multiple valid locations [\#1681](https://github.com/kokkos/kokkos/issues/1681)
- Tests: cuda.scatterview unit test causes "Bus error" when force\_uvm and enable\_lambda are enabled [\#1852](https://github.com/kokkos/kokkos/issues/1852)
- Tests: cuda.cxx11 unit test fails when force\_uvm and enable\_lambda are enabled [\#1850](https://github.com/kokkos/kokkos/issues/1850)
- Tests: threads.reduce\_device\_view\_range\_policy failing with Cuda/8.0.44 and RDC [\#1836](https://github.com/kokkos/kokkos/issues/1836)
- Build: compile error when compiling Kokkos with hwloc 2.0.1 \(on OSX 10.12.6, with g++ 7.2.0\) [\#1506](https://github.com/kokkos/kokkos/issues/1506)
- Build: dual\_view.view broken with UVM [\#1834](https://github.com/kokkos/kokkos/issues/1834)
- Build: White cuda/9.2 + gcc/7.2 warnings triggering errors [\#1833](https://github.com/kokkos/kokkos/issues/1833)
- Build: warning: enum constant in boolean context [\#1813](https://github.com/kokkos/kokkos/issues/1813)
- Capability: Fix overly conservative max\_team\_size thingy [\#1808](https://github.com/kokkos/kokkos/issues/1808)
- DynRankView: Ctors taking ViewAllocateWithoutInitializing broken [\#1783](https://github.com/kokkos/kokkos/issues/1783)
- Cuda: Apollo cuda.team\_broadcast test fail with clang-6.0 [\#1762](https://github.com/kokkos/kokkos/issues/1762)
- Cuda: Clang spurious test failure in impl\_view\_accessible [\#1753](https://github.com/kokkos/kokkos/issues/1753)
- Cuda: Kokkos::complex\<double\> atomic deadlocks with Clang 6 Cuda build with -O0 [\#1752](https://github.com/kokkos/kokkos/issues/1752)
- Cuda: LayoutStride Test fails for UVM as default memory space [\#1688](https://github.com/kokkos/kokkos/issues/1688)
- Cuda: Scan wrong values on Volta [\#1676](https://github.com/kokkos/kokkos/issues/1676)
- Cuda: Kokkos::deep\_copy error with CudaUVM and Kokkos::Serial spaces [\#1652](https://github.com/kokkos/kokkos/issues/1652)
- Cuda: cudaErrorInvalidConfiguration with debug build [\#1647](https://github.com/kokkos/kokkos/issues/1647)
- Cuda: parallel\_for with TeamPolicy::team\_size\_recommended with launch bounds not working -- reported by Daniel Holladay [\#1283](https://github.com/kokkos/kokkos/issues/1283)
- Cuda: Using KOKKOS\_CLASS\_LAMBDA in a class with Kokkos::Random\_XorShift64\_Pool member data [\#1696](https://github.com/kokkos/kokkos/issues/1696)
- Long Build Times on Darwin [\#1721](https://github.com/kokkos/kokkos/issues/1721)
- Capability: Typo in Kokkos\_Sort.hpp - BinOp3D - wrong comparison [\#1720](https://github.com/kokkos/kokkos/issues/1720)
- Buffer overflow in SharedAllocationRecord in Kokkos\_HostSpace.cpp [\#1673](https://github.com/kokkos/kokkos/issues/1673)
- Serial unit test failure [\#1632](https://github.com/kokkos/kokkos/issues/1632)
## [2.7.00](https://github.com/kokkos/kokkos/tree/2.7.00) (2018-05-24)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.6.00...2.7.00)

View File

@ -11,7 +11,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
# Define Project Name if this is a standalone build
IF(NOT DEFINED ${PROJECT_NAME})
project(Kokkos CXX)
project(Kokkos CXX)
ENDIF()
# Basic initialization (Used in KOKKOS_SETTINGS)
@ -22,7 +22,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake)
set_kokkos_cxx_compiler()
set_kokkos_cxx_standard()
#------------ GET OPTIONS AND KOKKOS_SETTINGS --------------------------------
# Add Kokkos' modules to CMake's module path.
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/")
@ -34,7 +34,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
#------------ GENERATE HEADER AND SOURCE FILES -------------------------------
execute_process(
COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings
COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} PREFIX=${CMAKE_INSTALL_PREFIX} generate_build_settings
WORKING_DIRECTORY "${Kokkos_BINARY_DIR}"
OUTPUT_FILE ${Kokkos_BINARY_DIR}/core_src_make.out
RESULT_VARIABLE GEN_SETTINGS_RESULT
@ -45,6 +45,7 @@ IF(NOT KOKKOS_HAS_TRILINOS)
endif()
include(${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake)
install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION lib/cmake/Kokkos)
install(FILES ${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake DESTINATION ${CMAKE_INSTALL_PREFIX})
string(REPLACE " " ";" KOKKOS_TPL_INCLUDE_DIRS "${KOKKOS_GMAKE_TPL_INCLUDE_DIRS}")
string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_DIRS "${KOKKOS_GMAKE_TPL_LIBRARY_DIRS}")
string(REPLACE " " ";" KOKKOS_TPL_LIBRARY_NAMES "${KOKKOS_GMAKE_TPL_LIBRARY_NAMES}")

View File

@ -6,9 +6,9 @@ ifndef KOKKOS_PATH
endif
CXXFLAGS=$(CCFLAGS)
# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
#KOKKOS_DEVICES ?= "Pthread"
# Options:
# Intel: KNC,KNL,SNB,HSW,BDW,SKX
# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72
@ -21,12 +21,13 @@ KOKKOS_ARCH ?= ""
KOKKOS_DEBUG ?= "no"
# Options: hwloc,librt,experimental_memkind
KOKKOS_USE_TPLS ?= ""
# Options: c++11,c++1z
# Options: c++11,c++14,c++1y,c++17,c++1z,c++2a
KOKKOS_CXX_STANDARD ?= "c++11"
# Options: aggressive_vectorization,disable_profiling,disable_deprecated_code,enable_large_mem_tests
KOKKOS_OPTIONS ?= ""
# Option for setting ETI path
KOKKOS_ETI_PATH ?= ${KOKKOS_PATH}/core/src/eti
KOKKOS_CMAKE ?= "no"
# Default settings specific options.
# Options: force_uvm,use_ldg,rdc,enable_lambda
@ -41,7 +42,11 @@ kokkos_has_string=$(if $(findstring $2,$1),1,0)
# Check for general settings.
KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes)
KOKKOS_INTERNAL_ENABLE_CXX11 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++11)
KOKKOS_INTERNAL_ENABLE_CXX14 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++14)
KOKKOS_INTERNAL_ENABLE_CXX1Y := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1y)
KOKKOS_INTERNAL_ENABLE_CXX17 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++17)
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z)
KOKKOS_INTERNAL_ENABLE_CXX2A := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++2a)
# Check for external libraries.
KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc)
@ -110,6 +115,18 @@ KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VE
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin)
KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC)
# Check Host Compiler if using NVCC through nvcc_wrapper
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l))
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1)
KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1))
KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),PGI)
KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),Intel Corporation)
KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_HOST_VERSION),clang)
endif
endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
endif
@ -202,18 +219,34 @@ endif
# Set C++11 flags.
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
KOKKOS_INTERNAL_CXX11_FLAG := --c++11
KOKKOS_INTERNAL_CXX14_FLAG := --c++14
#KOKKOS_INTERNAL_CXX17_FLAG := --c++17
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
KOKKOS_INTERNAL_CXX11_FLAG := -std=c++11
#KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14
KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y
#KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17
#KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1Z
#KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
KOKKOS_INTERNAL_CXX14_FLAG := -hstd=c++14
#KOKKOS_INTERNAL_CXX1Y_FLAG := -hstd=c++1y
#KOKKOS_INTERNAL_CXX17_FLAG := -hstd=c++17
#KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z
#KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a
else
ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
KOKKOS_INTERNAL_CXX11_FLAG :=
else
KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14
KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y
KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17
KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a
endif
endif
endif
@ -336,7 +369,9 @@ endif
#CPPFLAGS is now unused
KOKKOS_CPPFLAGS =
KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src -I$(KOKKOS_ETI_PATH)
endif
KOKKOS_TPL_INCLUDE_DIRS =
KOKKOS_TPL_LIBRARY_DIRS =
KOKKOS_TPL_LIBRARY_NAMES =
@ -347,9 +382,11 @@ endif
KOKKOS_LIBS = -ldl
KOKKOS_TPL_LIBRARY_NAMES += dl
KOKKOS_LDFLAGS = -L$(shell pwd)
# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
KOKKOS_CXXLDFLAGS = -L$(shell pwd)
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_LDFLAGS = -L$(shell pwd)
# CXXLDFLAGS is used together with CXXFLAGS in a combined compile/link command
KOKKOS_CXXLDFLAGS = -L$(shell pwd)
endif
KOKKOS_LINK_FLAGS =
KOKKOS_SRC =
KOKKOS_HEADERS =
@ -377,10 +414,12 @@ tmp := $(call kokkos_append_header,"/* Execution Spaces */")
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CUDA")
tmp := $(call kokkos_append_header,"\#define KOKKOS_COMPILER_CUDA_VERSION $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION)")
endif
ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM')
tmp := $(call kokkos_append_header,'\#define KOKKOS_IMPL_ROCM_CLANG_WORKAROUND 1')
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
@ -438,11 +477,25 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX14), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX14_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Y), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Y_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX14")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX17), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX17_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX11")
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX1Z")
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX17")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX2A), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX2A_FLAG)
tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_CXX20")
endif
ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
@ -465,7 +518,9 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
ifneq ($(HWLOC_PATH),)
KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(HWLOC_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib
KOKKOS_CXXLDFLAGS += -L$(HWLOC_PATH)/lib
KOKKOS_TPL_INCLUDE_DIRS += $(HWLOC_PATH)/include
@ -484,7 +539,9 @@ endif
ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
ifneq ($(MEMKIND_PATH),)
KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(MEMKIND_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_CXXLDFLAGS += -L$(MEMKIND_PATH)/lib
KOKKOS_TPL_INCLUDE_DIRS += $(MEMKIND_PATH)/include
@ -977,7 +1034,9 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_ETI), 1)
endif
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
ifneq ($(CUDA_PATH),)
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(CUDA_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64
KOKKOS_CXXLDFLAGS += -L$(CUDA_PATH)/lib64
KOKKOS_TPL_INCLUDE_DIRS += $(CUDA_PATH)/include
@ -1032,7 +1091,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
ifneq ($(QTHREADS_PATH),)
KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
ifneq ($(KOKKOS_CMAKE), yes)
KOKKOS_CXXFLAGS += -I$(QTHREADS_PATH)/include
endif
KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
KOKKOS_CXXLDFLAGS += -L$(QTHREADS_PATH)/lib
KOKKOS_TPL_INCLUDE_DIRS += $(QTHREADS_PATH)/include
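For reference, the Makefile.kokkos hunks above extend KOKKOS_CXX_STANDARD beyond c++11 (adding c++14, c++1y, c++17, c++1z, and c++2a), pick the matching compiler flag per toolchain, and emit a corresponding KOKKOS_ENABLE_CXX* define into the generated configuration header. A minimal, illustrative sketch of how application code can branch on those defines (this program is not part of the patch; it only assumes a Kokkos build configured through this Makefile):

```cpp
#include <Kokkos_Core.hpp>  // brings in the generated KokkosCore_config.h via Kokkos_Macros.hpp
#include <cstdio>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
#if defined(KOKKOS_ENABLE_CXX17)
  std::printf("Kokkos was configured for C++17 (or a draft thereof)\n");
#elif defined(KOKKOS_ENABLE_CXX14)
  std::printf("Kokkos was configured for C++14\n");
#else
  std::printf("Kokkos falls back to the C++11 configuration\n");
#endif
  Kokkos::finalize();
  return 0;
}
```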

View File

@ -52,44 +52,47 @@ For specifics see the LICENSE file contained in the repository or distribution.
* GCC 4.8.4
* GCC 4.9.3
* GCC 5.1.0
* GCC 5.3.0
* GCC 5.5.0
* GCC 6.1.0
* GCC 7.2.0
* GCC 7.3.0
* GCC 8.1.0
* Intel 15.0.2
* Intel 16.0.1
* Intel 17.1.043
* Intel 17.0.1
* Intel 17.4.196
* Intel 18.0.128
* Intel 18.2.128
* Clang 3.6.1
* Clang 3.7.1
* Clang 3.8.1
* Clang 3.9.0
* Clang 4.0.0
* Clang 4.0.0 for CUDA (CUDA Toolkit 8.0.44)
* Clang 6.0.0 for CUDA (CUDA Toolkit 9.1)
* PGI 17.10
* NVCC 7.0 for CUDA (with gcc 4.8.4)
* Clang 6.0.0 for CUDA (CUDA Toolkit 9.0)
* Clang 7.0.0 for CUDA (CUDA Toolkit 9.1)
* PGI 18.7
* NVCC 7.5 for CUDA (with gcc 4.8.4)
* NVCC 8.0.44 for CUDA (with gcc 5.3.0)
* NVCC 9.1 for CUDA (with gcc 6.1.0)
### Primary tested compilers on Power 8 are:
* GCC 5.4.0 (OpenMP,Serial)
* IBM XL 13.1.6 (OpenMP, Serial)
* NVCC 8.0.44 for CUDA (with gcc 5.4.0)
* NVCC 9.0.103 for CUDA (with gcc 6.3.0 and XL 13.1.6)
* GCC 6.4.0 (OpenMP,Serial)
* GCC 7.2.0 (OpenMP,Serial)
* IBM XL 16.1.0 (OpenMP, Serial)
* NVCC 9.2.88 for CUDA (with gcc 7.2.0 and XL 16.1.0)
### Primary tested compilers on Intel KNL are:
* GCC 6.2.0
* Intel 16.4.258 (with gcc 4.7.2)
* Intel 17.2.174 (with gcc 4.9.3)
* Intel 18.0.128 (with gcc 4.9.3)
* Intel 18.2.199 (with gcc 4.9.3)
### Primary tested compilers on ARM
* GCC 6.1.0
### Primary tested compilers on ARM (Cavium ThunderX2)
* GCC 7.2.0
* ARM/Clang 18.4.0
### Other compilers working:
* X86:
- Cygwin 2.1.0 64bit with gcc 4.9.3
- GCC 8.1.0 (not warning free)
### Known non-working combinations:
* Power8:

View File

@ -697,6 +697,7 @@ namespace Kokkos {
typedef Random_XorShift64<DeviceType> generator_type;
typedef DeviceType device_type;
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool() {
num_states_ = 0;
}
@ -709,12 +710,14 @@ namespace Kokkos {
#endif
}
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
locks_(src.locks_),
state_(src.state_),
num_states_(src.num_states_)
{}
KOKKOS_INLINE_FUNCTION
Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) {
locks_ = src.locks_;
state_ = src.state_;
@ -958,6 +961,7 @@ namespace Kokkos {
typedef DeviceType device_type;
KOKKOS_INLINE_FUNCTION
Random_XorShift1024_Pool() {
num_states_ = 0;
}
@ -972,6 +976,7 @@ namespace Kokkos {
#endif
}
KOKKOS_INLINE_FUNCTION
Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
locks_(src.locks_),
state_(src.state_),
@ -979,6 +984,7 @@ namespace Kokkos {
num_states_(src.num_states_)
{}
KOKKOS_INLINE_FUNCTION
Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) {
locks_ = src.locks_;
state_ = src.state_;
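For reference, the hunks above mark the default constructor, copy constructor, and assignment operator of Random_XorShift64_Pool and Random_XorShift1024_Pool as KOKKOS_INLINE_FUNCTION, which lets a pool object be captured by value inside device kernels. A minimal usage sketch (the seed, view length, and kernel label below are arbitrary placeholders):

```cpp
#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // Pool of per-thread generator states; it is captured by value in the lambda,
    // which relies on the device-marked copy constructor touched by this patch.
    Kokkos::Random_XorShift64_Pool<> pool(/*seed=*/12345);
    Kokkos::View<double*> r("random_values", 1000);
    Kokkos::parallel_for("fill_random", r.extent(0), KOKKOS_LAMBDA(const int i) {
      auto gen = pool.get_state();   // acquire a generator for this thread
      r(i) = gen.drand(0.0, 1.0);    // uniform double in [0, 1)
      pool.free_state(gen);          // hand the generator back to the pool
    });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}
```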

View File

@ -246,8 +246,8 @@ public:
{
bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
bin_count_const = bin_count_atomic;
bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
sort_order = offset_type("PermutationVector",range_end-range_begin);
bin_offsets = offset_type(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::bin_offsets"),bin_op.max_bins());
sort_order = offset_type(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sort_order"),range_end-range_begin);
}
BinSort( const_key_view_type keys_
@ -290,7 +290,7 @@ public:
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
scratch_view_type
sorted_values("Scratch",
sorted_values(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sorted_values"),
len,
values.extent(1),
values.extent(2),
@ -301,7 +301,7 @@ public:
values.extent(7));
#else
scratch_view_type
sorted_values("Scratch",
sorted_values(ViewAllocateWithoutInitializing("Kokkos::SortImpl::BinSortFunctor::sorted_values"),
values.rank_dynamic > 0 ? len : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
values.rank_dynamic > 1 ? values.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
values.rank_dynamic > 2 ? values.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG,
@ -483,7 +483,7 @@ struct BinOp3D {
if (keys(i1,0)>keys(i2,0)) return true;
else if (keys(i1,0)==keys(i2,0)) {
if (keys(i1,1)>keys(i2,1)) return true;
else if (keys(i1,1)==keys(i2,2)) {
else if (keys(i1,1)==keys(i2,1)) {
if (keys(i1,2)>keys(i2,2)) return true;
}
}
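For reference, besides correcting the BinOp3D tie-break (keys(i1,1) is now compared against keys(i2,1) instead of keys(i2,2)), the hunks above construct the BinSort scratch views with ViewAllocateWithoutInitializing, which skips the fill-on-allocation pass for buffers that are completely overwritten before being read. A small sketch of that allocation idiom (the labels and length are placeholders):

```cpp
#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 1000;
    // Default View construction zero-initializes every entry ...
    Kokkos::View<int*> bin_offsets("bin_offsets", n);
    // ... while ViewAllocateWithoutInitializing only allocates, leaving the contents
    // indeterminate; this is safe for scratch that is fully written before it is read.
    Kokkos::View<int*> sort_order(Kokkos::ViewAllocateWithoutInitializing("sort_order"), n);
  }
  Kokkos::finalize();
  return 0;
}
```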

View File

@ -0,0 +1,41 @@
#Set your Kokkos path to something appropriate
KOKKOS_PATH = ${HOME}/git/kokkos-github-repo
KOKKOS_DEVICES = "Cuda"
KOKKOS_ARCH = "Pascal60"
KOKKOS_CUDA_OPTIONS = enable_lambda
#KOKKOS_DEVICES = "OpenMP"
#KOKKOS_ARCH = "Power8"
SRC = gups-kokkos.cc
default: build
echo "Start Build"
CXXFLAGS = -O3
CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper
#CXX = g++
LINK = ${CXX}
LINKFLAGS =
EXE = gups-kokkos
DEPFLAGS = -M
OBJ = $(SRC:.cc=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o $(EXE)
# Compilation rules
%.o:%.cc $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,199 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//@HEADER
*/
#include "Kokkos_Core.hpp"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <sys/time.h>
#define HLINE "-------------------------------------------------------------\n"
#if defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace>::HostMirror GUPSHostArray;
typedef Kokkos::View<int64_t*, Kokkos::CudaSpace> GUPSDeviceArray;
#else
typedef Kokkos::View<int64_t*, Kokkos::HostSpace>::HostMirror GUPSHostArray;
typedef Kokkos::View<int64_t*, Kokkos::HostSpace> GUPSDeviceArray;
#endif
typedef int GUPSIndex;
double now() {
struct timeval now;
gettimeofday(&now, NULL);
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
}
void randomize_indices(GUPSHostArray& indices, GUPSDeviceArray& dev_indices, const int64_t dataCount) {
for( GUPSIndex i = 0; i < indices.extent(0); ++i ) {
indices[i] = lrand48() % dataCount;
}
Kokkos::deep_copy(dev_indices, indices);
}
void run_gups(GUPSDeviceArray& indices, GUPSDeviceArray& data, const int64_t datum,
const bool performAtomics) {
if( performAtomics ) {
Kokkos::parallel_for("bench-gups-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
Kokkos::atomic_fetch_xor( &data[indices[i]], datum );
});
} else {
Kokkos::parallel_for("bench-gups-non-atomic", indices.extent(0), KOKKOS_LAMBDA(const GUPSIndex i) {
data[indices[i]] ^= datum;
});
}
Kokkos::fence();
}
int run_benchmark(const GUPSIndex indicesCount, const GUPSIndex dataCount, const int repeats,
const bool useAtomics) {
printf("Reports fastest timing per kernel\n");
printf("Creating Views...\n");
printf("Memory Sizes:\n");
printf("- Elements: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(dataCount),
1.0e-6 * ((double) dataCount * (double) sizeof(int64_t)));
printf("- Indices: %15" PRIu64 " (%12.4f MB)\n", static_cast<uint64_t>(indicesCount),
1.0e-6 * ((double) indicesCount * (double) sizeof(int64_t)));
printf(" - Atomics: %15s\n", (useAtomics ? "Yes" : "No") );
printf("Benchmark kernels will be performed for %d iterations.\n", repeats);
printf(HLINE);
GUPSDeviceArray dev_indices("indices", indicesCount);
GUPSDeviceArray dev_data("data", dataCount);
int64_t datum = -1;
GUPSHostArray indices = Kokkos::create_mirror_view(dev_indices);
GUPSHostArray data = Kokkos::create_mirror_view(dev_data);
double gupsTime = 0.0;
printf("Initializing Views...\n");
#if defined(KOKKOS_HAVE_OPENMP)
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::OpenMP>(0, dataCount),
#else
Kokkos::parallel_for("init-data", Kokkos::RangePolicy<Kokkos::Serial>(0, dataCount),
#endif
KOKKOS_LAMBDA(const int i) {
data[i] = 10101010101;
});
#if defined(KOKKOS_HAVE_OPENMP)
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::OpenMP>(0, indicesCount),
#else
Kokkos::parallel_for("init-indices", Kokkos::RangePolicy<Kokkos::Serial>(0, indicesCount),
#endif
KOKKOS_LAMBDA(const int i) {
indices[i] = 0;
});
Kokkos::deep_copy(dev_data, data);
Kokkos::deep_copy(dev_indices, indices);
double start;
printf("Starting benchmarking...\n");
for( GUPSIndex k = 0; k < repeats; ++k ) {
randomize_indices(indices, dev_indices, data.extent(0));
start = now();
run_gups(dev_indices, dev_data, datum, useAtomics);
gupsTime += now() - start;
}
Kokkos::deep_copy(indices, dev_indices);
Kokkos::deep_copy(data, dev_data);
printf(HLINE);
printf("GUP/s Random: %18.6f\n",
(1.0e-9 * ((double) repeats) * (double) dev_indices.extent(0)) / gupsTime);
printf(HLINE);
return 0;
}
int main(int argc, char* argv[]) {
printf(HLINE);
printf("Kokkos GUPS Benchmark\n");
printf(HLINE);
srand48(1010101);
Kokkos::initialize(argc, argv);
int64_t indices = 8192;
int64_t data = 33554432;
int64_t repeats = 10;
bool useAtomics = false;
for( int i = 1; i < argc; ++i ) {
if( strcmp( argv[i], "--indices" ) == 0 ) {
indices = std::atoll(argv[i+1]);
++i;
} else if( strcmp( argv[i], "--data" ) == 0 ) {
data = std::atoll(argv[i+1]);
++i;
} else if( strcmp( argv[i], "--repeats" ) == 0 ) {
repeats = std::atoll(argv[i+1]);
++i;
} else if( strcmp( argv[i], "--atomics" ) == 0 ) {
useAtomics = true;
}
}
const int rc = run_benchmark(indices, data, repeats, useAtomics);
Kokkos::finalize();
return rc;
}
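The GUP/s figure printed by run_benchmark() is simply the total number of random XOR updates divided by the accumulated kernel time. A minimal sketch of that arithmetic, with an illustrative helper name that is not part of the benchmark source:

// Giga-updates per second: each repeat issues indicesCount random updates.
double giga_updates_per_second(const int repeats, const int64_t indicesCount, const double totalSeconds) {
  return (1.0e-9 * (double) repeats * (double) indicesCount) / totalSeconds;
}
// e.g. 10 repeats over 8192 indices completed in 0.001 s -> 0.08192 GUP/s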

View File

@ -0,0 +1,41 @@
#Set your Kokkos path to something appropriate
KOKKOS_PATH = ${HOME}/git/kokkos-github-repo
#KOKKOS_DEVICES = "Cuda"
#KOKKOS_ARCH = "Pascal60"
#KOKKOS_CUDA_OPTIONS = enable_lambda
KOKKOS_DEVICES = "OpenMP"
KOKKOS_ARCH = "Power8"
SRC = stream-kokkos.cc
default: build
echo "Start Build"
CXXFLAGS = -O3
#CXX = ${HOME}/git/kokkos-github-repo/bin/nvcc_wrapper
CXX = g++
LINK = ${CXX}
LINKFLAGS =
EXE = stream-kokkos
DEPFLAGS = -M
OBJ = $(SRC:.cc=.o)
LIB =
include $(KOKKOS_PATH)/Makefile.kokkos
build: $(EXE)
$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS)
$(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE)
clean: kokkos-clean
rm -f *.o $(EXE)
# Compilation rules
%.o:%.cc $(KOKKOS_CPP_DEPENDS)
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<

View File

@ -0,0 +1,265 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// ************************************************************************
//@HEADER
*/
#include "Kokkos_Core.hpp"
#include <cstdio>
#include <cstdlib>
#include <cmath>
#include <cinttypes>   // PRIu64 format macro
#include <algorithm>   // std::min
#include <limits>      // std::numeric_limits
#include <sys/time.h>
#define STREAM_ARRAY_SIZE 100000000
#define STREAM_NTIMES 20
#define HLINE "-------------------------------------------------------------\n"
#if defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::View<double*, Kokkos::CudaSpace>::HostMirror StreamHostArray;
typedef Kokkos::View<double*, Kokkos::CudaSpace> StreamDeviceArray;
#else
typedef Kokkos::View<double*, Kokkos::HostSpace>::HostMirror StreamHostArray;
typedef Kokkos::View<double*, Kokkos::HostSpace> StreamDeviceArray;
#endif
typedef int StreamIndex;
double now() {
struct timeval now;
gettimeofday(&now, NULL);
return (double) now.tv_sec + ((double) now.tv_usec * 1.0e-6);
}
void perform_copy(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
Kokkos::parallel_for("copy", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
c[i] = a[i];
});
Kokkos::fence();
}
void perform_scale(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
const double scalar) {
Kokkos::parallel_for("scale", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
b[i] = scalar * c[i];
});
Kokkos::fence();
}
void perform_add(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c) {
Kokkos::parallel_for("add", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
c[i] = a[i] + b[i];
});
Kokkos::fence();
}
void perform_triad(StreamDeviceArray& a, StreamDeviceArray& b, StreamDeviceArray& c,
const double scalar) {
Kokkos::parallel_for("triad", a.extent(0), KOKKOS_LAMBDA(const StreamIndex i) {
a[i] = b[i] + scalar * c[i];
});
Kokkos::fence();
}
int perform_validation(StreamHostArray& a, StreamHostArray& b, StreamHostArray& c,
const StreamIndex arraySize, const double scalar) {
double ai = 1.0;
double bi = 2.0;
double ci = 0.0;
for( StreamIndex i = 0; i < arraySize; ++i ) {
ci = ai;
bi = scalar * ci;
ci = ai + bi;
ai = bi + scalar * ci;
}
double aError = 0.0;
double bError = 0.0;
double cError = 0.0;
// Accumulate the absolute error of every element so the averages below are meaningful.
for( StreamIndex i = 0; i < arraySize; ++i ) {
aError += std::abs( a[i] - ai );
bError += std::abs( b[i] - bi );
cError += std::abs( c[i] - ci );
}
double aAvgError = aError / (double) arraySize;
double bAvgError = bError / (double) arraySize;
double cAvgError = cError / (double) arraySize;
const double epsilon = 1.0e-13;
int errorCount = 0;
if( std::abs( aAvgError / ai ) > epsilon ) {
fprintf(stderr, "Error: validation check on View a failed.\n");
errorCount++;
}
if( std::abs( bAvgError / bi ) > epsilon ) {
fprintf(stderr, "Error: validation check on View b failed.\n");
errorCount++;
}
if( std::abs( cAvgError / ci ) > epsilon ) {
fprintf(stderr, "Error: validation check on View c failed.\n");
errorCount++;
}
if( errorCount == 0 ) {
printf("All solutions checked and verified.\n");
}
return errorCount;
}
int run_benchmark() {
printf("Reports fastest timing per kernel\n");
printf("Creating Views...\n");
printf("Memory Sizes:\n");
printf("- Array Size: %" PRIu64 "\n", static_cast<uint64_t>(STREAM_ARRAY_SIZE));
printf("- Per Array: %12.2f MB\n", 1.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
printf("- Total: %12.2f MB\n", 3.0e-6 * (double) STREAM_ARRAY_SIZE * (double) sizeof(double));
printf("Benchmark kernels will be performed for %d iterations.\n", STREAM_NTIMES);
printf(HLINE);
StreamDeviceArray dev_a("a", STREAM_ARRAY_SIZE);
StreamDeviceArray dev_b("b", STREAM_ARRAY_SIZE);
StreamDeviceArray dev_c("c", STREAM_ARRAY_SIZE);
StreamHostArray a = Kokkos::create_mirror_view(dev_a);
StreamHostArray b = Kokkos::create_mirror_view(dev_b);
StreamHostArray c = Kokkos::create_mirror_view(dev_c);
const double scalar = 3.0;
double copyTime = std::numeric_limits<double>::max();
double scaleTime = std::numeric_limits<double>::max();
double addTime = std::numeric_limits<double>::max();
double triadTime = std::numeric_limits<double>::max();
printf("Initializing Views...\n");
#if defined(KOKKOS_HAVE_OPENMP)
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::OpenMP>(0, STREAM_ARRAY_SIZE),
#else
Kokkos::parallel_for("init", Kokkos::RangePolicy<Kokkos::Serial>(0, STREAM_ARRAY_SIZE),
#endif
KOKKOS_LAMBDA(const int i) {
a[i] = 1.0;
b[i] = 2.0;
c[i] = 0.0;
});
// Copy the contents of the host views (a, b, c) into their device counterparts (dev_a, dev_b, dev_c)
Kokkos::deep_copy(dev_a, a);
Kokkos::deep_copy(dev_b, b);
Kokkos::deep_copy(dev_c, c);
double start;
printf("Starting benchmarking...\n");
for( StreamIndex k = 0; k < STREAM_NTIMES; ++k ) {
start = now();
perform_copy(dev_a, dev_b, dev_c);
copyTime = std::min( copyTime, (now() - start) );
start = now();
perform_scale(dev_a, dev_b, dev_c, scalar);
scaleTime = std::min( scaleTime, (now() - start) );
start = now();
perform_add(dev_a, dev_b, dev_c);
addTime = std::min( addTime, (now() - start) );
start = now();
perform_triad(dev_a, dev_b, dev_c, scalar);
triadTime = std::min( triadTime, (now() - start) );
}
Kokkos::deep_copy(a, dev_a);
Kokkos::deep_copy(b, dev_b);
Kokkos::deep_copy(c, dev_c);
printf("Performing validation...\n");
int rc = perform_validation(a, b, c, STREAM_ARRAY_SIZE, scalar);
printf(HLINE);
printf("Copy %11.2f MB/s\n",
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / copyTime );
printf("Scale %11.2f MB/s\n",
( 1.0e-06 * 2.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / scaleTime );
printf("Add %11.2f MB/s\n",
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / addTime );
printf("Triad %11.2f MB/s\n",
( 1.0e-06 * 3.0 * (double) sizeof(double) * (double) STREAM_ARRAY_SIZE) / triadTime );
printf(HLINE);
return rc;
}
int main(int argc, char* argv[]) {
printf(HLINE);
printf("Kokkos STREAM Benchmark\n");
printf(HLINE);
Kokkos::initialize(argc, argv);
const int rc = run_benchmark();
Kokkos::finalize();
return rc;
}
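The bandwidth figures above follow the usual STREAM accounting: copy and scale move two arrays of doubles per element, add and triad move three, and the fastest observed time per kernel is used. A small sketch of that calculation, with an illustrative helper name:

// MB/s for a kernel that touches `arraysTouched` arrays of `arraySize` doubles in `seconds`.
double stream_mb_per_second(const int arraysTouched, const size_t arraySize, const double seconds) {
  return (1.0e-6 * (double) arraysTouched * (double) sizeof(double) * (double) arraySize) / seconds;
}
// copy/scale use arraysTouched = 2, add/triad use arraysTouched = 3, matching the printf calls above.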

View File

@ -125,18 +125,20 @@ function show_help {
echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP"
echo " Default: 1"
echo " --openmp-places=<Op> Op=threads|cores|sockets. Default: threads"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --force-openmp-num-threads=N"
echo " --openmp-num-threads=N"
echo " Override logic for selecting OMP_NUM_THREADS"
echo " --force-openmp-proc-bind=<OP>"
echo " --openmp-proc-bind=<OP>"
echo " Override logic for selecting OMP_PROC_BIND"
echo " --no-openmp-nested Set OMP_NESTED to false"
echo " --openmp-nested Set OMP_NESTED to true"
echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"
echo " --output-prefix=<P> Save the output to files of the form"
echo " P.hpcbind.N, P.stdout.N and P.stderr.N where P is "
echo " the prefix and N is the rank (no spaces)"
echo " --output-mode=<Op> How console output should be handled."
echo " Options are all, rank0, and none. Default: rank0"
echo " --lstopo Show bindings in lstopo"
echo " --save-topology=<Xml> Save the topology to the given xml file"
echo " --load-topology=<Xml> Load a previously saved topology from an xml file"
echo " -v|--verbose Print bindings and relevant environment variables"
echo " -h|--help Show this message"
echo ""
@ -189,7 +191,7 @@ HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads}
declare -i HPCBIND_OPENMP_PROC_BIND=1
HPCBIND_OPENMP_FORCE_NUM_THREADS=""
HPCBIND_OPENMP_FORCE_PROC_BIND=""
declare -i HPCBIND_OPENMP_NESTED=1
declare -i HPCBIND_OPENMP_NESTED=0
declare -i HPCBIND_VERBOSE=0
declare -i HPCBIND_LSTOPO=0
@ -197,6 +199,9 @@ declare -i HPCBIND_LSTOPO=0
HPCBIND_OUTPUT_PREFIX=""
HPCBIND_OUTPUT_MODE="rank0"
HPCBIND_OUTPUT_TOPOLOGY=""
HPCBIND_INPUT_TOPOLOGY=""
declare -i HPCBIND_HAS_COMMAND=0
for i in "$@"; do
@ -276,10 +281,22 @@ for i in "$@"; do
HPCBIND_OPENMP_NESTED=0
shift
;;
--openmp-nested)
HPCBIND_OPENMP_NESTED=1
shift
;;
--output-prefix=*)
HPCBIND_OUTPUT_PREFIX="${i#*=}"
shift
;;
--save-topology=*)
HPCBIND_OUTPUT_TOPOLOGY="${i#*=}"
shift
;;
--load-topology=*)
HPCBIND_INPUT_TOPOLOGY="${i#*=}"
shift
;;
--output-mode=*)
HPCBIND_OUTPUT_MODE="${i#*=}"
#convert to lower case
@ -327,24 +344,37 @@ elif [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then
HPCBIND_TEE=1
fi
# Save the topology to the given xml file
if [[ "${HPCBIND_OUTPUT_TOPOLOGY}" != "" ]]; then
if [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then
lstopo-no-graphics "${HPCBIND_OUTPUT_TOPOLOGY}"
else
lstopo-no-graphics >/dev/null 2>&1
fi
fi
# Load the topology to the given xml file
if [[ "${HPCBIND_INPUT_TOPOLOGY}" != "" ]]; then
if [ -f ${HPCBIND_INPUT_TOPOLOGY} ]; then
export HWLOC_XMLFILE="${HPCBIND_INPUT_TOPOLOGY}"
export HWLOC_THISSYSTEM=1
fi
fi
if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
HPCBIND_LOG=/dev/null
HPCBIND_ERR=/dev/null
HPCBIND_OUT=/dev/null
else
if [[ ${HPCBIND_QUEUE_SIZE} -gt 0 ]]; then
HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
else
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_RANK}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_RANK}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_RANK}"
if [[ ${HPCBIND_QUEUE_SIZE} -le 0 ]]; then
HPCBIND_QUEUE_SIZE=1
fi
HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}"
HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK})
HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}"
HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}"
> ${HPCBIND_LOG}
fi
@ -546,6 +576,8 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
hostname -s >> ${HPCBIND_LOG}
echo "[HPCBIND]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
echo "[HWLOC]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^HWLOC_" >> ${HPCBIND_LOG}
echo "[CUDA]" >> ${HPCBIND_LOG}
echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
echo "[OPENMP]" >> ${HPCBIND_LOG}
@ -568,6 +600,8 @@ else
hostname -s > >(tee -a ${HPCBIND_LOG})
echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
echo "[HWLOC]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^HWLOC_" > >(tee -a ${HPCBIND_LOG})
echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})

View File

@ -74,6 +74,9 @@ dry_run=0
host_only=0
host_only_args=""
# Just run version on host compiler
get_host_version=0
# Enable workaround for CUDA 6.5 for pragma ident
replace_pragma_ident=0
@ -93,6 +96,9 @@ depfile_separate=0
depfile_output_arg=""
depfile_target_arg=""
# Option to remove duplicate libraries and object files
remove_duplicate_link_files=0
#echo "Arguments: $# $@"
while [ $# -gt 0 ]
@ -106,10 +112,18 @@ do
--host-only)
host_only=1
;;
#get the host version only
--host-version)
get_host_version=1
;;
#replace '#pragma ident' with '#ident'; this is needed to compile OpenMPI due to a configure script bug and non-standardized behaviour of pragma with macros
--replace-pragma-ident)
replace_pragma_ident=1
;;
#remove duplicate link files
--remove-duplicate-link-files)
remove_duplicate_link_files=1
;;
#handle source files to be compiled as cuda files
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1"
@ -124,7 +138,12 @@ do
fi
;;
#Handle shared args (valid for both nvcc and the host compiler)
-D*|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
-D*)
unescape_commas=`echo "$1" | sed -e 's/\\\,/,/g'`
arg=`printf "%q" $unescape_commas`
shared_args="$shared_args $arg"
;;
-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared|-w)
shared_args="$shared_args $1"
;;
#Handle compilation argument
@ -152,7 +171,7 @@ do
shift
;;
#Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1"
;;
#Handle more known nvcc args
@ -164,8 +183,11 @@ do
cuda_args="$cuda_args $1 $2"
shift
;;
-rdc=*|-maxrregcount*|--maxrregcount*)
cuda_args="$cuda_args $1"
;;
#Handle c++11
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
--std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1y|-std=c++1y|--std=c++17|-std=c++17|--std=c++1z|-std=c++1z)
if [ $stdcxx_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
else
@ -205,6 +227,15 @@ do
fi
shift
;;
#Handle -+ (same as -x c++, specifically used for xl compilers, but mutually exclusive with -x. So replace it with -x c++)
-+)
if [ $first_xcompiler_arg -eq 1 ]; then
xcompiler_args="-x,c++"
first_xcompiler_arg=0
else
xcompiler_args="$xcompiler_args,-x,c++"
fi
;;
#Handle -ccbin (if its not set we can set it to a default value)
-ccbin)
cuda_args="$cuda_args $1 $2"
@ -212,18 +243,39 @@ do
host_compiler=$2
shift
;;
#Handle -arch argument (if its not set use a default
-arch*)
#Handle -arch argument (if its not set use a default) this is the version with = sign
-arch*|-gencode*)
cuda_args="$cuda_args $1"
arch_set=1
;;
#Handle -code argument (if its not set use a default) this is the version with = sign
-code*)
cuda_args="$cuda_args $1"
;;
#Handle -arch argument (if its not set use a default) this is the version without = sign
-arch|-gencode)
cuda_args="$cuda_args $1 $2"
arch_set=1
shift
;;
#Handle -code argument (if its not set use a default) this is the version without = sign
-code)
cuda_args="$cuda_args $1 $2"
shift
;;
#Handle -Xcudafe argument
-Xcudafe)
cuda_args="$cuda_args -Xcudafe $2"
shift
;;
#Handle -Xlinker argument
-Xlinker)
xlinker_args="$xlinker_args -Xlinker $2"
shift
;;
#Handle args that should be sent to the linker
-Wl*)
-Wl,*)
xlinker_args="$xlinker_args -Xlinker ${1:4:${#1}}"
host_linker_args="$host_linker_args ${1:4:${#1}}"
;;
@ -256,6 +308,44 @@ do
shift
done
# Only print host compiler version
if [ $get_host_version -eq 1 ]; then
$host_compiler --version
exit
fi
#Remove duplicate object files
if [ $remove_duplicate_link_files -eq 1 ]; then
for obj in $object_files
do
object_files_reverse="$obj $object_files_reverse"
done
object_files_reverse_clean=""
for obj in $object_files_reverse
do
exists=false
for obj2 in $object_files_reverse_clean
do
if [ "$obj" == "$obj2" ]
then
exists=true
echo "Exists: $obj"
fi
done
if [ "$exists" == "false" ]
then
object_files_reverse_clean="$object_files_reverse_clean $obj"
fi
done
object_files=""
for obj in $object_files_reverse_clean
do
object_files="$obj $object_files"
done
fi
#Add default host compiler if necessary
if [ $ccbin_set -ne 1 ]; then
cuda_args="$cuda_args -ccbin $host_compiler"
@ -328,10 +418,19 @@ fi
#Run compilation command
if [ $host_only -eq 1 ]; then
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
echo "$host_command"
fi
$host_command
elif [ -n "$nvcc_depfile_command" ]; then
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
echo "$nvcc_command && $nvcc_depfile_command"
fi
$nvcc_command && $nvcc_depfile_command
else
if [ "$NVCC_WRAPPER_SHOW_COMMANDS_BEING_RUN" == "1" ] ; then
echo "$nvcc_command"
fi
$nvcc_command
fi
error_code=$?

View File

@ -235,3 +235,7 @@ install(FILES
# Install the export set for use with the install-tree
INSTALL(EXPORT KokkosTargets DESTINATION
"${INSTALL_CMAKE_DIR}")
# build and install pkgconfig file
CONFIGURE_FILE(core/src/kokkos.pc.in kokkos.pc @ONLY)
INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/kokkos.pc DESTINATION lib/pkgconfig)

View File

@ -47,7 +47,7 @@ function(set_kokkos_cxx_compiler)
OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+$"
string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
INTERNAL_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION})
endif()

View File

@ -41,7 +41,6 @@ list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
string(TOUPPER ${opt} OPT )
IF(DEFINED Kokkos_ENABLE_${opt})
MESSAGE("Kokkos_ENABLE_${opt} is defined!")
IF(DEFINED KOKKOS_ENABLE_${OPT})
IF(NOT ("${KOKKOS_ENABLE_${OPT}}" STREQUAL "${Kokkos_ENABLE_${opt}}"))
IF(DEFINED KOKKOS_ENABLE_${OPT}_INTERNAL)
@ -59,7 +58,6 @@ foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
ENDIF()
ELSE()
SET(KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT ${Kokkos_ENABLE_${opt}})
MESSAGE("set KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT!")
ENDIF()
ENDIF()
endforeach()
@ -81,6 +79,7 @@ list(APPEND KOKKOS_ARCH_LIST
ARMv80 # (HOST) ARMv8.0 Compatible CPU
ARMv81 # (HOST) ARMv8.1 Compatible CPU
ARMv8-ThunderX # (HOST) ARMv8 Cavium ThunderX CPU
ARMv8-TX2 # (HOST) ARMv8 Cavium ThunderX2 CPU
WSM # (HOST) Intel Westmere CPU
SNB # (HOST) Intel Sandy/Ivy Bridge CPUs
HSW # (HOST) Intel Haswell CPUs
@ -123,11 +122,18 @@ list(APPEND KOKKOS_DEVICES_LIST
# List of possible TPLs for Kokkos
# From Makefile.kokkos: Options: hwloc,librt,experimental_memkind
set(KOKKOS_USE_TPLS_LIST)
if(APPLE)
list(APPEND KOKKOS_USE_TPLS_LIST
HWLOC # hwloc
MEMKIND # experimental_memkind
)
else()
list(APPEND KOKKOS_USE_TPLS_LIST
HWLOC # hwloc
LIBRT # librt
MEMKIND # experimental_memkind
)
endif()
# Map of cmake variables to Makefile variables
set(KOKKOS_INTERNAL_HWLOC hwloc)
set(KOKKOS_INTERNAL_LIBRT librt)
@ -172,6 +178,7 @@ set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
set(tmpr "\n ")
string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_ARCH_DOCSTR "${KOKKOS_ARCH_LIST}")
set(KOKKOS_INTERNAL_ARCH_DOCSTR "${tmpr}${KOKKOS_INTERNAL_ARCH_DOCSTR}")
# This would be useful, but we use Foo_ENABLE mechanisms
#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_DEVICES_DOCSTR "${KOKKOS_DEVICES_LIST}")
#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_USE_TPLS_DOCSTR "${KOKKOS_USE_TPLS_LIST}")
@ -269,7 +276,7 @@ set(KOKKOS_ENABLE_PROFILING_LOAD_PRINT ${KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_P
set_kokkos_default_default(DEPRECATED_CODE ON)
set(KOKKOS_ENABLE_DEPRECATED_CODE ${KOKKOS_INTERNAL_ENABLE_DEPRECATED_CODE_DEFAULT} CACHE BOOL "Enable deprecated code.")
set_kokkos_default_default(EXPLICIT_INSTANTIATION ON)
set_kokkos_default_default(EXPLICIT_INSTANTIATION OFF)
set(KOKKOS_ENABLE_EXPLICIT_INSTANTIATION ${KOKKOS_INTERNAL_ENABLE_EXPLICIT_INSTANTIATION_DEFAULT} CACHE BOOL "Enable explicit template instantiation.")
#-------------------------------------------------------------------------------

View File

@ -15,16 +15,16 @@
# Ensure that KOKKOS_ARCH is in the ARCH_LIST
if (KOKKOS_ARCH MATCHES ",")
message("-- Detected a comma in: KOKKOS_ARCH=${KOKKOS_ARCH}")
message("-- Detected a comma in: KOKKOS_ARCH=`${KOKKOS_ARCH}`")
message("-- Although we prefer KOKKOS_ARCH to be semicolon-delimited, we do allow")
message("-- comma-delimited values for compatibility with scripts (see github.com/trilinos/Trilinos/issues/2330)")
string(REPLACE "," ";" KOKKOS_ARCH "${KOKKOS_ARCH}")
message("-- Commas were changed to semicolons, now KOKKOS_ARCH=${KOKKOS_ARCH}")
message("-- Commas were changed to semicolons, now KOKKOS_ARCH=`${KOKKOS_ARCH}`")
endif()
foreach(arch ${KOKKOS_ARCH})
list(FIND KOKKOS_ARCH_LIST ${arch} indx)
if (indx EQUAL -1)
message(FATAL_ERROR "${arch} is not an accepted value for KOKKOS_ARCH."
message(FATAL_ERROR "`${arch}` is not an accepted value in KOKKOS_ARCH=`${KOKKOS_ARCH}`."
" Please pick from these choices: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
endif ()
endforeach()
@ -130,7 +130,8 @@ string(REPLACE ";" ":" KOKKOS_INTERNAL_ADDTOPATH "${addpathl}")
# Set the KOKKOS_SETTINGS String -- this is the primary communication with the
# makefile configuration. See Makefile.kokkos
set(KOKKOS_SETTINGS KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
set(KOKKOS_SETTINGS KOKKOS_CMAKE=yes)
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_PATH=${KOKKOS_PATH})
set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_INSTALL_PATH=${CMAKE_INSTALL_PREFIX})

View File

@ -241,17 +241,16 @@ elif [ "$MACHINE" = "white" ]; then
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
CUDA_MODULE_LIST2="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.3.0,ibm/xl/13.1.6"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/6.4.0,ibm/xl/16.1.0"
# Don't do pthread on white.
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.6 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/9.0.103 $CUDA_MODULE_LIST2 $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
"gcc/6.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/16.1.0 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
"cuda/9.0.103 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
if [ -z "$ARCH_FLAG" ]; then
@ -362,7 +361,7 @@ elif [ "$MACHINE" = "apollo" ]; then
"gcc/5.3.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS"
"clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS"
"clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread" clang++ $CUDA_WARNING_FLAGS"
"clang/6.0 $CLANG_MODULE_LIST "Cuda_Pthread,OpenMP" clang++ $CUDA_WARNING_FLAGS"
"cuda/9.1 $CUDA_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
else

View File

@ -96,6 +96,7 @@ template< class DataType ,
class Arg3Type = void>
class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
{
template< class , class , class , class > friend class DualView ;
public:
//! \name Typedefs for device types and various Kokkos::View specializations.
//@{
@ -182,8 +183,20 @@ public:
//! \name Counters to keep track of changes ("modified" flags)
//@{
View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device;
View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host;
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
protected:
// modified_flags[0] -> host
// modified_flags[1] -> device
typedef View<unsigned int[2],LayoutLeft,Kokkos::HostSpace> t_modified_flags;
t_modified_flags modified_flags;
public:
#else
typedef View<unsigned int[2],LayoutLeft,typename t_host::execution_space> t_modified_flags;
typedef View<unsigned int,LayoutLeft,typename t_host::execution_space> t_modified_flag;
t_modified_flags modified_flags;
t_modified_flag modified_host,modified_device;
#endif
//@}
//! \name Constructors
@ -194,10 +207,14 @@ public:
/// Both device and host View objects are constructed using their
/// default constructors. The "modified" flags are both initialized
/// to "unmodified."
DualView () :
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
{}
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
DualView () = default;
#else
DualView ():modified_flags (t_modified_flags("DualView::modified_flags")) {
modified_host = t_modified_flag(modified_flags,0);
modified_device = t_modified_flag(modified_flags,1);
}
#endif
/// \brief Constructor that allocates View objects on both host and device.
///
@ -219,17 +236,24 @@ public:
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG)
: d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
, h_view (create_mirror_view (d_view)) // without UVM, host View mirrors
, modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device"))
, modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
{}
, modified_flags (t_modified_flags("DualView::modified_flags"))
{
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
modified_host = t_modified_flag(modified_flags,0);
modified_device = t_modified_flag(modified_flags,1);
#endif
}
//! Copy constructor (shallow copy)
template<class SS, class LS, class DS, class MS>
DualView (const DualView<SS,LS,DS,MS>& src) :
d_view (src.d_view),
h_view (src.h_view),
modified_device (src.modified_device),
modified_host (src.modified_host)
modified_flags (src.modified_flags)
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
, modified_host(src.modified_host)
, modified_device(src.modified_device)
#endif
{}
//! Subview constructor
@ -241,8 +265,11 @@ public:
)
: d_view( Kokkos::subview( src.d_view , arg0 , args ... ) )
, h_view( Kokkos::subview( src.h_view , arg0 , args ... ) )
, modified_device (src.modified_device)
, modified_host (src.modified_host)
, modified_flags (src.modified_flags)
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
, modified_host(src.modified_host)
, modified_device(src.modified_device)
#endif
{}
/// \brief Create DualView from existing device and host View objects.
@ -258,8 +285,7 @@ public:
DualView (const t_dev& d_view_, const t_host& h_view_) :
d_view (d_view_),
h_view (h_view_),
modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
modified_flags (t_modified_flags("DualView::modified_flags"))
{
if ( int(d_view.rank) != int(h_view.rank) ||
d_view.extent(0) != h_view.extent(0) ||
@ -281,6 +307,10 @@ public:
d_view.span() != h_view.span() ) {
Kokkos::Impl::throw_runtime_exception("DualView constructed with incompatible views");
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
modified_host = t_modified_flag(modified_flags,0);
modified_device = t_modified_flag(modified_flags,1);
#endif
}
//@}
@ -316,6 +346,30 @@ public:
t_dev,
t_host>::type& view () const
{
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
constexpr bool device_is_memspace = std::is_same<Device,typename Device::memory_space>::value;
constexpr bool device_is_execspace = std::is_same<Device,typename Device::execution_space>::value;
constexpr bool device_exec_is_t_dev_exec = std::is_same<typename Device::execution_space,typename t_dev::execution_space>::value;
constexpr bool device_mem_is_t_dev_mem = std::is_same<typename Device::memory_space,typename t_dev::memory_space>::value;
constexpr bool device_exec_is_t_host_exec = std::is_same<typename Device::execution_space,typename t_host::execution_space>::value;
constexpr bool device_mem_is_t_host_mem = std::is_same<typename Device::memory_space,typename t_host::memory_space>::value;
constexpr bool device_is_t_host_device = std::is_same<typename Device::execution_space,typename t_host::device_type>::value;
constexpr bool device_is_t_dev_device = std::is_same<typename Device::memory_space,typename t_host::device_type>::value;
static_assert(
device_is_t_dev_device || device_is_t_host_device ||
(device_is_memspace && (device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ) ||
(device_is_execspace && (device_exec_is_t_dev_exec || device_exec_is_t_host_exec) ) ||
(
(!device_is_execspace && !device_is_memspace) && (
(device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec)
)
)
,
"Template parameter to .view() must exactly match one of the DualView's device types or one of the execution or memory spaces");
#endif
return Impl::if_c<
std::is_same<
typename t_dev::memory_space,
@ -324,6 +378,72 @@ public:
t_host >::select (d_view , h_view);
}
KOKKOS_INLINE_FUNCTION
t_host view_host() const {
return h_view;
}
KOKKOS_INLINE_FUNCTION
t_dev view_device() const {
return d_view;
}
template<class Device>
static int get_device_side() {
constexpr bool device_is_memspace = std::is_same<Device,typename Device::memory_space>::value;
constexpr bool device_is_execspace = std::is_same<Device,typename Device::execution_space>::value;
constexpr bool device_exec_is_t_dev_exec = std::is_same<typename Device::execution_space,typename t_dev::execution_space>::value;
constexpr bool device_mem_is_t_dev_mem = std::is_same<typename Device::memory_space,typename t_dev::memory_space>::value;
constexpr bool device_exec_is_t_host_exec = std::is_same<typename Device::execution_space,typename t_host::execution_space>::value;
constexpr bool device_mem_is_t_host_mem = std::is_same<typename Device::memory_space,typename t_host::memory_space>::value;
constexpr bool device_is_t_host_device = std::is_same<typename Device::execution_space,typename t_host::device_type>::value;
constexpr bool device_is_t_dev_device = std::is_same<typename Device::memory_space,typename t_host::device_type>::value;
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
static_assert(
device_is_t_dev_device || device_is_t_host_device ||
(device_is_memspace && (device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ) ||
(device_is_execspace && (device_exec_is_t_dev_exec || device_exec_is_t_host_exec) ) ||
(
(!device_is_execspace && !device_is_memspace) && (
(device_mem_is_t_dev_mem || device_mem_is_t_host_mem) ||
(device_exec_is_t_dev_exec || device_exec_is_t_host_exec)
)
)
,
"Template parameter to .sync() must exactly match one of the DualView's device types or one of the execution or memory spaces");
#endif
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
int dev = -1;
#else
int dev = 0;
#endif
if(device_is_t_dev_device) dev = 1;
else if(device_is_t_host_device) dev = 0;
else {
if(device_is_memspace) {
if(device_mem_is_t_dev_mem) dev = 1;
if(device_mem_is_t_host_mem) dev = 0;
if(device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
}
if(device_is_execspace) {
if(device_exec_is_t_dev_exec) dev = 1;
if(device_exec_is_t_host_exec) dev = 0;
if(device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
}
if(!device_is_execspace && !device_is_memspace) {
if(device_mem_is_t_dev_mem) dev = 1;
if(device_mem_is_t_host_mem) dev = 0;
if(device_mem_is_t_host_mem && device_mem_is_t_dev_mem) dev = -1;
if(device_exec_is_t_dev_exec) dev = 1;
if(device_exec_is_t_host_exec) dev = 0;
if(device_exec_is_t_host_exec && device_exec_is_t_dev_exec) dev = -1;
}
}
return dev;
}
/// \brief Update data on device or host only if data in the other
/// space has been marked as modified.
///
@ -347,23 +467,20 @@ public:
( std::is_same< Device , int>::value)
, int >::type& = 0)
{
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
unsigned int>::select (1, 0);
if(modified_flags.data()==NULL) return;
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
int dev = get_device_side<Device>();
if (dev == 1) { // if Device is the same as DualView's device type
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
deep_copy (d_view, h_view);
modified_host() = modified_device() = 0;
modified_flags(0) = modified_flags(1) = 0;
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
}
if (dev == 0) { // hopefully Device is the same as DualView's host type
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
deep_copy (h_view, d_view);
modified_host() = modified_device() = 0;
modified_flags(0) = modified_flags(1) = 0;
}
}
if(std::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
@ -378,46 +495,71 @@ public:
( std::is_same< Device , int>::value)
, int >::type& = 0 )
{
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
unsigned int>::select (1, 0);
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
if(modified_flags.data()==NULL) return;
int dev = get_device_side<Device>();
if (dev == 1) { // if Device is the same as DualView's device type
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
}
if (dev == 0){ // hopefully Device is the same as DualView's host type
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
}
}
}
void sync_host() {
if( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value )
Impl::throw_runtime_exception("Calling sync_host on a DualView with a const datatype.");
if(modified_flags.data()==NULL) return;
if(modified_flags(1) > modified_flags(0)) {
deep_copy (h_view, d_view);
modified_flags(1) = modified_flags(0) = 0;
}
}
void sync_device() {
if( ! std::is_same< typename traits::data_type , typename traits::non_const_data_type>::value )
Impl::throw_runtime_exception("Calling sync_device on a DualView with a const datatype.");
if(modified_flags.data()==NULL) return;
if(modified_flags(0) > modified_flags(1)) {
deep_copy (d_view, h_view);
modified_flags(1) = modified_flags(0) = 0;
}
}
template<class Device>
bool need_sync() const
{
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value ,
unsigned int,
unsigned int>::select (1, 0);
if(modified_flags.data()==NULL) return false;
int dev = get_device_side<Device>();
if (dev) { // if Device is the same as DualView's device type
if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
if (dev == 1) { // if Device is the same as DualView's device type
if ((modified_flags(0) > 0) && (modified_flags(0) >= modified_flags(1))) {
return true;
}
} else { // hopefully Device is the same as DualView's host type
if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
}
if (dev == 0){ // hopefully Device is the same as DualView's host type
if ((modified_flags(1) > 0) && (modified_flags(1) >= modified_flags(0))) {
return true;
}
}
return false;
}
inline bool need_sync_host() const {
if(modified_flags.data()==NULL) return false;
return modified_flags(0)<modified_flags(1);
}
inline bool need_sync_device() const {
if(modified_flags.data()==NULL) return false;
return modified_flags(1)<modified_flags(0);
}
/// \brief Mark data as modified on the given device \c Device.
///
/// If \c Device is the same as this DualView's device type, then
@ -425,26 +567,22 @@ public:
/// data as modified.
template<class Device>
void modify () {
const unsigned int dev =
Impl::if_c<
std::is_same<
typename t_dev::memory_space,
typename Device::memory_space>::value,
unsigned int,
unsigned int>::select (1, 0);
if(modified_flags.data()==NULL) return;
int dev = get_device_side<Device>();
if (dev) { // if Device is the same as DualView's device type
if (dev == 1) { // if Device is the same as DualView's device type
// Increment the device's modified count.
modified_device () = (modified_device () > modified_host () ?
modified_device () : modified_host ()) + 1;
} else { // hopefully Device is the same as DualView's host type
modified_flags(1) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
}
if (dev == 0) { // hopefully Device is the same as DualView's host type
// Increment the host's modified count.
modified_host () = (modified_device () > modified_host () ?
modified_device () : modified_host ()) + 1;
modified_flags(0) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
}
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
if (modified_host() && modified_device()) {
if (modified_flags(0) && modified_flags(1)) {
std::string msg = "Kokkos::DualView::modify ERROR: ";
msg += "Concurrent modification of host and device views ";
msg += "in DualView \"";
@ -455,6 +593,45 @@ public:
#endif
}
inline void modify_host() {
if(modified_flags.data()!=NULL) {
modified_flags(0) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
if (modified_flags(0) && modified_flags(1)) {
std::string msg = "Kokkos::DualView::modify_host ERROR: ";
msg += "Concurrent modification of host and device views ";
msg += "in DualView \"";
msg += d_view.label();
msg += "\"\n";
Kokkos::abort(msg.c_str());
}
#endif
}
}
inline void modify_device() {
if(modified_flags.data()!=NULL) {
modified_flags(1) = (modified_flags(1) > modified_flags(0) ?
modified_flags(1) : modified_flags(0)) + 1;
#ifdef KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK
if (modified_flags(0) && modified_flags(1)) {
std::string msg = "Kokkos::DualView::modify_device ERROR: ";
msg += "Concurrent modification of host and device views ";
msg += "in DualView \"";
msg += d_view.label();
msg += "\"\n";
Kokkos::abort(msg.c_str());
}
#endif
}
}
inline void clear_sync_state() {
if(modified_flags.data()!=NULL)
modified_flags(1) = modified_flags(0) = 0;
}
//@}
//! \name Methods for reallocating or resizing the View objects.
//@{
@ -476,7 +653,10 @@ public:
h_view = create_mirror_view( d_view );
/* Reset dirty flags */
modified_device() = modified_host() = 0;
if(modified_flags.data()==NULL) {
modified_flags = t_modified_flags("DualView::modified_flags");
} else
modified_flags(1) = modified_flags(0) = 0;
}
/// \brief Resize both views, copying old contents into new if necessary.
@ -491,13 +671,16 @@ public:
const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ,
const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG ) {
if(modified_device() >= modified_host()) {
if(modified_flags.data()==NULL) {
modified_flags = t_modified_flags("DualView::modified_flags");
}
if(modified_flags(1) >= modified_flags(0)) {
/* Resize on Device */
::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
h_view = create_mirror_view( d_view );
/* Mark Device copy as modified */
modified_device() = modified_device()+1;
modified_flags(1) = modified_flags(1)+1;
} else {
/* Realloc on Device */
@ -525,7 +708,7 @@ public:
d_view = create_mirror_view( typename t_dev::execution_space(), h_view );
/* Mark Host copy as modified */
modified_host() = modified_host()+1;
modified_flags(0) = modified_flags(0)+1;
}
}
@ -649,7 +832,10 @@ void
deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
const DualView<ST,SL,SD,SM>& src )
{
if (src.modified_device () >= src.modified_host ()) {
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
return deep_copy(dst.d_view, src.d_view);
}
if (src.modified_flags(1) >= src.modified_flags(0)) {
deep_copy (dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {
@ -666,7 +852,10 @@ deep_copy (const ExecutionSpace& exec ,
DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
const DualView<ST,SL,SD,SM>& src )
{
if (src.modified_device () >= src.modified_host ()) {
if(src.modified_flags.data()==NULL || dst.modified_flags.data()==NULL) {
return deep_copy(exec, dst.d_view, src.d_view);
}
if (src.modified_flags(1) >= src.modified_flags(0)) {
deep_copy (exec, dst.d_view, src.d_view);
dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
} else {

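The DualView changes above consolidate the old modified_host/modified_device counters into a single two-element modified_flags view and add space-free helpers such as view_host(), view_device(), modify_host(), modify_device(), sync_host(), sync_device(), need_sync_host(), need_sync_device(), and clear_sync_state(). A minimal usage sketch of the typical round trip with the new interface; the names dv and "touch" are illustrative only:

Kokkos::DualView<double*> dv("dv", 100);
dv.view_host()(0) = 1.0;   // write through the host view
dv.modify_host();          // record that the host copy changed
dv.sync_device();          // deep_copy host -> device only if the device copy is stale
auto d = dv.view_device();
Kokkos::parallel_for("touch", 100, KOKKOS_LAMBDA(const int i) { d(i) += 1.0; });
dv.modify_device();
dv.sync_host();            // deep_copy device -> host only if the host copy is stale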
View File

@ -64,7 +64,7 @@ namespace Impl {
template <typename Specialize>
struct DynRankDimTraits {
enum : size_t{unspecified =KOKKOS_INVALID_INDEX};
enum : size_t{unspecified = KOKKOS_INVALID_INDEX};
// Compute the rank of the view from the nonzero dimension arguments.
KOKKOS_INLINE_FUNCTION
@ -384,8 +384,8 @@ public:
// Removed dimension checks...
typedef typename DstType::offset_type dst_offset_type ;
dst.m_map.m_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_handle , src.m_track );
dst.m_map.m_impl_offset = dst_offset_type(std::integral_constant<unsigned,0>() , src.layout() ); //Check this for integer input1 for padding, etc
dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle< DstTraits >::assign( src.m_map.m_impl_handle , src.m_track );
dst.m_track.assign( src.m_track , DstTraits::is_managed );
dst.m_rank = src.Rank ;
}
@ -565,10 +565,14 @@ public:
//----------------------------------------
// Allow specializations to query their specialized map
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
implementation_map() const { return m_map ; }
#endif
KOKKOS_INLINE_FUNCTION
const Kokkos::Impl::ViewMapping< traits , void > &
impl_map() const { return m_map ; }
//----------------------------------------
@ -624,7 +628,7 @@ public:
reference_type operator()() const
{
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
return implementation_map().reference();
return impl_map().reference();
//return m_map.reference(0,0,0,0,0,0,0);
}
@ -647,7 +651,7 @@ public:
typename std::enable_if< !std::is_same<typename drvtraits::value_type, typename drvtraits::scalar_array_type>::value && std::is_integral<iType>::value, reference_type>::type
operator[](const iType & i0) const
{
// auto map = implementation_map();
// auto map = impl_map();
const size_t dim_scalar = m_map.dimension_scalar();
const size_t bytes = this->span() / dim_scalar;
@ -785,7 +789,7 @@ public:
reference_type access() const
{
KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (0 , this->rank(), m_track, m_map) )
return implementation_map().reference();
return impl_map().reference();
//return m_map.reference(0,0,0,0,0,0,0);
}
@ -1004,7 +1008,7 @@ public:
//----------------------------------------
// Allocation according to allocation properties and array layout
// unused arg_layout dimensions must be set toKOKKOS_INVALID_INDEX so that rank deduction can properly take place
// unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that rank deduction can properly take place
template< class ... P >
explicit inline
DynRankView( const Kokkos::Impl::ViewCtorProp< P ... > & arg_prop
@ -1179,7 +1183,7 @@ public:
: DynRankView( Kokkos::Impl::ViewCtorProp< std::string >( arg_label )
, typename traits::array_layout
( arg_N0 , arg_N1 , arg_N2 , arg_N3 , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
)
)
{}
// For backward compatibility
@ -1189,8 +1193,7 @@ public:
, const typename traits::array_layout & arg_layout
)
: DynRankView( Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
, Impl::DynRankDimTraits<typename traits::specialize>::createLayout(arg_layout)
, arg_layout
)
{}
@ -1205,7 +1208,9 @@ public:
, const size_t arg_N6 =KOKKOS_INVALID_INDEX
, const size_t arg_N7 =KOKKOS_INVALID_INDEX
)
: DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing ), arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7 )
: DynRankView(Kokkos::Impl::ViewCtorProp< std::string , Kokkos::Impl::WithoutInitializing_t >( arg_prop.label , Kokkos::WithoutInitializing )
, typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)
)
{}
//----------------------------------------
@ -1445,30 +1450,30 @@ public:
ret_type dst ;
const SubviewExtents< 7 , rank > extents =
ExtentGenerator< Args ... >::generator( src.m_map.m_offset.m_dim , args... ) ;
ExtentGenerator< Args ... >::generator( src.m_map.m_impl_offset.m_dim , args... ) ;
dst_offset_type tempdst( src.m_map.m_offset , extents ) ;
dst_offset_type tempdst( src.m_map.m_impl_offset , extents ) ;
dst.m_track = src.m_track ;
dst.m_map.m_offset.m_dim.N0 = tempdst.m_dim.N0 ;
dst.m_map.m_offset.m_dim.N1 = tempdst.m_dim.N1 ;
dst.m_map.m_offset.m_dim.N2 = tempdst.m_dim.N2 ;
dst.m_map.m_offset.m_dim.N3 = tempdst.m_dim.N3 ;
dst.m_map.m_offset.m_dim.N4 = tempdst.m_dim.N4 ;
dst.m_map.m_offset.m_dim.N5 = tempdst.m_dim.N5 ;
dst.m_map.m_offset.m_dim.N6 = tempdst.m_dim.N6 ;
dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0 ;
dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1 ;
dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2 ;
dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3 ;
dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4 ;
dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5 ;
dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6 ;
dst.m_map.m_offset.m_stride.S0 = tempdst.m_stride.S0 ;
dst.m_map.m_offset.m_stride.S1 = tempdst.m_stride.S1 ;
dst.m_map.m_offset.m_stride.S2 = tempdst.m_stride.S2 ;
dst.m_map.m_offset.m_stride.S3 = tempdst.m_stride.S3 ;
dst.m_map.m_offset.m_stride.S4 = tempdst.m_stride.S4 ;
dst.m_map.m_offset.m_stride.S5 = tempdst.m_stride.S5 ;
dst.m_map.m_offset.m_stride.S6 = tempdst.m_stride.S6 ;
dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0 ;
dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1 ;
dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2 ;
dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3 ;
dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4 ;
dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5 ;
dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6 ;
dst.m_map.m_handle = dst_handle_type( src.m_map.m_handle +
src.m_map.m_offset( extents.domain_offset(0)
dst.m_map.m_impl_handle = dst_handle_type( src.m_map.m_impl_handle +
src.m_map.m_impl_offset( extents.domain_offset(0)
, extents.domain_offset(1)
, extents.domain_offset(2)
, extents.domain_offset(3)
@ -1896,6 +1901,7 @@ inline
typename DynRankView<T,P...>::HostMirror
create_mirror( const DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
@ -1914,6 +1920,7 @@ inline
typename DynRankView<T,P...>::HostMirror
create_mirror( const DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
@ -1929,7 +1936,11 @@ create_mirror( const DynRankView<T,P...> & src
// Create a mirror in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorDRVType<Space,T,P ...>::view_type create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src) {
typename Impl::MirrorDRVType<Space,T,P ...>::view_type
create_mirror(const Space& , const Kokkos::DynRankView<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value
>::type * = 0) {
return typename Impl::MirrorDRVType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
}
@ -1985,6 +1996,29 @@ create_mirror_view(const Space& , const Kokkos::DynRankView<T,P...> & src
return typename Impl::MirrorDRViewType<Space,T,P ...>::view_type(src.label(), Impl::reconstructLayout(src.layout(), src.rank()) );
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::DynRankView<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorDRViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::DynRankView<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorDRViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorDRViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror( Kokkos::ViewAllocateWithoutInitializing(label), Impl::reconstructLayout(src.layout(), src.rank()) );
deep_copy(mirror, src);
return mirror;
}
} //end Kokkos
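create_mirror_view_and_copy for DynRankView, added above, follows the existing View overload: when the requested space already matches the source's memory space the source is returned as-is, otherwise a mirror is allocated without initialization and deep-copied. A brief sketch of the call pattern, mirroring the unit test added later in this patch; the view names are illustrative:

Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A", 10);
// Same memory space: no allocation, a_h2 aliases a_h.
auto a_h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), a_h);
// Different memory space (e.g. CudaSpace): allocates a device mirror and deep-copies a_h into it.
auto a_d = Kokkos::create_mirror_view_and_copy(Kokkos::DefaultExecutionSpace::memory_space(), a_h);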

File diff suppressed because it is too large

View File

@ -47,7 +47,9 @@
#include <string>
#include <vector>
#include <Kokkos_Core.hpp>
#include <Kokkos_View.hpp>
#include <Kokkos_Parallel.hpp>
#include <Kokkos_Parallel_Reduce.hpp>
namespace Kokkos {

View File

@ -86,14 +86,13 @@ public:
vector():DV() {
_size = 0;
_extra_storage = 1.1;
DV::modified_host() = 1;
}
vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Arg1Type>("Vector",size_t(n*(1.1))) {
_size = n;
_extra_storage = 1.1;
DV::modified_host() = 1;
DV::modified_flags(0) = 1;
assign(n,val);
}
@ -119,16 +118,16 @@ public:
/* Assign value either on host or on device */
if( DV::modified_host() >= DV::modified_device() ) {
if( DV::template need_sync<typename DV::t_dev::device_type>() ) {
set_functor_host f(DV::h_view,val);
parallel_for(n,f);
DV::t_host::execution_space::fence();
DV::modified_host()++;
DV::template modify<typename DV::t_host::device_type>();
} else {
set_functor f(DV::d_view,val);
parallel_for(n,f);
DV::t_dev::execution_space::fence();
DV::modified_device()++;
DV::template modify<typename DV::t_dev::device_type>();
}
}
@ -137,7 +136,8 @@ public:
}
void push_back(Scalar val) {
DV::modified_host()++;
DV::template sync<typename DV::t_host::device_type>();
DV::template modify<typename DV::t_host::device_type>();
if(_size == span()) {
size_t new_size = _size*_extra_storage;
if(new_size == _size) new_size++;
@ -247,10 +247,10 @@ public:
}
void on_host() {
DV::modified_host() = DV::modified_device() + 1;
DV::template modify<typename DV::t_host::device_type>();
}
void on_device() {
DV::modified_device() = DV::modified_host() + 1;
DV::template modify<typename DV::t_dev::device_type>();
}
void set_overallocation(float extra) {

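The Kokkos::vector hunks above replace direct manipulation of the old modified_host()/modified_device() counters with the DualView modify<>()/need_sync<>() interface, so the container now participates in the same flag bookkeeping as any other DualView. A short sketch of typical use after this change; the values are illustrative:

Kokkos::vector<double> v(16, 1.0);           // assign() marks whichever side it filled as modified
v.push_back(2.0);                            // syncs to the host, appends, and marks the host side modified
v.sync<Kokkos::DefaultExecutionSpace>();     // bring the device copy up to date before device-side use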
View File

@ -23,6 +23,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
threads/TestThreads_DynRankViewAPI_rank12345.cpp
threads/TestThreads_DynRankViewAPI_rank67.cpp
threads/TestThreads_ErrorReporter.cpp
threads/TestThreads_OffsetView.cpp
threads/TestThreads_ScatterView.cpp
threads/TestThreads_StaticCrsGraph.cpp
threads/TestThreads_UnorderedMap.cpp
@ -47,6 +48,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
serial/TestSerial_DynRankViewAPI_rank12345.cpp
serial/TestSerial_DynRankViewAPI_rank67.cpp
serial/TestSerial_ErrorReporter.cpp
serial/TestSerial_OffsetView.cpp
serial/TestSerial_ScatterView.cpp
serial/TestSerial_StaticCrsGraph.cpp
serial/TestSerial_UnorderedMap.cpp
@ -71,6 +73,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
openmp/TestOpenMP_DynRankViewAPI_rank12345.cpp
openmp/TestOpenMP_DynRankViewAPI_rank67.cpp
openmp/TestOpenMP_ErrorReporter.cpp
openmp/TestOpenMP_OffsetView.cpp
openmp/TestOpenMP_ScatterView.cpp
openmp/TestOpenMP_StaticCrsGraph.cpp
openmp/TestOpenMP_UnorderedMap.cpp
@ -95,6 +98,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
cuda/TestCuda_DynRankViewAPI_rank12345.cpp
cuda/TestCuda_DynRankViewAPI_rank67.cpp
cuda/TestCuda_ErrorReporter.cpp
cuda/TestCuda_OffsetView.cpp
cuda/TestCuda_ScatterView.cpp
cuda/TestCuda_StaticCrsGraph.cpp
cuda/TestCuda_UnorderedMap.cpp

View File

@ -39,6 +39,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
OBJ_CUDA += TestCuda_DynRankViewAPI_rank12345.o
OBJ_CUDA += TestCuda_DynRankViewAPI_rank67.o
OBJ_CUDA += TestCuda_ErrorReporter.o
OBJ_CUDA += TestCuda_OffsetView.o
OBJ_CUDA += TestCuda_ScatterView.o
OBJ_CUDA += TestCuda_StaticCrsGraph.o
OBJ_CUDA += TestCuda_UnorderedMap.o
@ -57,6 +58,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
OBJ_ROCM += TestROCm_DynRankViewAPI_rank12345.o
OBJ_ROCM += TestROCm_DynRankViewAPI_rank67.o
OBJ_ROCM += TestROCm_ErrorReporter.o
OBJ_ROCM += TestROCm_OffsetView.o
OBJ_ROCM += TestROCm_ScatterView.o
OBJ_ROCM += TestROCm_StaticCrsGraph.o
OBJ_ROCM += TestROCm_UnorderedMap.o
@ -75,6 +77,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
OBJ_THREADS += TestThreads_DynRankViewAPI_rank12345.o
OBJ_THREADS += TestThreads_DynRankViewAPI_rank67.o
OBJ_THREADS += TestThreads_ErrorReporter.o
OBJ_THREADS += TestThreads_OffsetView.o
OBJ_THREADS += TestThreads_ScatterView.o
OBJ_THREADS += TestThreads_StaticCrsGraph.o
OBJ_THREADS += TestThreads_UnorderedMap.o
@ -93,6 +96,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
OBJ_OPENMP += TestOpenMP_DynRankViewAPI_rank12345.o
OBJ_OPENMP += TestOpenMP_DynRankViewAPI_rank67.o
OBJ_OPENMP += TestOpenMP_ErrorReporter.o
OBJ_OPENMP += TestOpenMP_OffsetView.o
OBJ_OPENMP += TestOpenMP_ScatterView.o
OBJ_OPENMP += TestOpenMP_StaticCrsGraph.o
OBJ_OPENMP += TestOpenMP_UnorderedMap.o
@ -111,6 +115,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
OBJ_SERIAL += TestSerial_DynRankViewAPI_rank12345.o
OBJ_SERIAL += TestSerial_DynRankViewAPI_rank67.o
OBJ_SERIAL += TestSerial_ErrorReporter.o
OBJ_SERIAL += TestSerial_OffsetView.o
OBJ_SERIAL += TestSerial_ScatterView.o
OBJ_SERIAL += TestSerial_StaticCrsGraph.o
OBJ_SERIAL += TestSerial_UnorderedMap.o

View File

@ -729,6 +729,7 @@ public:
static void run_tests() {
run_test_resize_realloc();
run_test_mirror();
run_test_mirror_and_copy();
run_test_scalar();
run_test();
run_test_const();
@ -885,6 +886,69 @@ public:
}
}
static void run_test_mirror_and_copy()
{
// LayoutLeft
{
Kokkos::DynRankView< double, Kokkos::LayoutLeft, Kokkos::HostSpace > a_org( "A", 10 );
a_org(5) = 42.0;
Kokkos::DynRankView< double, Kokkos::LayoutLeft, Kokkos::HostSpace > a_h = a_org;
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
ASSERT_EQ( equal_ptr_h_h2, 1 );
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
ASSERT_EQ( a_h.rank(), a_org.rank() );
ASSERT_EQ( a_h.rank(), a_h2.rank() );
ASSERT_EQ( a_h.rank(), a_h3.rank() );
ASSERT_EQ( a_h.rank(), a_d.rank() );
ASSERT_EQ( a_org(5), a_h3(5) );
}
// LayoutRight
{
Kokkos::DynRankView< double, Kokkos::LayoutRight, Kokkos::HostSpace > a_org( "A", 10 );
a_org(5) = 42.0;
Kokkos::DynRankView< double, Kokkos::LayoutRight, Kokkos::HostSpace > a_h = a_org;
auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h );
auto a_d = Kokkos::create_mirror_view_and_copy( DeviceType(), a_h );
auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d );
int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0;
int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0;
int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0;
int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0;
int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0;
ASSERT_EQ( equal_ptr_h_h2, 1 );
ASSERT_EQ( equal_ptr_h_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h2_d, is_same_memspace );
ASSERT_EQ( equal_ptr_h3_d, is_same_memspace );
ASSERT_EQ( a_h.extent(0), a_h3.extent(0) );
ASSERT_EQ( a_h.extent(0), a_h2.extent(0) );
ASSERT_EQ( a_h.extent(0), a_d .extent(0) );
ASSERT_EQ( a_h.rank(), a_org.rank() );
ASSERT_EQ( a_h.rank(), a_h2.rank() );
ASSERT_EQ( a_h.rank(), a_h3.rank() );
ASSERT_EQ( a_h.rank(), a_d.rank() );
ASSERT_EQ( a_org(5), a_h3(5) );
}
}
static void run_test_scalar()
{
typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView
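The new run_test_mirror_and_copy test above exercises Kokkos::create_mirror_view_and_copy for DynRankView: the call allocates and copies only when the requested memory space differs from the source's, otherwise it returns the original view unchanged, which is what the pointer-equality assertions rely on. A minimal sketch of the same call on a plain View (names invented, not taken from this patch):

#include <Kokkos_Core.hpp>

void mirror_and_copy_demo() {
  Kokkos::View<double*, Kokkos::HostSpace> h("h", 10);
  h(5) = 42.0;
  // Copies to device memory only if the default device's memory space
  // is not HostSpace; otherwise the same view is returned (no allocation).
  auto d = Kokkos::create_mirror_view_and_copy(
      Kokkos::DefaultExecutionSpace::memory_space(), h);
  // Round-trip back to the host; on a host-only build this is again a no-op.
  auto h2 = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d);
  (void) h2;
}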

View File

@ -0,0 +1,426 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
/*
* FIXME the OffsetView class is really not very well tested.
*/
#ifndef CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_
#define CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_
#include <gtest/gtest.h>
#include <iostream>
#include <cstdlib>
#include <cstdio>
#include <impl/Kokkos_Timer.hpp>
#include <Kokkos_OffsetView.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
using std::endl;
using std::cout;
namespace Test{
template <typename Scalar, typename Device>
void test_offsetview_construction(unsigned int size)
{
typedef Kokkos::Experimental::OffsetView<Scalar**, Device> offset_view_type;
typedef Kokkos::View<Scalar**, Device> view_type;
Kokkos::Experimental::index_list_type range0 = {-1, 3};
Kokkos::Experimental::index_list_type range1 = {-2, 2};
offset_view_type ov("firstOV", range0, range1);
ASSERT_EQ("firstOV", ov.label());
ASSERT_EQ(2, ov.Rank);
ASSERT_EQ(ov.begin(0), -1);
ASSERT_EQ(ov.end(0), 4);
ASSERT_EQ(ov.begin(1), -2);
ASSERT_EQ(ov.end(1), 3);
ASSERT_EQ(ov.extent(0), 5);
ASSERT_EQ(ov.extent(1), 5);
const int ovmin0 = ov.begin(0);
const int ovend0 = ov.end(0);
const int ovmin1 = ov.begin(1);
const int ovend1 = ov.end(1);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
{
Kokkos::Experimental::OffsetView<Scalar*, Device> offsetV1("OneDOffsetView", range0);
Kokkos::RangePolicy<Device, int> rangePolicy1(offsetV1.begin(0), offsetV1.end(0));
Kokkos::parallel_for(rangePolicy1, KOKKOS_LAMBDA (const int i){
offsetV1(i) = 1;
}
);
Kokkos::fence();
int OVResult = 0;
Kokkos::parallel_reduce(rangePolicy1, KOKKOS_LAMBDA(const int i, int & updateMe){
updateMe += offsetV1(i);
}, OVResult);
Kokkos::fence();
ASSERT_EQ(OVResult, offsetV1.end(0) - offsetV1.begin(0)) << "found wrong number of elements in OffsetView that was summed.";
}
{ // test deep copy of a scalar const value into a mirror
const int constVal = 6;
typename offset_view_type::HostMirror hostOffsetView =
Kokkos::Experimental::create_mirror_view(ov);
Kokkos::Experimental::deep_copy(hostOffsetView, constVal);
for(int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
for(int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
ASSERT_EQ(hostOffsetView(i,j), constVal) << "Bad data found in OffsetView";
}
}
}
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::point_type point_type;
range_type rangePolicy2D(point_type{ {ovmin0, ovmin1 } },
point_type{ { ovend0, ovend1 } });
const int constValue = 9;
Kokkos::parallel_for(rangePolicy2D, KOKKOS_LAMBDA (const int i, const int j) {
ov(i,j) = constValue;
}
);
// test OffsetView to OffsetView-mirror deep copy
typename offset_view_type::HostMirror hostOffsetView =
Kokkos::Experimental::create_mirror_view(ov);
Kokkos::Experimental::deep_copy(hostOffsetView, ov);
for(int i = hostOffsetView.begin(0); i < hostOffsetView.end(0); ++i) {
for(int j = hostOffsetView.begin(1); j < hostOffsetView.end(1); ++j) {
ASSERT_EQ(hostOffsetView(i,j), constValue) << "Bad data found in OffsetView";
}
}
int OVResult = 0;
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += ov(i, j);
}, OVResult);
int answer = 0;
for(int i = ov.begin(0); i < ov.end(0); ++i) {
for(int j = ov.begin(1); j < ov.end(1); ++j) {
answer += constValue;
}
}
ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView";
#endif
{
offset_view_type ovCopy(ov);
ASSERT_EQ(ovCopy==ov, true) <<
"Copy constructor or equivalence operator broken";
}
{
offset_view_type ovAssigned = ov;
ASSERT_EQ(ovAssigned==ov, true) <<
"Assignment operator or equivalence operator broken";
}
{ //construct OffsetView from a View plus begins array
const int extent0 = 100;
const int extent1 = 200;
const int extent2 = 300;
Kokkos::View<Scalar***, Device> view3D("view3D", extent0, extent1, extent2);
Kokkos::deep_copy(view3D, 1);
Kokkos::Array<int64_t,3> begins = {{-10, -20, -30}};
Kokkos::Experimental::OffsetView<Scalar***, Device> offsetView3D(view3D, begins);
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<3>, Kokkos::IndexType<int64_t> > range3_type;
typedef typename range3_type::point_type point3_type;
range3_type rangePolicy3DZero(point3_type{ {0, 0, 0 } },
point3_type{ { extent0, extent1, extent2 } });
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int view3DSum = 0;
Kokkos::parallel_reduce(rangePolicy3DZero, KOKKOS_LAMBDA(const int i, const int j, int k, int & updateMe){
updateMe += view3D(i, j, k);
}, view3DSum);
range3_type rangePolicy3D(point3_type{ {begins[0], begins[1], begins[2] } },
point3_type{ { begins[0] + extent0, begins[1] + extent1, begins[2] + extent2 } });
int offsetView3DSum = 0;
Kokkos::parallel_reduce(rangePolicy3D, KOKKOS_LAMBDA(const int i, const int j, int k, int & updateMe){
updateMe += offsetView3D(i, j, k);
}, offsetView3DSum);
ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken.";
#endif
}
view_type viewFromOV = ov.view();
ASSERT_EQ(viewFromOV == ov, true) <<
"OffsetView::view() or equivalence operator View == OffsetView broken";
{
offset_view_type ovFromV(viewFromOV, {-1, -2});
ASSERT_EQ(ovFromV == viewFromOV , true) <<
"Construction of OffsetView from View or equivalence operator OffsetView == View broken";
}
{
offset_view_type ovFromV = viewFromOV;
ASSERT_EQ(ovFromV == viewFromOV , true) <<
"Construction of OffsetView from View by assignment (implicit conversion) or equivalence operator OffsetView == View broken";
}
{// test offsetview to view deep copy
view_type aView("aView", ov.extent(0), ov.extent(1));
Kokkos::Experimental::deep_copy(aView, ov);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int sum = 0;
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += ov(i, j) - aView(i- ov.begin(0), j-ov.begin(1));
}, sum);
ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken.";
#endif
}
{// test view to offsetview deep copy
view_type aView("aView", ov.extent(0), ov.extent(1));
Kokkos::deep_copy(aView, 99);
Kokkos::Experimental::deep_copy(ov, aView);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
int sum = 0;
Kokkos::parallel_reduce(rangePolicy2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += ov(i, j) - aView(i- ov.begin(0), j-ov.begin(1));
}, sum);
ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken.";
#endif
}
}
template <typename Scalar, typename Device>
void test_offsetview_subview(unsigned int size)
{
{//test subview 1
Kokkos::Experimental::OffsetView<Scalar*, Device> sliceMe("offsetToSlice", {-10, 20});
{
auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0);
ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken.";
}
}
{//test subview 2
Kokkos::Experimental::OffsetView<Scalar**, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30});
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(),-2);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
}
{//test subview rank 3
Kokkos::Experimental::OffsetView<Scalar***, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30}, {-30,40});
//slice 1
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,Kokkos::ALL(),Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,Kokkos::ALL(), 0,Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,0, Kokkos::ALL(),Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe,0, Kokkos::ALL(), Kokkos::make_pair(-30, -21));
ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken.";
ASSERT_EQ(offsetSubview.begin(0) , -20);
ASSERT_EQ(offsetSubview.end(0) , 31);
ASSERT_EQ(offsetSubview.begin(1) , 0);
ASSERT_EQ(offsetSubview.end(1) , 9);
#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA)
typedef Kokkos::MDRangePolicy<Device, Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
typedef typename range_type::point_type point_type;
const int b0 = offsetSubview.begin(0);
const int b1 = offsetSubview.begin(1);
const int e0 = offsetSubview.end(0);
const int e1 = offsetSubview.end(1);
range_type rangeP2D(point_type{ {b0, b1 } }, point_type{ { e0, e1} });
Kokkos::parallel_for(rangeP2D, KOKKOS_LAMBDA(const int i, const int j) {
offsetSubview(i,j) = 6;
}
);
int sum = 0;
Kokkos::parallel_reduce(rangeP2D, KOKKOS_LAMBDA(const int i, const int j, int & updateMe){
updateMe += offsetSubview(i, j);
}, sum);
ASSERT_EQ(sum, 6*(e0-b0)*(e1-b1));
#endif
}
// slice 2
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
}
{//test subview rank 4
Kokkos::Experimental::OffsetView<Scalar****, Device> sliceMe("offsetToSlice", {-10,20}, {-20,30}, {-30,40}, {-40, 50});
//slice 1
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(),Kokkos::ALL(), Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe ,Kokkos::ALL(), 0, Kokkos::ALL(),Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe , 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL() );
ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken.";
}
// slice 2
auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0);
ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken.";
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
{
auto offsetSubview2b = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL());
ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken.";
}
// slice 3
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0);
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
{
auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL());
ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken.";
}
}
}
TEST_F( TEST_CATEGORY, offsetview_construction) {
test_offsetview_construction<int,TEST_EXECSPACE>(10);
}
TEST_F( TEST_CATEGORY, offsetview_subview) {
test_offsetview_subview<int,TEST_EXECSPACE>(10);
}
} // namespace Test
#endif /* CONTAINERS_UNIT_TESTS_TESTOFFSETVIEW_HPP_ */
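
Taken together, the new TestOffsetView.hpp covers construction, iteration, deep copies, and subviews of the experimental OffsetView container. A short usage sketch distilled from the test above (names invented; assumes device lambda support where CUDA is the backend). Note that the index ranges are inclusive {first, last} pairs, so {-1, 3} yields begin(0) == -1 and end(0) == 4:

#include <Kokkos_Core.hpp>
#include <Kokkos_OffsetView.hpp>

void offsetview_demo() {
  // 5 x 5 entries, with indices starting at -1 and -2 respectively.
  Kokkos::Experimental::OffsetView<double**, Kokkos::DefaultExecutionSpace>
      ov("ov", {-1, 3}, {-2, 2});

  const int b0 = ov.begin(0), b1 = ov.begin(1);
  const int e0 = ov.end(0),   e1 = ov.end(1);
  typedef Kokkos::MDRangePolicy<Kokkos::Rank<2>, Kokkos::IndexType<int> > range_type;
  typedef range_type::point_type point_type;
  range_type policy(point_type{{b0, b1}}, point_type{{e0, e1}});

  Kokkos::parallel_for(policy, KOKKOS_LAMBDA(const int i, const int j) {
    ov(i, j) = double(i + j);   // negative indices are valid inside the declared ranges
  });
  Kokkos::fence();
}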

View File

@ -80,7 +80,9 @@ void test_scatter_view_config(int n)
Kokkos::Experimental::contribute(original_view, scatter_view);
}
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
Kokkos::fence();
auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view);
Kokkos::fence();
for (typename decltype(host_view)::size_type i = 0; i < host_view.extent(0); ++i) {
auto val0 = host_view(i, 0);
auto val1 = host_view(i, 1);
@ -111,9 +113,6 @@ struct TestDuplicatedScatterView {
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterDuplicated,
Kokkos::Experimental::ScatterAtomic>(n);
}
};
@ -127,6 +126,16 @@ struct TestDuplicatedScatterView<Kokkos::Cuda> {
};
#endif
#ifdef KOKKOS_ENABLE_ROCM
// disable duplicated instantiation with ROCm until
// UniqueToken can support it
template <>
struct TestDuplicatedScatterView<Kokkos::Experimental::ROCm> {
TestDuplicatedScatterView(int) {
}
};
#endif
template <typename ExecSpace>
void test_scatter_view(int n)
{
@ -142,16 +151,28 @@ void test_scatter_view(int n)
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterNonAtomic>(n);
}
#ifdef KOKKOS_ENABLE_SERIAL
if (!std::is_same<ExecSpace, Kokkos::Serial>::value) {
#endif
test_scatter_view_config<ExecSpace, Kokkos::LayoutRight,
Kokkos::Experimental::ScatterNonDuplicated,
Kokkos::Experimental::ScatterAtomic>(n);
#ifdef KOKKOS_ENABLE_SERIAL
}
#endif
TestDuplicatedScatterView<ExecSpace> duptest(n);
}
TEST_F( TEST_CATEGORY, scatterview) {
#ifndef KOKKOS_ENABLE_ROCM
test_scatter_view<TEST_EXECSPACE>(10);
#ifdef KOKKOS_ENABLE_DEBUG
test_scatter_view<TEST_EXECSPACE>(100000);
#else
test_scatter_view<TEST_EXECSPACE>(10000000);
#endif
#endif
}
} // namespace Test
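
The contribute() call and the Duplicated/NonDuplicated, Atomic/NonAtomic configurations exercised above correspond to the usual ScatterView pattern: accumulate through a per-thread (or atomic) handle inside the kernel, then merge the contributions back into the original view. A rough sketch with invented names, assuming the experimental ScatterView header shipped with this Kokkos version:

#include <Kokkos_Core.hpp>
#include <Kokkos_ScatterView.hpp>

void histogram(Kokkos::View<int*> bins, Kokkos::View<const int*> samples) {
  Kokkos::Experimental::ScatterView<int*> scatter(bins);
  Kokkos::parallel_for("accumulate", samples.extent(0), KOKKOS_LAMBDA(const int i) {
    auto access = scatter.access();   // duplicated or atomic handle, chosen per backend
    access(samples(i)) += 1;          // race-free scatter-add
  });
  Kokkos::Experimental::contribute(bins, scatter);  // reduce duplicates into bins
}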

View File

@ -46,6 +46,7 @@
#include <vector>
#include <Kokkos_StaticCrsGraph.hpp>
#include <Kokkos_Core.hpp>
/*--------------------------------------------------------------------------*/
namespace Test {

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<cuda/TestCuda_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<openmp/TestOpenMP_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -60,6 +60,6 @@ protected:
} // namespace Test
#define TEST_CATEGORY rocm
#define TEST_EXECSPACE Kokkos::ROCm
#define TEST_EXECSPACE Kokkos::Experimental::ROCm
#endif

View File

@ -0,0 +1,46 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<serial/TestSerial_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -0,0 +1,47 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Christian R. Trott (crtrott@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include<threads/TestThreads_Category.hpp>
#include<TestOffsetView.hpp>

View File

@ -108,3 +108,7 @@ else()
endif()
#-----------------------------------------------------------------------------
# build and install pkgconfig file
CONFIGURE_FILE(kokkos.pc.in kokkos.pc @ONLY)
INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/kokkos.pc DESTINATION lib/pkgconfig)

View File

@ -208,7 +208,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
@ -264,7 +264,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
@ -321,7 +321,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {
@ -370,7 +370,7 @@ struct CudaParallelLaunch< DriverType
, const int shmem
, const cudaStream_t stream = 0 )
{
if ( grid.x && ( block.x * block.y * block.z ) ) {
if ( (grid.x != 0) && ( ( block.x * block.y * block.z ) != 0 ) ) {
if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
sizeof( DriverType ) ) {

View File

@ -453,6 +453,8 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Set the last element to zero, in case c_str is too long
header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
// Copy to device memory
Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
@ -491,6 +493,9 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Set the last element to zero, in case c_str is too long
RecordBase::m_alloc_ptr->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
}
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
@ -525,6 +530,8 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
, arg_label.c_str()
, SharedAllocationHeader::maximum_label_length
);
// Set the last element to zero, in case c_str is too long
RecordBase::m_alloc_ptr->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char) 0;
}
//----------------------------------------------------------------------------
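
The three added assignments guard against the strncpy-style copy into the fixed-length label buffer: when the label is at least maximum_label_length characters long, strncpy leaves the buffer without a terminating NUL, so the last byte is forced to zero. The same idiom in isolation (buffer size invented):

#include <cstring>

void set_label(char (&dst)[32], const char* src) {
  std::strncpy(dst, src, sizeof(dst));
  // strncpy does not null-terminate when src fills the whole buffer.
  dst[sizeof(dst) - 1] = '\0';
}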

View File

@ -689,9 +689,13 @@ Cuda::size_type cuda_internal_multiprocessor_count()
CudaSpace::size_type cuda_internal_maximum_concurrent_block_count()
{
#if defined(KOKKOS_ARCH_KEPLER)
// Compute capability 3.0 through 3.7
enum : int { max_resident_blocks_per_multiprocessor = 16 };
#else
// Compute capability 5.0 through 6.2
enum : int { max_resident_blocks_per_multiprocessor = 32 };
#endif
return CudaInternal::singleton().m_multiProcCount
* max_resident_blocks_per_multiprocessor ;
};
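
With the new comments, the limit is simply the multiprocessor count times the per-architecture cap on resident blocks. As assumed examples (not from the patch): a 15-SM Kepler-class device allows 15 * 16 = 240 concurrent blocks, while a 56-SM Pascal-class device allows 56 * 32 = 1792.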

View File

@ -52,22 +52,22 @@
namespace Kokkos { namespace Impl {
template<class DriverType, bool Large>
template<class DriverType, class LaunchBounds, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
template<class DriverType, class LaunchBounds>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
return CudaGetMaxBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,true> {
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int blockSize=1024;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
@ -76,8 +76,9 @@ struct CudaGetMaxBlockSize<DriverType,true> {
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
if(numBlocks>0) return blockSize;
while (blockSize>32 && numBlocks==0) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
@ -87,19 +88,30 @@ struct CudaGetMaxBlockSize<DriverType,true> {
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
int blockSizeUpperBound = blockSize*2;
while (blockSize<blockSizeUpperBound && numBlocks>0) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
return blockSize - 32;
}
};
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
unsigned int blockSize=1024;
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -107,8 +119,9 @@ struct CudaGetMaxBlockSize<DriverType,false> {
blockSize,
sharedmem);
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
if(numBlocks>0) return blockSize;
while (blockSize>32 && numBlocks==0) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
@ -118,24 +131,121 @@ struct CudaGetMaxBlockSize<DriverType,false> {
blockSize,
sharedmem);
}
if(numBlocks>0) return blockSize;
else return blockSize/2;
unsigned int blockSizeUpperBound = blockSize*2;
while (blockSize<blockSizeUpperBound && numBlocks>0) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
return blockSize - 32;
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,MinBlocksPerSM>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks = 0, oldNumBlocks = 0;
unsigned int blockSize=MaxThreadsPerBlock;
unsigned int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>MinBlocksPerSM) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
oldNumBlocks = numBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
blockSize,
sharedmem);
}
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
return -1;
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetMaxBlockSize<DriverType,Kokkos::LaunchBounds<MaxThreadsPerBlock,MinBlocksPerSM>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks = 0, oldNumBlocks = 0;
unsigned int blockSize=MaxThreadsPerBlock;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) return blockSize;
while (blockSize>32 && static_cast<unsigned int>(numBlocks)<MinBlocksPerSM) {
blockSize/=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
unsigned int blockSizeUpperBound = (blockSize*2<MaxThreadsPerBlock?blockSize*2:MaxThreadsPerBlock);
while (blockSize<blockSizeUpperBound && static_cast<unsigned int>(numBlocks)>=MinBlocksPerSM) {
blockSize+=32;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
oldNumBlocks = numBlocks;
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
blockSize,
sharedmem);
}
if(static_cast<unsigned int>(oldNumBlocks)>=MinBlocksPerSM) return blockSize - 32;
return -1;
}
};
template<class DriverType, bool Large>
template<class DriverType, class LaunchBounds, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
template<class DriverType, class LaunchBounds>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
return CudaGetOptBlockSize<DriverType,LaunchBounds,(CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,true> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -165,7 +275,7 @@ struct CudaGetOptBlockSize<DriverType,true> {
};
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds<>,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
@ -194,6 +304,75 @@ struct CudaGetOptBlockSize<DriverType,false> {
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);
while(blockSize < max_threads_per_block ) {
blockSize*=2;
// calculate the occupancy with that block size and check whether it's larger than the largest one found so far
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
}
if(maxOccupancy > 0)
return bestBlockSize;
return -1;
}
};
template<class DriverType, unsigned int MaxThreadsPerBlock, unsigned int MinBlocksPerSM>
struct CudaGetOptBlockSize<DriverType,Kokkos::LaunchBounds< MaxThreadsPerBlock, MinBlocksPerSM >,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
int maxOccupancy=0;
int bestBlockSize=0;
int max_threads_per_block = std::min(MaxThreadsPerBlock,cuda_internal_maximum_warp_count()*CudaTraits::WarpSize);
while(blockSize < max_threads_per_block ) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType,MaxThreadsPerBlock,MinBlocksPerSM>,
blockSize,
sharedmem);
if(numBlocks >= int(MinBlocksPerSM) && blockSize<=int(MaxThreadsPerBlock)) {
if(maxOccupancy < numBlocks*blockSize) {
maxOccupancy = numBlocks*blockSize;
bestBlockSize = blockSize;
}
}
}
if(maxOccupancy > 0)
return bestBlockSize;
return -1;
}
};
}} // namespace Kokkos::Impl
#endif // KOKKOS_ENABLE_CUDA
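
Two things change in this file: the block-size search now starts from the largest candidate (1024, or MaxThreadsPerBlock when launch bounds are given), halves while zero blocks fit, and then steps the size up in warp-sized increments of 32 to find the largest block size that still achieves the required occupancy; and the new specializations are selected by a Kokkos::LaunchBounds argument carried by the execution policy. A sketch of how launch bounds are requested from user code (sizes invented; assumes CUDA lambda support is enabled):

#include <Kokkos_Core.hpp>

void launch_with_bounds(const int league_size) {
  // LaunchBounds<MaxThreadsPerBlock, MinBlocksPerSM> is forwarded to the kernel
  // as __launch_bounds__(512, 2) and selects the specializations added above.
  typedef Kokkos::TeamPolicy<Kokkos::Cuda, Kokkos::LaunchBounds<512, 2> > policy_type;
  Kokkos::parallel_for("bounded", policy_type(league_size, Kokkos::AUTO),
    KOKKOS_LAMBDA(const policy_type::member_type& team) {
      (void) team;   // per-team work goes here
    });
}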

View File

@ -148,6 +148,9 @@ namespace Kokkos {
namespace Impl {
namespace {
static int lock_array_copied = 0;
inline int eliminate_warning_for_lock_array() {
return lock_array_copied;
}
}
}
}

View File

@ -60,6 +60,7 @@
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Kokkos_Vectorization.hpp>
#include <Cuda/Kokkos_Cuda_Version_9_8_Compatibility.hpp>
#if defined(KOKKOS_ENABLE_PROFILING)
#include <impl/Kokkos_Profiling_Interface.hpp>
@ -114,6 +115,7 @@ public:
//----------------------------------------
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template< class FunctorType >
inline static
int team_size_max( const FunctorType & functor )
@ -131,7 +133,35 @@ public:
return n ;
}
#endif
template<class FunctorType>
int team_size_max( const FunctorType& f, const ParallelForTag& ) const {
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) );
return block_size/vector_length();
}
template<class FunctorType>
int team_size_max( const FunctorType& f, const ParallelReduceTag& ) const {
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,TeamPolicyInternal,FunctorType> functor_analysis_type;
typedef typename Impl::ParallelReduceReturnValue<void,typename functor_analysis_type::value_type,FunctorType>::reducer_type reducer_type;
typedef Impl::ParallelReduce< FunctorType , TeamPolicy<Properties...>, reducer_type > closure_type;
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
int block_size = Kokkos::Impl::cuda_get_max_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) +
((functor_value_traits::StaticValueSize!=0)?0:functor_value_traits::value_size( f )));
// Currently we require Power-of-2 team size for reductions.
int p2 = 1;
while(p2<=block_size) p2*=2;
p2/=2;
return p2/vector_length();
}
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template< class FunctorType >
static int team_size_recommended( const FunctorType & functor )
{ return team_size_max( functor ); }
@ -143,11 +173,41 @@ public:
if(max<1) max = 1;
return max;
}
#endif
template<class FunctorType>
int team_size_recommended( const FunctorType& f, const ParallelForTag& ) const {
typedef Impl::ParallelFor< FunctorType , TeamPolicy<Properties...> > closure_type;
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double));
return block_size/vector_length();
}
template<class FunctorType>
int team_size_recommended( const FunctorType& f, const ParallelReduceTag& ) const {
typedef Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,TeamPolicyInternal,FunctorType> functor_analysis_type;
typedef typename Impl::ParallelReduceReturnValue<void,typename functor_analysis_type::value_type,FunctorType>::reducer_type reducer_type;
typedef Impl::ParallelReduce< FunctorType , TeamPolicy<Properties...>, reducer_type > closure_type;
typedef Impl::FunctorValueTraits< FunctorType , typename traits::work_tag > functor_value_traits;
int block_size = Kokkos::Impl::cuda_get_opt_block_size< closure_type, typename traits::launch_bounds >( f ,(size_t) vector_length(),
(size_t) team_scratch_size(0) + 2*sizeof(double), (size_t) thread_scratch_size(0) + sizeof(double) +
((functor_value_traits::StaticValueSize!=0)?0:functor_value_traits::value_size( f )));
return block_size/vector_length();
}
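// Usage sketch for the new tag-based queries (not part of this diff; functor
// and policy names are assumed):
//   Kokkos::TeamPolicy<Kokkos::Cuda> policy(league_size, Kokkos::AUTO);
//   int tmax = policy.team_size_max(functor, Kokkos::ParallelForTag());
//   int trec = policy.team_size_recommended(functor, Kokkos::ParallelReduceTag());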
inline static
int vector_length_max()
{ return Impl::CudaTraits::WarpSize; }
inline static
int scratch_size_max(int level)
{ return (level==0?
1024*40: // 48kB is the max for CUDA, but we need some for team_member.reduce etc.
20*1024*1024); // arbitrarily setting this to 20MB, for a Volta V100 that would give us about 3.2GB for 2 teams per SM
}
//----------------------------------------
inline int vector_length() const { return m_vector_length ; }
@ -419,7 +479,7 @@ public:
void execute() const
{
const typename Policy::index_type nwork = m_policy.end() - m_policy.begin();
const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( m_functor , 1, 0 , 0 );
const int block_size = Kokkos::Impl::cuda_get_opt_block_size< ParallelFor, LaunchBounds>( m_functor , 1, 0 , 0 );
const dim3 block( 1 , block_size , 1);
const dim3 grid( std::min( typename Policy::index_type(( nwork + block.y - 1 ) / block.y) , typename Policy::index_type(cuda_internal_maximum_grid_count()) ) , 1 , 1);
@ -654,7 +714,7 @@ public:
: m_functor( arg_functor )
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
Kokkos::Impl::cuda_get_opt_block_size< ParallelFor, LaunchBounds >( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
, m_shmem_begin( sizeof(double) * ( m_team_size + 2 ) )
, m_shmem_size( arg_policy.scratch_size(0,m_team_size) + FunctorTeamShmemSize< FunctorType >::value( m_functor , m_team_size ) )
@ -670,7 +730,7 @@ public:
}
if ( int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelFor, LaunchBounds >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > requested too large team size."));
}
@ -725,12 +785,13 @@ public:
const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const bool m_result_ptr_device_accessible ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit)
enum { UseShflReduction = false };//((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
@ -752,12 +813,12 @@ public:
__device__ inline
void operator() () const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
/* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
{*/
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@ -786,7 +847,8 @@ public:
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
( m_unified_space ? m_unified_space : m_scratch_space );
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
@ -798,10 +860,9 @@ public:
}
}
__device__ inline
/* __device__ inline
void run(const DummyShflReductionType&) const
{
value_type value;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
// Number of blocks is bounded so that the reduction can be limited to two passes.
@ -832,7 +893,7 @@ public:
*result = value;
}
}
}
}*/
// Determine block size constrained by shared memory:
static inline
@ -863,16 +924,18 @@ public:
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if(!m_result_ptr_device_accessible) {
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
}
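
The m_result_ptr_device_accessible branch above lets the final reduction block write straight into the result view when that view's memory space is accessible from CudaSpace, skipping the fence and the DeepCopy back to the host. A sketch of what that enables (names invented; assumes CUDA lambda support):

#include <Kokkos_Core.hpp>

double device_resident_sum(const int n) {
  Kokkos::View<double, Kokkos::CudaSpace> dev_sum("dev_sum");
  Kokkos::parallel_reduce("sum", Kokkos::RangePolicy<Kokkos::Cuda>(0, n),
    KOKKOS_LAMBDA(const int i, double& update) { update += double(i); },
    dev_sum);   // result stays in device memory; no host copy is forced
  // Copy back explicitly only when the host actually needs the value.
  auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), dev_sum);
  return h();
}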
@ -883,17 +946,18 @@ public:
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, const ViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
Kokkos::is_view< ViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -906,6 +970,7 @@ public:
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -953,6 +1018,7 @@ public:
const Policy m_policy ; // used for workrange and nwork
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const bool m_result_ptr_device_accessible ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
@ -960,7 +1026,7 @@ public:
typedef typename Kokkos::Impl::Reduce::DeviceIterateTile<Policy::rank, Policy, FunctorType, typename Policy::work_tag, reference_type> DeviceIteratePattern;
// Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) };
enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && (ValueTraits::StaticValueSize!=0)) };
// Some crutch to do function overloading
private:
typedef double DummyShflReductionType;
@ -978,12 +1044,12 @@ public:
inline
__device__
void operator() (void) const {
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
/* run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
}
__device__ inline
void run(const DummySHMEMReductionType& ) const
{
{*/
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
@ -1007,7 +1073,8 @@ public:
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
( m_unified_space ? m_unified_space : m_scratch_space );
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
@ -1019,7 +1086,7 @@ public:
}
}
__device__ inline
/* __device__ inline
void run(const DummyShflReductionType&) const
{
@ -1051,7 +1118,7 @@ public:
}
}
}
*/
// Determine block size constrained by shared memory:
static inline
unsigned local_block_size( const FunctorType & f )
@ -1089,16 +1156,18 @@ public:
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem ); // copy to device and execute
Cuda::fence();
if(!m_result_ptr_device_accessible) {
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr , m_scratch_space , size );
}
}
}
}
@ -1109,17 +1178,18 @@ public:
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, const ViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
Kokkos::is_view< ViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1132,6 +1202,7 @@ public:
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1174,7 +1245,7 @@ public:
typedef FunctorType functor_type ;
typedef Cuda::size_type size_type ;
enum { UseShflReduction = (true && ValueTraits::StaticValueSize) };
enum { UseShflReduction = (true && (ValueTraits::StaticValueSize!=0)) };
private:
typedef double DummyShflReductionType;
@ -1191,6 +1262,7 @@ private:
const FunctorType m_functor ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ;
const bool m_result_ptr_device_accessible ;
size_type * m_scratch_space ;
size_type * m_scratch_flags ;
size_type * m_unified_space ;
@ -1279,7 +1351,8 @@ public:
// This is the final block with the final result at the final threads' location
size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
size_type * const global = m_result_ptr_device_accessible? reinterpret_cast<size_type*>(m_result_ptr) :
( m_unified_space ? m_unified_space : m_scratch_space );
if ( threadIdx.y == 0 ) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared );
@ -1312,12 +1385,18 @@ public:
, value );
}
pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
pointer_type const result = m_result_ptr_device_accessible? m_result_ptr :
(pointer_type) ( m_unified_space ? m_unified_space : m_scratch_space );
value_type init;
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init);
if(Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) {
if(
Impl::cuda_inter_block_reduction<FunctorType,ValueJoin,WorkTag>
(value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)
//This breaks a test
// Kokkos::Impl::CudaReductionsFunctor<FunctorType,WorkTag,false,true>::scalar_inter_block_reduction(ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x ,
// kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags)
) {
const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
if(id==0) {
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value );
@ -1331,7 +1410,7 @@ public:
{
const int nwork = m_league_size * m_team_size ;
if ( nwork ) {
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024) )
const int block_count = UseShflReduction? std::min( m_league_size , size_type(1024*32) )
:std::min( m_league_size , m_team_size );
m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) * block_count );
@ -1344,16 +1423,18 @@ public:
CudaParallelLaunch< ParallelReduce, LaunchBounds >( *this, grid, block, shmem_size_total ); // copy to device and execute
Cuda::fence();
if(!m_result_ptr_device_accessible) {
Cuda::fence();
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
if ( m_result_ptr ) {
if ( m_unified_space ) {
const int count = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( int i = 0 ; i < count ; ++i ) { m_result_ptr[i] = pointer_type(m_unified_space)[i] ; }
}
else {
const int size = ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) );
DeepCopy<HostSpace,CudaSpace>( m_result_ptr, m_scratch_space, size );
}
}
}
}
@ -1364,16 +1445,17 @@ public:
}
}
template< class HostViewType >
template< class ViewType >
ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy
, const HostViewType & arg_result
, const ViewType & arg_result
, typename std::enable_if<
Kokkos::is_view< HostViewType >::value
Kokkos::is_view< ViewType >::value
,void*>::type = NULL)
: m_functor( arg_functor )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ViewType::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1383,17 +1465,17 @@ public:
, m_scratch_ptr{NULL,NULL}
, m_scratch_size{
arg_policy.scratch_size(0,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
), arg_policy.scratch_size(1,( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
)}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
@ -1430,9 +1512,7 @@ public:
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
}
if ( unsigned(m_team_size) >
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
if ( int(m_team_size) > arg_policy.team_size_max(m_functor,ParallelReduceTag()) ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
@ -1444,6 +1524,7 @@ public:
: m_functor( arg_functor )
, m_reducer( reducer )
, m_result_ptr( reducer.view().data() )
, m_result_ptr_device_accessible(MemorySpaceAccess< Kokkos::CudaSpace , typename ReducerType::result_view_type::memory_space>::accessible )
, m_scratch_space( 0 )
, m_scratch_flags( 0 )
, m_unified_space( 0 )
@ -1453,7 +1534,7 @@ public:
, m_scratch_ptr{NULL,NULL}
, m_league_size( arg_policy.league_size() )
, m_team_size( 0 <= arg_policy.team_size() ? arg_policy.team_size() :
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >( arg_functor , arg_policy.vector_length(),
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >( arg_functor , arg_policy.vector_length(),
arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) /
arg_policy.vector_length() )
, m_vector_size( arg_policy.vector_length() )
@ -1486,10 +1567,7 @@ public:
CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
}
if ( int(m_team_size) >
int(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
if ( int(m_team_size) > arg_policy.team_size_max(m_functor,ParallelReduceTag()) ) {
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
}
@ -1753,7 +1831,7 @@ public:
// Occupancy calculator assumes whole block.
m_team_size =
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce >
Kokkos::Impl::cuda_get_opt_block_size< ParallelReduce, LaunchBounds >
( arg_functor
, arg_policy.vector_length()
, arg_policy.team_scratch_size(0)
@ -1970,7 +2048,9 @@ private:
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned MASK=KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
__syncthreads(); // Don't overwrite previous iteration values until they are used
@ -1981,7 +2061,11 @@ private:
for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
@ -2189,6 +2273,9 @@ private:
const WorkRange range( m_policy , blockIdx.x , gridDim.x );
for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned MASK=KOKKOS_IMPL_CUDA_ACTIVEMASK;
#endif
const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
@ -2201,6 +2288,11 @@ private:
shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(MASK);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
// Call functor to accumulate inclusive scan value for this work item
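A minimal usage sketch of what the m_result_ptr_device_accessible path added above enables: passing a device-resident result View to parallel_reduce, so ParallelReduce can write the final value directly on the device instead of copying it back to the host. The view names below are hypothetical.

#include <Kokkos_Core.hpp>

void device_result_sketch() {
  Kokkos::View<double*> x("x", 1000);      // data in the default (Cuda) memory space
  Kokkos::View<double>  d_sum("d_sum");    // rank-0 result View, also device resident
  Kokkos::deep_copy(x, 1.0);
  Kokkos::parallel_reduce("sum", x.extent(0),
      KOKKOS_LAMBDA(const int i, double& lsum) { lsum += x(i); },
      d_sum);                              // result lands in CudaSpace; no host copy-back
  Kokkos::fence();                         // make the device result ready before use
}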

View File

@ -194,8 +194,9 @@ void cuda_shfl_up( T & out , T const & in , int delta ,
*/
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_warp_reduction( ValueType& result,
__device__ inline
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
cuda_intra_warp_reduction( ValueType& result,
const JoinOp& join,
const uint32_t max_active_thread = blockDim.y) {
@ -214,8 +215,9 @@ inline void cuda_intra_warp_reduction( ValueType& result,
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_inter_warp_reduction( ValueType& value,
__device__ inline
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
cuda_inter_warp_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
@ -247,8 +249,9 @@ inline void cuda_inter_warp_reduction( ValueType& value,
}
template< class ValueType , class JoinOp>
__device__
inline void cuda_intra_block_reduction( ValueType& value,
__device__ inline
typename std::enable_if< !Kokkos::is_reducer<ValueType>::value >::type
cuda_intra_block_reduction( ValueType& value,
const JoinOp& join,
const int max_active_thread = blockDim.y) {
cuda_intra_warp_reduction(value,join,max_active_thread);
@ -314,31 +317,52 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT
if( id + 1 < int(gridDim.x) )
join(value, tmp);
}
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
}
}
//The last block has in its thread=0 the global reduction value through "value"
@ -478,31 +502,52 @@ cuda_inter_block_reduction( const ReducerType& reducer,
if( id + 1 < int(gridDim.x) )
reducer.join(value, tmp);
}
int active = KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned int mask = KOKKOS_IMPL_CUDA_ACTIVEMASK;
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
int active = KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 2) {
value_type tmp = Kokkos::shfl_down(value, 2,32);
if( id + 2 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 4) {
value_type tmp = Kokkos::shfl_down(value, 4,32);
if( id + 4 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 8) {
value_type tmp = Kokkos::shfl_down(value, 8,32);
if( id + 8 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
if (int(blockDim.x*blockDim.y) > 16) {
value_type tmp = Kokkos::shfl_down(value, 16,32);
if( id + 16 < int(gridDim.x) )
reducer.join(value, tmp);
}
active += KOKKOS_IMPL_CUDA_BALLOT(1);
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(mask,1);
#else
active += KOKKOS_IMPL_CUDA_BALLOT_MASK(1);
#endif
}
}
@ -513,6 +558,213 @@ cuda_inter_block_reduction( const ReducerType& reducer,
#endif
}
template<class FunctorType, class ArgTag, bool DoScan, bool UseShfl>
struct CudaReductionsFunctor;
template<class FunctorType, class ArgTag>
struct CudaReductionsFunctor<FunctorType, ArgTag, false, true> {
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type Scalar;
__device__
static inline void scalar_intra_warp_reduction(
const FunctorType& functor,
Scalar value, // Contribution
const bool skip_vector, // Skip threads if Kokkos vector lanes are not part of the reduction
const int width, // How much of the warp participates
Scalar& result)
{
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
Scalar tmp;
cuda_shfl_down(tmp,value,delta,width,mask);
ValueJoin::join( functor , &value, &tmp);
}
cuda_shfl(result,value,0,width,mask);
}
__device__
static inline void scalar_intra_block_reduction(
const FunctorType& functor,
Scalar value,
const bool skip,
Scalar* my_global_team_buffer_element,
const int shared_elements,
Scalar* shared_team_buffer_element) {
const int warp_id = (threadIdx.y*blockDim.x)/32;
Scalar* const my_shared_team_buffer_element =
shared_team_buffer_element + warp_id%shared_elements;
// Warp Level Reduction, ignoring Kokkos vector entries
scalar_intra_warp_reduction(functor,value,skip,32,value);
if(warp_id<shared_elements) {
*my_shared_team_buffer_element=value;
}
// Wait for every warp to be done before using one warp to do final cross warp reduction
__syncthreads();
const int num_warps = blockDim.x*blockDim.y/32;
for(int w = shared_elements; w<num_warps; w+=shared_elements) {
if(warp_id>=w && warp_id<w+shared_elements) {
if((threadIdx.y*blockDim.x + threadIdx.x)%32==0)
ValueJoin::join( functor , my_shared_team_buffer_element, &value);
}
__syncthreads();
}
if( warp_id == 0) {
ValueInit::init( functor , &value );
for(unsigned int i=threadIdx.y*blockDim.x+threadIdx.x; i<blockDim.y*blockDim.x/32; i+=32)
ValueJoin::join( functor , &value,&shared_team_buffer_element[i]);
scalar_intra_warp_reduction(functor,value,false,32,*my_global_team_buffer_element);
}
}
__device__
static inline bool scalar_inter_block_reduction(
const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags ) {
Scalar* const global_team_buffer_element = ((Scalar*) global_data);
Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x;
Scalar* shared_team_buffer_elements = ((Scalar*) shared_data);
Scalar value = shared_team_buffer_elements[threadIdx.y];
int shared_elements=blockDim.x*blockDim.y/32;
int global_elements=block_count;
__syncthreads();
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
__syncthreads();
unsigned int num_teams_done = 0;
if(threadIdx.x + threadIdx.y == 0) {
__threadfence();
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
}
bool is_last_block = false;
if(__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block=true;
*global_flags = 0;
ValueInit::init( functor, &value);
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<global_elements; i+=blockDim.x*blockDim.y) {
ValueJoin::join( functor , &value,&global_team_buffer_element[i]);
}
scalar_intra_block_reduction(functor,value,false,shared_team_buffer_elements+(blockDim.y-1),shared_elements,shared_team_buffer_elements);
}
return is_last_block;
}
};
template<class FunctorType, class ArgTag>
struct CudaReductionsFunctor<FunctorType, ArgTag, false, false> {
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
typedef FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
typedef FunctorValueInit< FunctorType , ArgTag > ValueInit ;
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::value_type Scalar;
__device__
static inline void scalar_intra_warp_reduction(
const FunctorType& functor,
Scalar* value, // Contribution
const bool skip_vector, // Skip threads if Kokkos vector lanes are not part of the reduction
const int width) // How much of the warp participates
{
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
unsigned mask = width==32?0xffffffff:((1<<width)-1)<<((threadIdx.y*blockDim.x+threadIdx.x)%(32/width))*width;
#endif
const int lane_id = (threadIdx.y*blockDim.x+threadIdx.x)%32;
for(int delta=skip_vector?blockDim.x:1; delta<width; delta*=2) {
if(lane_id + delta<32) {
ValueJoin::join( functor , value, value+delta);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(mask);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
}
*value=*(value-lane_id);
}
__device__
static inline void scalar_intra_block_reduction(
const FunctorType& functor,
Scalar value,
const bool skip,
Scalar* result,
const int shared_elements,
Scalar* shared_team_buffer_element) {
const int warp_id = (threadIdx.y*blockDim.x)/32;
Scalar* const my_shared_team_buffer_element =
shared_team_buffer_element + threadIdx.y*blockDim.x+threadIdx.x;
*my_shared_team_buffer_element = value;
// Warp Level Reduction, ignoring Kokkos vector entries
scalar_intra_warp_reduction(functor,my_shared_team_buffer_element,skip,32);
// Wait for every warp to be done before using one warp to do final cross warp reduction
__syncthreads();
if( warp_id == 0) {
const unsigned int delta = (threadIdx.y*blockDim.x+threadIdx.x)*32;
if(delta<blockDim.x*blockDim.y)
*my_shared_team_buffer_element = shared_team_buffer_element[delta];
KOKKOS_IMPL_CUDA_SYNCWARP;
scalar_intra_warp_reduction(functor,my_shared_team_buffer_element,false,blockDim.x*blockDim.y/32);
if(threadIdx.x + threadIdx.y == 0) *result = *shared_team_buffer_element;
}
}
__device__
static inline bool scalar_inter_block_reduction(
const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags ) {
Scalar* const global_team_buffer_element = ((Scalar*) global_data);
Scalar* const my_global_team_buffer_element = global_team_buffer_element + blockIdx.x;
Scalar* shared_team_buffer_elements = ((Scalar*) shared_data);
Scalar value = shared_team_buffer_elements[threadIdx.y];
int shared_elements=blockDim.x*blockDim.y/32;
int global_elements=block_count;
__syncthreads();
scalar_intra_block_reduction(functor,value,true,my_global_team_buffer_element,shared_elements,shared_team_buffer_elements);
__syncthreads();
unsigned int num_teams_done = 0;
if(threadIdx.x + threadIdx.y == 0) {
__threadfence();
num_teams_done = Kokkos::atomic_fetch_add(global_flags,1)+1;
}
bool is_last_block = false;
if(__syncthreads_or(num_teams_done == gridDim.x)) {
is_last_block=true;
*global_flags = 0;
ValueInit::init( functor, &value);
for(int i=threadIdx.y*blockDim.x+threadIdx.x; i<global_elements; i+=blockDim.x*blockDim.y) {
ValueJoin::join( functor , &value,&global_team_buffer_element[i]);
}
scalar_intra_block_reduction(functor,value,false,shared_team_buffer_elements+(blockDim.y-1),shared_elements,shared_team_buffer_elements);
}
return is_last_block;
}
};
//----------------------------------------------------------------------------
// See section B.17 of Cuda C Programming Guide Version 3.2
// for discussion of
@ -639,14 +891,15 @@ void cuda_intra_block_reduce_scan( const FunctorType & functor ,
*
* Global reduce result is in the last threads' 'shared_data' location.
*/
template< bool DoScan , class FunctorType , class ArgTag >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
bool cuda_single_inter_block_reduce_scan2( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef Cuda::size_type size_type ;
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
@ -655,7 +908,6 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
typedef FunctorValueOps< FunctorType , ArgTag > ValueOps ;
typedef typename ValueTraits::pointer_type pointer_type ;
//typedef typename ValueTraits::reference_type reference_type ;
// '__ffs' = position of the least significant bit set to 1.
// 'blockDim.y' is guaranteed to be a power of two so this
@ -678,12 +930,7 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
//#if (__CUDA_ARCH__ < 500)
for ( int i = int(threadIdx.y) ; i < int(word_count.value) ; i += int(blockDim.y) ) { global[i] = shared[i] ; }
//#else
// for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
//#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
@ -725,6 +972,22 @@ bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
return is_last_block ;
}
template< bool DoScan , class FunctorType , class ArgTag >
__device__
bool cuda_single_inter_block_reduce_scan( const FunctorType & functor ,
const Cuda::size_type block_id ,
const Cuda::size_type block_count ,
Cuda::size_type * const shared_data ,
Cuda::size_type * const global_data ,
Cuda::size_type * const global_flags )
{
typedef FunctorValueTraits< FunctorType , ArgTag > ValueTraits ;
if(!DoScan && ValueTraits::StaticValueSize)
return Kokkos::Impl::CudaReductionsFunctor<FunctorType,ArgTag,false,(ValueTraits::StaticValueSize>16)>::scalar_inter_block_reduction(functor,block_id,block_count,shared_data,global_data,global_flags);
else
return cuda_single_inter_block_reduce_scan2<DoScan, FunctorType, ArgTag>(functor, block_id, block_count, shared_data, global_data, global_flags);
}
// Size in bytes required for inter block reduce or scan
template< bool DoScan , class FunctorType , class ArgTag >
inline
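A standalone CUDA sketch (not the Kokkos code itself) of the last-block idiom that scalar_inter_block_reduction above relies on: every block publishes its partial, atomically increments a global counter, and the block that sees the counter reach gridDim.x knows all partials are visible and performs the final combine. Names and the trivial partial value are placeholders.

__device__ unsigned int finished_blocks = 0;   // compile as CUDA (.cu)

__global__ void finalize_sketch(double* partials, double* result) {
  if (threadIdx.x == 0) partials[blockIdx.x] = 1.0;  // stand-in for a real per-block partial
  __threadfence();                                   // make the partial visible to other blocks
  __shared__ bool is_last;
  if (threadIdx.x == 0) {
    const unsigned int done = atomicAdd(&finished_blocks, 1u) + 1u;
    is_last = (done == gridDim.x);
  }
  __syncthreads();
  if (is_last && threadIdx.x == 0) {
    finished_blocks = 0;                             // reset for the next launch
    double sum = 0.0;
    for (unsigned int b = 0; b < gridDim.x; ++b) sum += partials[b];
    *result = sum;                                   // equals gridDim.x in this toy case
  }
}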

View File

@ -160,7 +160,7 @@ public:
template<class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast( ValueType & val, const int& thread_id) const
void team_broadcast( ValueType & val, const int& thread_id ) const
{
#ifdef __CUDA_ARCH__
if ( 1 == blockDim.z ) { // team == block
@ -178,6 +178,29 @@ public:
}
#endif
}
template<class Closure, class ValueType>
KOKKOS_INLINE_FUNCTION
void team_broadcast( Closure const & f, ValueType & val, const int& thread_id ) const
{
#ifdef __CUDA_ARCH__
f( val );
if ( 1 == blockDim.z ) { // team == block
__syncthreads();
// Wait for shared data write until all threads arrive here
if ( threadIdx.x == 0u && threadIdx.y == (uint32_t)thread_id ) {
*((ValueType*) m_team_reduce) = val ;
}
__syncthreads(); // Wait for shared data read until root thread writes
val = *((ValueType*) m_team_reduce);
}
else { // team <= warp
ValueType tmp( val ); // input might not be a register variable
cuda_shfl( val, tmp, blockDim.x * thread_id, blockDim.x * blockDim.y );
}
#endif
}
//--------------------------------------------------------------------------
/**\brief Reduction across a team
@ -200,92 +223,7 @@ public:
team_reduce( ReducerType const & reducer ) const noexcept
{
#ifdef __CUDA_ARCH__
typedef typename ReducerType::value_type value_type ;
value_type tmp( reducer.reference() );
// reduce within the warp using shuffle
const int wx =
( threadIdx.x + blockDim.x * threadIdx.y ) & CudaTraits::WarpIndexMask ;
for ( int i = CudaTraits::WarpSize ; (int)blockDim.x <= ( i >>= 1 ) ; ) {
cuda_shfl_down( reducer.reference() , tmp , i , CudaTraits::WarpSize );
// Root of each vector lane reduces:
if ( 0 == threadIdx.x && wx < i ) {
reducer.join( tmp , reducer.reference() );
}
}
if ( 1 < blockDim.z ) { // team <= warp
// broadcast result from root vector lane of root thread
cuda_shfl( reducer.reference() , tmp
, blockDim.x * threadIdx.y , CudaTraits::WarpSize );
}
else { // team == block
// Reduce across warps using shared memory
// Broadcast result within block
// Number of warps, blockDim.y may not be power of two:
const int nw = ( blockDim.x * blockDim.y + CudaTraits::WarpIndexMask ) >> CudaTraits::WarpIndexShift ;
// Warp index:
const int wy = ( blockDim.x * threadIdx.y ) >> CudaTraits::WarpIndexShift ;
// Number of shared memory entries for the reduction:
int nsh = m_team_reduce_size / sizeof(value_type);
// Using at most one entry per warp:
if ( nw < nsh ) nsh = nw ;
__syncthreads(); // Wait before shared data write
if ( 0 == wx && wy < nsh ) {
((value_type*) m_team_reduce)[wy] = tmp ;
}
// When more warps than shared entries:
for ( int i = nsh ; i < nw ; i += nsh ) {
__syncthreads();
if ( 0 == wx && i <= wy ) {
const int k = wy - i ;
if ( k < nsh ) {
reducer.join( *((value_type*) m_team_reduce + k) , tmp );
}
}
}
__syncthreads();
// One warp performs the inter-warp reduction:
if ( 0 == wy ) {
// Start at power of two covering nsh
for ( int i = 1 << ( 32 - __clz(nsh-1) ) ; ( i >>= 1 ) ; ) {
const int k = wx + i ;
if ( wx < i && k < nsh ) {
reducer.join( ((value_type*)m_team_reduce)[wx]
, ((value_type*)m_team_reduce)[k] );
__threadfence_block();
}
}
}
__syncthreads(); // Wait for reduction
// Broadcast result to all threads
reducer.reference() = *((value_type*)m_team_reduce);
}
cuda_intra_block_reduction(reducer,blockDim.y);
#endif /* #ifdef __CUDA_ARCH__ */
}
@ -801,7 +739,11 @@ void parallel_for
; i += blockDim.x ) {
closure(i);
}
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
#endif
}
@ -970,7 +912,11 @@ KOKKOS_INLINE_FUNCTION
void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
#endif
}
@ -979,7 +925,11 @@ KOKKOS_INLINE_FUNCTION
void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
#ifdef __CUDA_ARCH__
if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
#ifdef KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
KOKKOS_IMPL_CUDA_SYNCWARP_MASK(blockDim.x==32?0xffffffff:((1<<blockDim.x)-1)<<(threadIdx.y%(32/blockDim.x))*blockDim.x);
#else
KOKKOS_IMPL_CUDA_SYNCWARP_MASK;
#endif
#endif
}
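A minimal sketch (hypothetical league size and label) of the closure overload of team_broadcast added above: every thread applies the closure to its own copy of the value, and the copy held by the named thread is then broadcast to the whole team.

#include <Kokkos_Core.hpp>

void broadcast_sketch() {
  const int nleague = 8;
  Kokkos::parallel_for("bcast", Kokkos::TeamPolicy<>(nleague, Kokkos::AUTO),
      KOKKOS_LAMBDA(const Kokkos::TeamPolicy<>::member_type& team) {
        double val = team.team_rank() + 1.0;
        team.team_broadcast([](double& v) { v *= 2.0; }, val, 0);
        // val == 2.0 on every member: thread 0's doubled value was broadcast
      });
}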

View File

@ -2,9 +2,11 @@
#if defined( __CUDA_ARCH__ )
#if ( CUDA_VERSION < 9000 )
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
#define KOKKOS_IMPL_CUDA_SYNCWARP __threadfence_block()
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(x) __threadfence_block()
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK __threadfence_block()
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) __ballot(x)
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z)
@ -12,9 +14,11 @@
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down(x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down(x,y,z)
#else
#define KOKKOS_IMPL_CUDA_ACTIVEMASK __activemask()
#define KOKKOS_IMPL_CUDA_SYNCWARP __syncwarp(0xffffffff)
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m)
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK(m) __syncwarp(m);
#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(__activemask(),x)
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(m,x) __ballot_sync(m,x)
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) __shfl_sync(m,x,y,z)
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up_sync(0xffffffff,x,y,z)
@ -23,11 +27,16 @@
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) __shfl_down_sync(m,x,y,z)
#endif
#else
#define KOKKOS_IMPL_CUDA_ACTIVEMASK 0
#define KOKKOS_IMPL_CUDA_SYNCWARP
#define KOKKOS_IMPL_CUDA_SYNCWARP_MASK
#define KOKKOS_IMPL_CUDA_BALLOT(x) 0
#define KOKKOS_IMPL_CUDA_BALLOT_MASK(x) 0
#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_MASK(m,x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) 0
#define KOKKOS_IMPL_CUDA_SHFL_DOWN_MASK(m,x,y,z) 0
#endif
#if ( CUDA_VERSION >= 9000 ) && (!defined(KOKKOS_COMPILER_CLANG))
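A plain CUDA sketch of why the *_MASK variants defined above take a mask on CUDA 9 and newer: with independent thread scheduling, warp-synchronous ballots and syncs must name exactly the participating lanes rather than assuming the full warp.

#if defined(__CUDACC__) && (CUDA_VERSION >= 9000)
__device__ unsigned masked_vote_sketch(int pred) {
  const unsigned mask  = __activemask();            // lanes currently converged here
  const unsigned votes = __ballot_sync(mask, pred); // ballot restricted to those lanes
  __syncwarp(mask);                                 // re-converge exactly the masked lanes
  return votes;
}
#endif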

View File

@ -279,6 +279,8 @@ public:
KOKKOS_INLINE_FUNCTION
static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
{
if(arg_data_ptr == NULL) return handle_type();
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
// Assignment of texture = non-texture requires creation of a texture object
// which can only occur on the host. In addition, 'get_record' is only valid
@ -292,8 +294,7 @@ public:
#if ! defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC )
if ( 0 == r ) {
//Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
return handle_type();
Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory");
}
#endif

View File

@ -46,6 +46,8 @@
#include <initializer_list>
#include <Kokkos_Layout.hpp>
#include<impl/KokkosExp_Host_IterateTile.hpp>
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
@ -63,13 +65,15 @@
namespace Kokkos {
// ------------------------------------------------------------------ //
// Moved to Kokkos_Layout.hpp for more general accessibility
/*
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
};
*/
template <typename ExecSpace>
struct default_outer_direction

View File

@ -45,11 +45,13 @@
#define KOKKOS_ARRAY_HPP
#include <Kokkos_Macros.hpp>
#include <impl/Kokkos_Error.hpp>
#include <type_traits>
#include <algorithm>
#include <limits>
#include <cstddef>
#include <string>
namespace Kokkos {
@ -132,6 +134,7 @@ public:
KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return N ; }
KOKKOS_INLINE_FUNCTION static constexpr bool empty(){ return false ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return N ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
@ -160,7 +163,7 @@ public:
return & m_internal_implementation_private_member_data[0];
}
#ifdef KOKKOS_ROCM_CLANG_WORKAROUND
#ifdef KOKKOS_IMPL_ROCM_CLANG_WORKAROUND
// Do not default unless move and move-assignment are also defined
KOKKOS_INLINE_FUNCTION
~Array() = default ;
@ -197,6 +200,7 @@ public:
KOKKOS_INLINE_FUNCTION static constexpr size_type size() { return 0 ; }
KOKKOS_INLINE_FUNCTION static constexpr bool empty() { return true ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return 0 ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
@ -261,6 +265,7 @@ public:
KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size ; }
KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
@ -336,6 +341,7 @@ public:
KOKKOS_INLINE_FUNCTION constexpr size_type size() const { return m_size ; }
KOKKOS_INLINE_FUNCTION constexpr bool empty() const { return 0 != m_size ; }
KOKKOS_INLINE_FUNCTION constexpr size_type max_size() const { return m_size ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
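A small sketch of the fixed-extent Kokkos::Array interface touched above; size() is a static constexpr, and max_size(), added here across the specializations, equals the compile-time extent N for the fixed-size case shown below.

#include <Kokkos_Core.hpp>

void array_sketch() {
  Kokkos::Array<double, 3> a = {{1.0, 2.0, 3.0}};
  static_assert(Kokkos::Array<double, 3>::size() == 3, "compile-time extent");
  // a.max_size() == 3 as well for the fixed-size specialization
  (void)a;
}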

View File

@ -105,7 +105,10 @@ namespace Kokkos {
template< typename T > struct is_ ## CONCEPT { \
private: \
template< typename , typename = std::true_type > struct have : std::false_type {}; \
template< typename U > struct have<U,typename std::is_same<U,typename U:: CONCEPT >::type> : std::true_type {}; \
template< typename U > struct have<U,typename std::is_same< \
typename std::remove_cv<U>::type, \
typename std::remove_cv<typename U:: CONCEPT>::type \
>::type> : std::true_type {}; \
public: \
enum { value = is_ ## CONCEPT::template have<T>::value }; \
};
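A standalone sketch (not the Kokkos macro itself) of the detection idiom shown above, including the remove_cv fix: 'value' becomes true when T carries a nested typedef naming T itself, now also for cv-qualified T.

#include <type_traits>

template <typename T>
struct is_my_concept {
private:
  template <typename, typename = std::true_type>
  struct have : std::false_type {};
  template <typename U>
  struct have<U, typename std::is_same<
                     typename std::remove_cv<U>::type,
                     typename std::remove_cv<typename U::my_concept>::type>::type>
      : std::true_type {};
public:
  enum { value = is_my_concept::template have<T>::value };
};

struct Tagged { using my_concept = Tagged; };
static_assert( is_my_concept<Tagged>::value, "detected");
static_assert( is_my_concept<const Tagged>::value, "cv-qualified types are now detected too");
static_assert(!is_my_concept<int>::value, "no nested typedef, not detected");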

View File

@ -453,8 +453,9 @@ template<class ViewTypeA,class ViewTypeB, class Layout, class ExecSpace,typename
struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,2,iType,KOKKOS_IMPL_COMPILING_LIBRARY> {
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<2,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<2,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -475,7 +476,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,3,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<3,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<3,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -496,7 +499,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,4,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<4,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<4,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -519,7 +524,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,5,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<5,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<5,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -542,7 +549,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,6,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -566,7 +575,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,7,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -590,7 +601,9 @@ struct ViewCopy<ViewTypeA,ViewTypeB,Layout,ExecSpace,8,iType,KOKKOS_IMPL_COMPILI
ViewTypeA a;
ViewTypeB b;
typedef Kokkos::Rank<6,ViewFillLayoutSelector<Layout>::iterate,ViewFillLayoutSelector<Layout>::iterate> iterate_type;
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::outer_iteration_pattern;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::layout_iterate_type_selector<Layout>::inner_iteration_pattern;
typedef Kokkos::Rank<6,outer_iteration_pattern,inner_iteration_pattern> iterate_type;
typedef Kokkos::MDRangePolicy<ExecSpace,iterate_type,Kokkos::IndexType<iType>> policy_type;
ViewCopy(const ViewTypeA& a_, const ViewTypeB& b_):a(a_),b(b_) {
@ -642,7 +655,9 @@ void view_copy(const DstType& dst, const SrcType& src) {
int64_t strides[DstType::Rank+1];
dst.stride(strides);
Kokkos::Iterate iterate;
if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
if ( Kokkos::is_layouttiled<typename DstType::array_layout>::value ) {
iterate = Kokkos::layout_iterate_type_selector<typename DstType::array_layout>::outer_iteration_pattern;
} else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutRight>::value ) {
iterate = Kokkos::Iterate::Right;
} else if ( std::is_same<typename DstType::array_layout,Kokkos::LayoutLeft>::value ) {
iterate = Kokkos::Iterate::Left;
@ -1243,9 +1258,9 @@ void deep_copy
ViewTypeFlat;
ViewTypeFlat dst_flat(dst.data(),dst.size());
if(dst.span() < std::numeric_limits<int>::max())
if(dst.span() < std::numeric_limits<int>::max()) {
Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutRight, typename ViewType::execution_space, ViewTypeFlat::Rank, int >( dst_flat , value );
else
} else
Kokkos::Impl::ViewFill< ViewTypeFlat , Kokkos::LayoutRight, typename ViewType::execution_space, ViewTypeFlat::Rank, int64_t >( dst_flat , value );
Kokkos::fence();
return;
@ -1397,7 +1412,6 @@ void deep_copy
enum { SrcExecCanAccessDst =
Kokkos::Impl::SpaceAccessibility< src_execution_space , dst_memory_space >::accessible };
// Checking for Overlapping Views.
dst_value_type* dst_start = dst.data();
dst_value_type* dst_end = dst.data() + dst.span();
@ -1493,7 +1507,7 @@ void deep_copy
Kokkos::fence();
} else {
Kokkos::fence();
Impl::view_copy(typename dst_type::uniform_runtime_nomemspace_type(dst),typename src_type::uniform_runtime_const_nomemspace_type(src));
Impl::view_copy(dst, src);
Kokkos::fence();
}
}
@ -1739,8 +1753,7 @@ void deep_copy
exec_space.fence();
} else {
exec_space.fence();
Impl::view_copy(typename dst_type::uniform_runtime_nomemspace_type(dst),
typename src_type::uniform_runtime_const_nomemspace_type(src));
Impl::view_copy(dst, src);
exec_space.fence();
}
}
@ -1917,4 +1930,213 @@ void realloc( Kokkos::View<T,P...> & v ,
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
// Deduce Mirror Types
template<class Space, class T, class ... P>
struct MirrorViewType {
// The incoming view_type
typedef typename Kokkos::View<T,P...> src_view_type;
// The memory space for the mirror view
typedef typename Space::memory_space memory_space;
// Check whether it is the same memory space
enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
// The array_layout
typedef typename src_view_type::array_layout array_layout;
// The data type (we probably want it non-const, since otherwise we can't even deep_copy to it)
typedef typename src_view_type::non_const_data_type data_type;
// The destination view type if it is not the same memory space
typedef Kokkos::View<data_type,array_layout,Space> dest_view_type;
// If it is the same memory_space return the existing view_type
// This will also keep the unmanaged trait if necessary
typedef typename std::conditional<is_same_memspace,src_view_type,dest_view_type>::type view_type;
};
template<class Space, class T, class ... P>
struct MirrorType {
// The incoming view_type
typedef typename Kokkos::View<T,P...> src_view_type;
// The memory space for the mirror view
typedef typename Space::memory_space memory_space;
// Check whether it is the same memory space
enum { is_same_memspace = std::is_same<memory_space,typename src_view_type::memory_space>::value };
// The array_layout
typedef typename src_view_type::array_layout array_layout;
// The data type (we probably want it non-const, since otherwise we can't even deep_copy to it)
typedef typename src_view_type::non_const_data_type data_type;
// The destination view type if it is not the same memory space
typedef Kokkos::View<data_type,array_layout,Space> view_type;
};
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror( const Kokkos::View<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
! std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
)
{
typedef View<T,P...> src_type ;
typedef typename src_type::HostMirror dst_type ;
return dst_type( std::string( src.label() ).append("_mirror")
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
, src.extent(0)
, src.extent(1)
, src.extent(2)
, src.extent(3)
, src.extent(4)
, src.extent(5)
, src.extent(6)
, src.extent(7) );
#else
, src.rank_dynamic > 0 ? src.extent(0): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 1 ? src.extent(1): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 2 ? src.extent(2): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 3 ? src.extent(3): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 4 ? src.extent(4): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 5 ? src.extent(5): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 6 ? src.extent(6): KOKKOS_IMPL_CTOR_DEFAULT_ARG
, src.rank_dynamic > 7 ? src.extent(7): KOKKOS_IMPL_CTOR_DEFAULT_ARG );
#endif
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror( const Kokkos::View<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value &&
std::is_same< typename Kokkos::ViewTraits<T,P...>::array_layout
, Kokkos::LayoutStride >::value
>::type * = 0
)
{
typedef View<T,P...> src_type ;
typedef typename src_type::HostMirror dst_type ;
Kokkos::LayoutStride layout ;
layout.dimension[0] = src.extent(0);
layout.dimension[1] = src.extent(1);
layout.dimension[2] = src.extent(2);
layout.dimension[3] = src.extent(3);
layout.dimension[4] = src.extent(4);
layout.dimension[5] = src.extent(5);
layout.dimension[6] = src.extent(6);
layout.dimension[7] = src.extent(7);
layout.stride[0] = src.stride_0();
layout.stride[1] = src.stride_1();
layout.stride[2] = src.stride_2();
layout.stride[3] = src.stride_3();
layout.stride[4] = src.stride_4();
layout.stride[5] = src.stride_5();
layout.stride[6] = src.stride_6();
layout.stride[7] = src.stride_7();
return dst_type( std::string( src.label() ).append("_mirror") , layout );
}
// Create a mirror in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorType<Space,T,P ...>::view_type
create_mirror(const Space& , const Kokkos::View<T,P...> & src
, typename std::enable_if<
std::is_same< typename ViewTraits<T,P...>::specialize , void >::value
>::type * = 0) {
return typename Impl::MirrorType<Space,T,P ...>::view_type(src.label(),src.layout());
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror_view( const Kokkos::View<T,P...> & src
, typename std::enable_if<(
std::is_same< typename Kokkos::View<T,P...>::memory_space
, typename Kokkos::View<T,P...>::HostMirror::memory_space
>::value
&&
std::is_same< typename Kokkos::View<T,P...>::data_type
, typename Kokkos::View<T,P...>::HostMirror::data_type
>::value
)>::type * = 0
)
{
return src ;
}
template< class T , class ... P >
inline
typename Kokkos::View<T,P...>::HostMirror
create_mirror_view( const Kokkos::View<T,P...> & src
, typename std::enable_if< ! (
std::is_same< typename Kokkos::View<T,P...>::memory_space
, typename Kokkos::View<T,P...>::HostMirror::memory_space
>::value
&&
std::is_same< typename Kokkos::View<T,P...>::data_type
, typename Kokkos::View<T,P...>::HostMirror::data_type
>::value
)>::type * = 0
)
{
return Kokkos::create_mirror( src );
}
// Create a mirror view in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
return src;
}
// Create a mirror view in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view(const Space& , const Kokkos::View<T,P...> & src
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
return typename Impl::MirrorViewType<Space,T,P ...>::view_type(src.label(),src.layout());
}
// Create a mirror view and deep_copy in a new space (specialization for same space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
(void)name;
return src;
}
// Create a mirror view and deep_copy in a new space (specialization for different space)
template<class Space, class T, class ... P>
typename Impl::MirrorViewType<Space,T,P ...>::view_type
create_mirror_view_and_copy(const Space& , const Kokkos::View<T,P...> & src
, std::string const& name = ""
, typename std::enable_if<!Impl::MirrorViewType<Space,T,P ...>::is_same_memspace>::type* = 0 ) {
using Mirror = typename Impl::MirrorViewType<Space,T,P ...>::view_type;
std::string label = name.empty() ? src.label() : name;
auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout());
deep_copy(mirror, src);
return mirror;
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif
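A minimal usage sketch of the mirror helpers added above: create_mirror_view_and_copy allocates (without initializing) only when the requested space differs from the source's memory space, deep_copies the data, and otherwise returns the source view unchanged.

#include <Kokkos_Core.hpp>

void mirror_sketch() {
  Kokkos::View<double*> d("d", 100);    // default (device) memory space
  Kokkos::deep_copy(d, 1.0);
  auto h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), d);
  // h lives in HostSpace and already holds the data of d; if d were host accessible,
  // h would simply be d and no allocation or copy would occur.
  (void)h;
}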

View File

@ -57,6 +57,10 @@
namespace Kokkos {
struct ParallelForTag {};
struct ParallelScanTag {};
struct ParallelReduceTag {};
struct ChunkSize {
int value;
ChunkSize(int value_):value(value_) {}
@ -320,6 +324,10 @@ public:
template< class FunctorType >
static int team_size_recommended( const FunctorType & , const int&);
template<class FunctorType>
int team_size_recommended( const FunctorType & functor , const int vector_length);
//----------------------------------------
/** \brief Construct policy with the given instance of the execution space */
TeamPolicyInternal( const typename traits::execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 );
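A usage sketch of the ChunkSize helper declared above, assuming (as in later Kokkos releases) that RangePolicy accepts a ChunkSize constructor argument; the label and sizes are arbitrary.

#include <Kokkos_Core.hpp>

void chunked_sketch() {
  Kokkos::RangePolicy<> policy(0, 1 << 20, Kokkos::ChunkSize(128));
  Kokkos::parallel_for("chunked", policy,
      KOKKOS_LAMBDA(const int i) { (void)i; /* work handed out in 128-iteration chunks */ });
}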

View File

@ -76,6 +76,8 @@ struct LayoutLeft {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
enum { is_extent_constructible = true };
LayoutLeft( LayoutLeft const & ) = default ;
LayoutLeft( LayoutLeft && ) = default ;
LayoutLeft & operator = ( LayoutLeft const & ) = default ;
@ -108,6 +110,8 @@ struct LayoutRight {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ];
enum { is_extent_constructible = true };
LayoutRight( LayoutRight const & ) = default ;
LayoutRight( LayoutRight && ) = default ;
LayoutRight & operator = ( LayoutRight const & ) = default ;
@ -132,6 +136,8 @@ struct LayoutStride {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
size_t stride[ ARRAY_LAYOUT_MAX_RANK ] ;
enum { is_extent_constructible = false };
LayoutStride( LayoutStride const & ) = default ;
LayoutStride( LayoutStride && ) = default ;
LayoutStride & operator = ( LayoutStride const & ) = default ;
@ -222,6 +228,8 @@ struct LayoutTileLeft {
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
enum { is_extent_constructible = true };
LayoutTileLeft( LayoutTileLeft const & ) = default ;
LayoutTileLeft( LayoutTileLeft && ) = default ;
LayoutTileLeft & operator = ( LayoutTileLeft const & ) = default ;
@ -235,6 +243,144 @@ struct LayoutTileLeft {
: dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
};
//////////////////////////////////////////////////////////////////////////////////////
enum class Iterate
{
Default,
Left, // Left indices stride fastest
Right // Right indices stride fastest
};
// To check for LayoutTiled
// This is to hide extra compile-time 'identifier' info within the LayoutTiled class by not relying on template specialization to include the ArgN*'s
template < typename LayoutTiledCheck, class Enable = void >
struct is_layouttiled : std::false_type {};
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
template < typename LayoutTiledCheck >
struct is_layouttiled< LayoutTiledCheck, typename std::enable_if<LayoutTiledCheck::is_array_layout_tiled>::type > : std::true_type {};
namespace Experimental {
/// LayoutTiled
// Must have Rank >= 2
template < Kokkos::Iterate OuterP, Kokkos::Iterate InnerP,
unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 = 0, unsigned ArgN3 = 0, unsigned ArgN4 = 0, unsigned ArgN5 = 0, unsigned ArgN6 = 0, unsigned ArgN7 = 0,
bool IsPowerOfTwo =
( Impl::is_integral_power_of_two(ArgN0) &&
Impl::is_integral_power_of_two(ArgN1) &&
(Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
)
>
struct LayoutTiled {
static_assert( IsPowerOfTwo
, "LayoutTiled must be given power-of-two tile dimensions" );
#if 0
static_assert( (Impl::is_integral_power_of_two(ArgN0) ) &&
(Impl::is_integral_power_of_two(ArgN1) ) &&
(Impl::is_integral_power_of_two(ArgN2) || (ArgN2 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN3) || (ArgN3 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN4) || (ArgN4 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN5) || (ArgN5 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN6) || (ArgN6 == 0) ) &&
(Impl::is_integral_power_of_two(ArgN7) || (ArgN7 == 0) )
, "LayoutTiled must be given power-of-two tile dimensions" );
#endif
typedef LayoutTiled<OuterP, InnerP, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, IsPowerOfTwo> array_layout ;
static constexpr Iterate outer_pattern = OuterP;
static constexpr Iterate inner_pattern = InnerP;
enum { N0 = ArgN0 };
enum { N1 = ArgN1 };
enum { N2 = ArgN2 };
enum { N3 = ArgN3 };
enum { N4 = ArgN4 };
enum { N5 = ArgN5 };
enum { N6 = ArgN6 };
enum { N7 = ArgN7 };
size_t dimension[ ARRAY_LAYOUT_MAX_RANK ] ;
enum { is_extent_constructible = true };
LayoutTiled( LayoutTiled const & ) = default ;
LayoutTiled( LayoutTiled && ) = default ;
LayoutTiled & operator = ( LayoutTiled const & ) = default ;
LayoutTiled & operator = ( LayoutTiled && ) = default ;
KOKKOS_INLINE_FUNCTION
explicit constexpr
LayoutTiled( size_t argN0 = 0 , size_t argN1 = 0 , size_t argN2 = 0 , size_t argN3 = 0
, size_t argN4 = 0 , size_t argN5 = 0 , size_t argN6 = 0 , size_t argN7 = 0
)
: dimension { argN0 , argN1 , argN2 , argN3 , argN4 , argN5 , argN6 , argN7 } {}
};
} // namespace Experimental
#endif
// For use with view_copy
template < typename ... Layout >
struct layout_iterate_type_selector {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Default ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default ;
};
template <>
struct layout_iterate_type_selector< Kokkos::LayoutRight > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
};
template <>
struct layout_iterate_type_selector< Kokkos::LayoutLeft > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
};
template <>
struct layout_iterate_type_selector< Kokkos::LayoutStride > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Default ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Default ;
};
#ifndef KOKKOS_ENABLE_DEPRECATED_CODE
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
};
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, Kokkos::Iterate::Left, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Left ;
};
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Left ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
};
template < unsigned ArgN0 , unsigned ArgN1 , unsigned ArgN2 , unsigned ArgN3 , unsigned ArgN4 , unsigned ArgN5 , unsigned ArgN6 , unsigned ArgN7 >
struct layout_iterate_type_selector< Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Right, Kokkos::Iterate::Right, ArgN0, ArgN1, ArgN2, ArgN3, ArgN4, ArgN5, ArgN6, ArgN7, true> > {
static const Kokkos::Iterate outer_iteration_pattern = Kokkos::Iterate::Right ;
static const Kokkos::Iterate inner_iteration_pattern = Kokkos::Iterate::Right ;
};
#endif
} // namespace Kokkos
#endif // #ifndef KOKKOS_LAYOUT_HPP
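
The layout machinery above is what the tiled-view support in this update builds on: LayoutTiled describes the tile shape and traversal order, and layout_iterate_type_selector is what view_copy consults to pick its outer/inner iteration pattern for a given layout. A minimal usage sketch follows; it assumes the tiled ViewMapping that accompanies this Kokkos update, and the view name, extents, and 4x4 tile shape are purely illustrative.

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    // 2-D tiled layout: tiles traversed Left (column-major) both between and
    // within tiles, with power-of-two 4x4 tiles as the static_assert requires.
    using TileLayout =
        Kokkos::Experimental::LayoutTiled<Kokkos::Iterate::Left,
                                          Kokkos::Iterate::Left, 4, 4>;

    // is_extent_constructible = true, so the View can be built from extents.
    Kokkos::View<double**, TileLayout, Kokkos::HostSpace> a("A", 8, 8);

    for (int j = 0; j < 8; ++j)
      for (int i = 0; i < 8; ++i)
        a(i, j) = i + 0.1 * j;   // element access goes through the tiled offset map

    // The selector reports the iteration pattern view_copy will use.
    using sel = Kokkos::layout_iterate_type_selector<Kokkos::LayoutRight>;
    static_assert(sel::outer_iteration_pattern == Kokkos::Iterate::Right &&
                  sel::inner_iteration_pattern == Kokkos::Iterate::Right,
                  "LayoutRight iterates Right over both index groups");
  }
  Kokkos::finalize();
  return 0;
}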

View File

@@ -153,7 +153,7 @@
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#if defined( KOKKOS_ENABLE_CXX1Z )
#if defined( KOKKOS_ENABLE_CXX17 ) || defined( KOKKOS_ENABLE_CXX20 )
#define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__
#endif
#endif
@@ -213,7 +213,7 @@
#define KOKKOS_LAMBDA [=]
#endif
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
#if (defined( KOKKOS_ENABLE_CXX17 ) || defined( KOKKOS_ENABLE_CXX20) )&& !defined( KOKKOS_CLASS_LAMBDA )
#define KOKKOS_CLASS_LAMBDA [=,*this]
#endif
@@ -521,6 +521,9 @@
#if defined ( KOKKOS_ENABLE_CUDA )
#if ( 9000 <= CUDA_VERSION )
#define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND
#if ( __CUDA_ARCH__ )
#define KOKKOS_IMPL_CUDA_SYNCWARP_NEEDS_MASK
#endif
#endif
#endif
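
The first two hunks above switch the KOKKOS_CLASS_LAMBDA definition from the old KOKKOS_ENABLE_CXX1Z guard to KOKKOS_ENABLE_CXX17/KOKKOS_ENABLE_CXX20, and the last hunk adds an internal syncwarp-mask workaround macro for CUDA 9+ device compilation. As a hedged sketch of the idiom KOKKOS_CLASS_LAMBDA exists for (it needs a C++17 build; the AxpyRunner type and its members are illustrative): capture *this by value in a member function so device code reads a copy of the object instead of dereferencing a host this pointer.

#include <Kokkos_Core.hpp>

struct AxpyRunner {
  double a;
  Kokkos::View<double*> x, y;

  void run() const {
    // KOKKOS_CLASS_LAMBDA expands to [=,*this] (plus __host__ __device__ for
    // CUDA), so a, x, and y below come from the captured copy of *this.
    Kokkos::parallel_for("axpy", x.extent(0),
        KOKKOS_CLASS_LAMBDA(const int i) { y(i) += a * x(i); });
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    AxpyRunner r{2.0, Kokkos::View<double*>("x", 100), Kokkos::View<double*>("y", 100)};
    r.run();
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}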

View File

@@ -793,7 +793,7 @@ struct ParallelReduceReturnValue<typename std::enable_if<
static return_type return_value(ReturnType& return_val,
const FunctorType& functor) {
#ifdef KOKOOS_ENABLE_DEPRECATED_CODE
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
return return_type(return_val,functor.value_count);
#else
if ( is_array<ReturnType>::value )
@@ -1002,7 +1002,8 @@ void parallel_reduce(const std::string& label,
typename Impl::enable_if<
Kokkos::Impl::is_execution_policy<PolicyType>::value
>::type * = 0) {
Impl::ParallelReduceAdaptor<PolicyType,FunctorType,const ReturnType>::execute(label,policy,functor,return_value);
ReturnType return_value_impl = return_value;
Impl::ParallelReduceAdaptor<PolicyType,FunctorType,ReturnType>::execute(label,policy,functor,return_value_impl);
}
template< class PolicyType, class FunctorType, class ReturnType >
@@ -1054,6 +1055,9 @@ void parallel_reduce(const std::string& label,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,PolicyType,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
@@ -1076,6 +1080,9 @@ void parallel_reduce(const PolicyType& policy,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,PolicyType,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
@@ -1096,6 +1103,9 @@ void parallel_reduce(const size_t& policy,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,RangePolicy<>,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
@@ -1117,6 +1127,9 @@ void parallel_reduce(const std::string& label,
, typename ValueTraits::pointer_type
>::type value_type ;
static_assert(Impl::FunctorAnalysis<Impl::FunctorPatternInterface::REDUCE,RangePolicy<>,FunctorType>::
has_final_member_function,"Calling parallel_reduce without either return value or final function.");
typedef Kokkos::View< value_type
, Kokkos::HostSpace
, Kokkos::MemoryUnmanaged
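
Two behaviors change in the parallel_reduce hunks above: the labeled overload now copies its return-value argument into a non-const local before instantiating ParallelReduceAdaptor, and the overloads that take no result argument now static_assert that the functor provides a final() member. A hedged sketch of both calling styles follows; the functor, view names, and sizes are illustrative, not taken from this diff.

#include <Kokkos_Core.hpp>
#include <cstdio>

// A reduction functor with init/join/final; the final() member is exactly what
// the new static_assert demands when parallel_reduce is called without a result.
struct MaxIntoView {
  Kokkos::View<const double*> v;
  Kokkos::View<double> out;   // device-resident slot that final() writes into
  using value_type = double;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, double& m) const { if (v(i) > m) m = v(i); }

  KOKKOS_INLINE_FUNCTION
  void init(double& m) const { m = -1.0e300; }

  KOKKOS_INLINE_FUNCTION
  void join(volatile double& dst, const volatile double& src) const {
    if (src > dst) dst = src;
  }

  KOKKOS_INLINE_FUNCTION
  void final(double& m) const { out() = m; }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    const int n = 1000;
    Kokkos::View<double*> v("v", n);
    Kokkos::deep_copy(v, 1.0);

    // Style 1: labeled call with an explicit scalar result; this is the
    // overload that now copies return_value before running the adaptor.
    double sum = 0.0;
    Kokkos::parallel_reduce("sum", Kokkos::RangePolicy<>(0, n),
        KOKKOS_LAMBDA(const int i, double& lsum) { lsum += v(i); }, sum);

    // Style 2: no result argument; compiles only because the functor has final().
    Kokkos::View<double> out("max");
    Kokkos::parallel_reduce("max", n, MaxIntoView{v, out});
    Kokkos::fence();

    printf("sum = %g\n", sum);
  }
  Kokkos::finalize();
  return 0;
}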

View File

@@ -136,6 +136,55 @@ public:
}
}
KOKKOS_INLINE_FUNCTION
void* get_shmem_aligned (const ptrdiff_t size, const ptrdiff_t alignment, int level = -1) const {
if(level == -1)
level = m_default_level;
if(level == 0) {
char* previous = m_iter_L0;
const ptrdiff_t missalign = size_t(m_iter_L0)%alignment;
if(missalign) m_iter_L0 += alignment-missalign;
void* tmp = m_iter_L0 + m_offset * size;
if (m_end_L0 < (m_iter_L0 += size * m_multiplier)) {
m_iter_L0 = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L0-m_iter_L0));
#endif // KOKKOS_DEBUG
tmp = 0;
}
return tmp;
} else {
char* previous = m_iter_L1;
const ptrdiff_t missalign = size_t(m_iter_L1)%alignment;
if(missalign) m_iter_L1 += alignment-missalign;
void* tmp = m_iter_L1 + m_offset * size;
if (m_end_L1 < (m_iter_L1 += size * m_multiplier)) {
m_iter_L1 = previous; // put it back like it was
#ifdef KOKKOS_DEBUG
// mfh 23 Jun 2015: printf call consumes 25 registers
// in a CUDA build, so only print in debug mode. The
// function still returns NULL if not enough memory.
printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
"%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
long(m_end_L1-m_iter_L1));
#endif // KOKKOS_DEBUG
tmp = 0;
}
return tmp;
}
}
template< typename IntType >
KOKKOS_INLINE_FUNCTION
ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0)
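
The new get_shmem_aligned above pads the level-0 or level-1 scratch cursor up to the requested alignment, hands out the chunk, and rolls the cursor back (returning NULL) when the request does not fit. A hedged sketch of how it might be called from a team kernel follows; the team count, scratch-size request, and buffer use are illustrative, and it assumes the usual team_shmem() accessor on the team handle.

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using member_type = Kokkos::TeamPolicy<>::member_type;
    const int nteams = 4;

    // Request enough level-0 scratch per team to cover the chunk plus any
    // padding the alignment request may introduce.
    Kokkos::parallel_for("aligned_scratch",
        Kokkos::TeamPolicy<>(nteams, Kokkos::AUTO)
            .set_scratch_size(0, Kokkos::PerTeam(1024)),
        KOKKOS_LAMBDA(const member_type& team) {
          // 64 doubles from level-0 scratch, aligned to 64 bytes; a null
          // pointer means the padded request did not fit.
          double* buf = static_cast<double*>(
              team.team_shmem().get_shmem_aligned(64 * sizeof(double), 64));
          if (buf != nullptr) {
            Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 64),
                [&](const int i) { buf[i] = static_cast<double>(i); });
          }
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}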

View File

@@ -262,7 +262,7 @@ public:
}
//----------------------------------------
#ifdef KOKKOS_ENABLE_DEPRECATED_CODE
template< class FunctorType >
static
int team_size_max( const FunctorType & ) { return 1 ; }
@@ -274,6 +274,16 @@ public:
template< class FunctorType >
static
int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
#endif
template<class FunctorType>
int team_size_max( const FunctorType&, const ParallelForTag& ) const { return 1 ; }
template<class FunctorType>
int team_size_max( const FunctorType&, const ParallelReduceTag& ) const { return 1 ; }
template<class FunctorType>
int team_size_recommended( const FunctorType&, const ParallelForTag& ) const { return 1 ; }
template<class FunctorType>
int team_size_recommended( const FunctorType&, const ParallelReduceTag& ) const { return 1 ; }
//----------------------------------------
@@ -281,6 +291,16 @@ public:
inline int league_size() const { return m_league_size ; }
inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
inline static
int vector_length_max()
{ return 1024; } // Use arbitrary large number, is meant as a vectorizable length
inline static
int scratch_size_max(int level)
{ return (level==0?
1024*32:
20*1024*1024);
}
/** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space &
, int league_size_request
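
This TeamPolicyInternal keeps the functor-only team_size_max/team_size_recommended overloads behind KOKKOS_ENABLE_DEPRECATED_CODE and adds tag-based overloads plus vector_length_max() and scratch_size_max() queries. A hedged sketch of how the tag-based queries are meant to be used through the public TeamPolicy follows; it assumes those overloads are reachable from the public policy type, and the Work functor is illustrative.

#include <Kokkos_Core.hpp>
#include <cstdio>

struct Work {
  KOKKOS_INLINE_FUNCTION
  void operator()(const Kokkos::TeamPolicy<>::member_type& team) const {
    (void)team;   // per-team work would go here
  }
};

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::TeamPolicy<> policy(10, Kokkos::AUTO);

    // Tag-based limits: query parallel_for and parallel_reduce separately
    // instead of using the deprecated functor-only overloads.
    const int max_for = policy.team_size_max(Work(), Kokkos::ParallelForTag());
    const int rec_red = policy.team_size_recommended(Work(), Kokkos::ParallelReduceTag());
    printf("team_size_max(for) = %d, team_size_recommended(reduce) = %d\n",
           max_for, rec_red);

    Kokkos::parallel_for("teams", policy, Work());
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}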

Some files were not shown because too many files have changed in this diff.