Merge branch 'master' into HEAD

2017-10-07 12:35:51 -06:00
parent 11eed234f0 88a33edb50
commit b6187173e6
125 changed files with 3966 additions and 2965 deletions
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@ -37,6 +37,10 @@ enable_language(CXX)
 #####################################################################
 include(CheckCCompilerFlag)

+if (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel")
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
+endif()
+
 ########################################################################
 # User input options                                                   #
 ########################################################################
@ -76,7 +80,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
 option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
 if(LAMMPS_EXCEPTIONS)
  add_definitions(-DLAMMPS_EXCEPTIONS)
-  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS")
+  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS")
 endif()

 set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically")
@ -665,7 +669,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR})
 ############################################
 add_library(lammps ${LIB_SOURCES})
 target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
-add_dependencies(lammps ${LAMMPS_DEPS})
+if(LAMMPS_DEPS)
+  add_dependencies(lammps ${LAMMPS_DEPS})
+endif()
 set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE})
 if(BUILD_SHARED_LIBS)
  set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
--- a/doc/src/JPG/user_intel.png
+++ b/doc/src/JPG/user_intel.png
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@ -706,7 +706,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding
 quantum forces calculated by LATTE.

 More information on LATTE can be found at this web site:
-"https://github.com/lanl/LATTE"_#latte_home.  A brief technical
+"https://github.com/lanl/LATTE"_latte_home.  A brief technical
 description is given with the "fix latte"_fix_latte.html command.

 :link(latte_home,https://github.com/lanl/LATTE)
@ -729,6 +729,7 @@ make lib-latte args="-b"                # download and build in lib/latte/LATTE-
 make lib-latte args="-p $HOME/latte"    # use existing LATTE installation in $HOME/latte
 make lib-latte args="-b -m gfortran"    # download and build in lib/latte and 
                                        #   copy Makefile.lammps.gfortran to Makefile.lammps
+:pre

 Note that 3 symbolic (soft) links, "includelink" and "liblink" and
 "filelink", are created in lib/latte to point into the LATTE home dir.
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
 [Currently Available USER-INTEL Styles:]

 Angle Styles: charmm, harmonic :ulb,l
-Bond Styles: fene, harmonic :l
+Bond Styles: fene, fourier, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
-Fixes: nve, npt, nvt, nvt/sllod :l
+Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l
 Improper Styles: cvff, harmonic :l
 Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
-buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
-lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
-sw, tersoff :l
+buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
+lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, 
+rebo, sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule

@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks).
 :c,image(JPG/user_intel.png)

 Results are speedups obtained on Intel Xeon E5-2697v4 processors
-(code-named Broadwell) and Intel Xeon Phi 7250 processors
-(code-named Knights Landing) with "June 2017" LAMMPS built with
-Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
-per physical core. See {src/USER-INTEL/TEST/README} for the raw
-simulation rates and instructions to reproduce.
+(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named
+Knights Landing), and Intel Xeon Gold 6148 processors (code-named
+Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio
+2017 update 2. Results are with 1 MPI task per physical core. See
+{src/USER-INTEL/TEST/README} for the raw simulation rates and
+instructions to reproduce.

 :line

@ -82,6 +83,11 @@ this order :l
 The {newton} setting applies to all atoms, not just atoms shared
 between MPI tasks :l
 Vectorization can change the order for adding pairwise forces :l
+When using the -DLMP_USE_MKL_RNG define (all included intel optimized
+makefiles do) at build time, the random number generator for
+dissipative particle dynamics (pair style dpd/intel) uses the Mersenne
+Twister generator included in the Intel MKL library (that should be
+more robust than the default Masaglia random number generator) :l
 :ule

 The precision mode (described below) used with the USER-INTEL
@ -108,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
-For simulations on higher node counts, add "processors * * * grid 
+For simulations on higher node counts, add "processors * * * grid
 numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs:
 Runs should be performed using MCDRAM. :ulb,l
 :ule

-For simulations using {kspace_style pppm} on Intel CPUs
-supporting AVX-512:
+For simulations using {kspace_style pppm} on Intel CPUs supporting
+AVX-512:

 Add "kspace_modify diff ad" to the input script :ulb,l
 The command-line option should be changed to
@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building
 without offload support will produce a smaller binary.

 The general requirements for Makefiles with the USER-INTEL package
-are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
-using Intel compilers, "-restrict" is required and "-qopenmp" is
-highly recommended for CCFLAGS and LINKFLAGS. LIB should include
-"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
-is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
-Other recommended CCFLAG options for best performance are
-"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2
-no-prec-div".
+are as follows. When using Intel compilers, "-restrict" is required 
+and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. 
+CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads
+are not supported in the build environment) and "-DLMP_USE_MKL_RNG"
+(unless Intel Math Kernel Library (MKL) is not available in the build
+environment). For Intel compilers, LIB should include "-ltbbmalloc" 
+or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added
+to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is
+required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
+recommended CCFLAG options for best performance are "-O2 -fno-alias
+-ansi-alias -qoverride-limits fp-model fast=2 -no-prec-div".

 NOTE: The vectorization and math capabilities can differ depending on
 the CPU. For Intel compilers, the "-x" flag specifies the type of
--- a/doc/src/atom_modify.txt
+++ b/doc/src/atom_modify.txt
@ -16,7 +16,7 @@ atom_modify keyword values ... :pre
 one or more keyword/value pairs may be appended :ulb,l
 keyword = {id} or {map} or {first} or {sort} :l
   {id} value = {yes} or {no}
-   {map} value = {array} or {hash}
+   {map} value = {yes} or {array} or {hash}
   {first} value = group-ID = group whose atoms will appear first in internal atom lists
   {sort} values = Nfreq binsize
     Nfreq = sort atoms spatially every this many time steps
@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l

 [Examples:]

-atom_modify map hash
-atom_modify map array sort 10000 2.0
+atom_modify map yes
+atom_modify map hash sort 10000 2.0
 atom_modify first colloid :pre

 [Description:]
@ -62,29 +62,33 @@ switch.  This is described in "Section 2.2"_Section_start.html#start_2
 of the manual.  If atom IDs are not used, they must be specified as 0
 for all atoms, e.g. in a data or restart file.

-The {map} keyword determines how atom ID lookup is done for molecular
-atom styles.  Lookups are performed by bond (angle, etc) routines in
-LAMMPS to find the local atom index associated with a global atom ID.
+The {map} keyword determines how atoms with specific IDs are found
+when required.  An example are the bond (angle, etc) methods which
+need to find the local index of an atom with a specific global ID
+which is a bond (angle, etc) partner.  LAMMPS performs this operation
+efficiently by creating a "map", which is either an {array} or {hash}
+table, as descibed below.

-When the {array} value is used, each processor stores a lookup table
-of length N, where N is the largest atom ID in the system.  This is a
+When the {map} keyword is not specified in your input script, LAMMPS
+only creates a map for "atom_styles"_atom_style.html for molecular
+systems which have permanent bonds (angles, etc).  No map is created
+for atomic systems, since it is normally not needed.  However some
+LAMMPS commands require a map, even for atomic systems, and will
+generate an error if one does not exist.  The {map} keyword thus
+allows you to force the creation of a map.  The {yes} value will
+create either an {array} or {hash} style map, as explained in the next
+paragraph.  The {array} and {hash} values create an atom-style or
+hash-style map respectively.
+
+For an {array}-style map, each processor stores a lookup table of
+length N, where N is the largest atom ID in the system.  This is a
 fast, simple method for many simulations, but requires too much memory
-for large simulations.  The {hash} value uses a hash table to perform
-the lookups.  This can be slightly slower than the {array} method, but
-its memory cost is proportional to the number of atoms owned by a
-processor, i.e. N/P when N is the total number of atoms in the system
-and P is the number of processors.
-
-When this setting is not specified in your input script, LAMMPS
-creates a map, if one is needed, as an array or hash.  See the
-discussion of default values below for how LAMMPS chooses which kind
-of map to build.  Note that atomic systems do not normally need to
-create a map.  However, even in this case some LAMMPS commands will
-create a map to find atoms (and then destroy it), or require a
-permanent map.  An example of the former is the "velocity loop
-all"_velocity.html command, which uses a map when looping over all
-atoms and insuring the same velocity values are assigned to an atom
-ID, no matter which processor owns it.
+for large simulations.  For a {hash}-style map, a hash table is
+created on each processor, which finds an atom ID in constant time
+(independent of the global number of atom IDs).  It can be slightly
+slower than the {array} map, but its memory cost is proportional to
+the number of atoms owned by a processor, i.e. N/P when N is the total
+number of atoms in the system and P is the number of processors.

 The {first} keyword allows a "group"_group.html to be specified whose
 atoms will be maintained as the first atoms in each processor's list
--- a/doc/src/dihedral_fourier.txt
+++ b/doc/src/dihedral_fourier.txt
@ -7,6 +7,7 @@
 :line

 dihedral_style fourier command :h3
+dihedral_style fourier/intel command :h3
 dihedral_style fourier/omp command :h3

 [Syntax:]
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... :pre
 dump-ID = ID of dump to modify :ulb,l
 one or more keyword/value pairs may be appended :l
 these keywords apply to various dump styles :l
-keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
-  {append} arg = {yes} or {no} or {at} N
+keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
+  {append} arg = {yes} or {no}
+  {at} arg = N
    N = index of frame written upon first dump
+    only available after "append yes"
  {buffer} arg = {yes} or {no}
  {element} args = E1 E2 ... EN, where N = # of atom types
    E1,...,EN = element name, e.g. C or Fe or Ga
--- a/doc/src/dump_netcdf.txt
+++ b/doc/src/dump_netcdf.txt
@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule

 dump 1 all netcdf 100 traj.nc type x y z vx vy vz
 dump_modify 1 append yes at -1 thermo yes
-dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre
+dump 1 all netcdf/mpiio 1000 traj.nc id type x y z
+dump 1 all netcdf 1000 traj.*.nc id type x y z :pre

 [Description:]

@ -73,4 +74,3 @@ section for more info.
 [Related commands:]

 "dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html
-
--- a/doc/src/fix_latte.txt
+++ b/doc/src/fix_latte.txt
@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond
 integrals are parameterized using a Slater-Koster tight-binding
 approach. This procedure, which usually is referred to as the DFTB
 method has been described in detail by ("Elstner"_#Elstner) and
-("Finnis"_#Finnis) and coworkers. 
+("Finnis"_#Finnis2) and coworkers. 

 The work of the LATTE developers follows that of Elstner closely with
 respect to the physical model.  However, the development of LATTE is
@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
 M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
 7260 (1998).

-:link(Finnis)
+:link(Finnis2)
 [(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van
 Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998).

@ -197,11 +197,11 @@ J. Sci. Comput. 36 (2), 147-170, (2014).
 [(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys.,
 141, 164123, (2014).

-:link(Niklasson2014)
+:link(Niklasson2017)
 [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017).

-:link(Niklasson2012)
-[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
+:link(Cawkwell2012)
+[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
 (17), 174308 (2012).

 :link(Negre2016)
--- a/doc/src/fix_neb.txt
+++ b/doc/src/fix_neb.txt
@ -93,7 +93,7 @@ intermediate replica with the previous and the next image:

 Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre

-Note that in this case the specified {Kspring) is in force/distance
+Note that in this case the specified {Kspring} is in force/distance
 units.

 With a value of {ideal}, the spring force is computed as suggested in
@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and
 RDideal is the ideal RD for which all the images are equally spaced.
 I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
 I is the replica number).  The meanDist is the average distance
-between replicas.  Note that in this case the specified {Kspring) is
+between replicas.  Note that in this case the specified {Kspring} is
 in force units.

 Note that the {ideal} form of nudging can often be more effective at
--- a/doc/src/fix_nh.txt
+++ b/doc/src/fix_nh.txt
@ -393,32 +393,36 @@ thermostatting and barostatting.
 :line

 These fixes compute a temperature and pressure each timestep.  To do
-this, the fix creates its own computes of style "temp" and "pressure",
-as if one of these two sets of commands had been issued:
+this, the thermostat and barostat fixes create their own computes of
+style "temp" and "pressure", as if one of these sets of commands had
+been issued:

+For fix nvt:
 compute fix-ID_temp group-ID temp
-compute fix-ID_press group-ID pressure fix-ID_temp :pre

+For fix npt and fix nph:
 compute fix-ID_temp all temp
 compute fix-ID_press all pressure fix-ID_temp :pre

-See the "compute temp"_compute_temp.html and "compute
-pressure"_compute_pressure.html commands for details.  Note that the
-IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID
-+ underscore + "press".  For fix nvt, the group for the new computes
-is the same as the fix group.  For fix nph and fix npt, the group for
-the new computes is "all" since pressure is computed for the entire
-system.
+For fix nvt, the group for the new temperature compute is the same as
+the fix group.  For fix npt and fix nph, the group for both the new
+temperature and pressure compute is "all" since pressure is computed
+for the entire system.  In the case of fix nph, the temperature
+compute is not used for thermostatting, but just for a kinetic-energy
+contribution to the pressure.  See the "compute
+temp"_compute_temp.html and "compute pressure"_compute_pressure.html
+commands for details.  Note that the IDs of the new computes are the
+fix-ID + underscore + "temp" or fix_ID + underscore + "press".

 Note that these are NOT the computes used by thermodynamic output (see
 the "thermo_style"_thermo_style.html command) with ID = {thermo_temp}
-and {thermo_press}.  This means you can change the attributes of this
+and {thermo_press}.  This means you can change the attributes of these
 fix's temperature or pressure via the
-"compute_modify"_compute_modify.html command or print this temperature
-or pressure during thermodynamic output via the "thermo_style
-custom"_thermo_style.html command using the appropriate compute-ID.
-It also means that changing attributes of {thermo_temp} or
-{thermo_press} will have no effect on this fix.
+"compute_modify"_compute_modify.html command.  Or you can print this
+temperature or pressure during thermodynamic output via the
+"thermo_style custom"_thermo_style.html command using the appropriate
+compute-ID.  It also means that changing attributes of {thermo_temp}
+or {thermo_press} will have no effect on this fix.

 Like other fixes that perform thermostatting, fix nvt and fix npt can
 be used with "compute commands"_compute.html that calculate a
--- a/doc/src/fixes.txt
+++ b/doc/src/fixes.txt
@ -59,6 +59,7 @@ Fixes :h1
   fix_langevin
   fix_langevin_drude
   fix_langevin_eff
+   fix_latte
   fix_lb_fluid
   fix_lb_momentum
   fix_lb_pc
--- a/doc/src/lammps.book
+++ b/doc/src/lammps.book
@ -188,6 +188,7 @@ fix_ipi.html
 fix_langevin.html
 fix_langevin_drude.html
 fix_langevin_eff.html
+fix_latte.html
 fix_lb_fluid.html
 fix_lb_momentum.html
 fix_lb_pc.html
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@ -62,7 +62,7 @@ args = arguments specific to the style :l
      {no_affinity} values = none
  {kokkos} args = keyword value ...
    zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward}
+    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
      {neigh} value = {full} or {half}
        full = full neighbor list
        half = half neighbor list built in thread-safe manner
@ -75,9 +75,10 @@ args = arguments specific to the style :l
      {binsize} value = size
        size = bin size for neighbor list construction (distance units)
      {comm} value = {no} or {host} or {device}
-        use value for both comm/exchange and comm/forward
+        use value for comm/exchange and comm/forward and comm/reverse
      {comm/exchange} value = {no} or {host} or {device}
      {comm/forward} value = {no} or {host} or {device}
+      {comm/reverse} value = {no} or {host} or {device}
        no = perform communication pack/unpack in non-KOKKOS mode
        host = perform pack/unpack on host (e.g. with OpenMP threading)
        device = perform pack/unpack on device (e.g. on GPU)
@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at
 performing pairwise interactions, then this rule of thumb may give too
 large a binsize.

-The {comm} and {comm/exchange} and {comm/forward} keywords determine
+The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
 whether the host or device performs the packing and unpacking of data
 when communicating per-atom data between processors.  "Exchange"
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
-"Forward" communication happens every timestep.  The data is for atom
+"Forward" communication happens every timestep. "Reverse" communication
+happens every timestep if the {newton} option is on.  The data is for atom
 coordinates and any other atom properties that needs to be updated for
 ghost atoms owned by each processor.

 The {comm} keyword is simply a short-cut to set the same value
-for both the {comm/exchange} and {comm/forward} keywords.
+for both the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.

 The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
--- a/doc/src/pair_dpd.txt
+++ b/doc/src/pair_dpd.txt
@ -8,6 +8,7 @@

 pair_style dpd command :h3
 pair_style dpd/gpu command :h3
+pair_style dpd/intel command :h3
 pair_style dpd/omp command :h3
 pair_style dpd/tstat command :h3
 pair_style dpd/tstat/gpu command :h3
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix.

 Style {eam/fs} computes pairwise interactions for metals and metal
 alloys using a generalized form of EAM potentials due to Finnis and
-Sinclair "(Finnis)"_#Finnis.  The total energy Ei of an atom I is
+Sinclair "(Finnis)"_#Finnis1.  The total energy Ei of an atom I is
 given by

 :c,image(Eqs/pair_eam_fs.jpg)
@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004).
 [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983).
 Daw, Baskes, Phys Rev B, 29, 6443 (1984).

-:link(Finnis)
+:link(Finnis1)
 [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).

 :link(Stukowski)
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@ -1,5 +1,24 @@
 # Change Log

+## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
+
+**Implemented enhancements:**
+
+- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
+- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
+- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
+- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
+- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
+- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
+
+**Fixed bugs:**
+
+- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
+- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
+- (Experimental) HBWSpace  Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
+- (Experimental) ROCm:  algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
+
 ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)

--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@ -443,7 +443,7 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
  KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
  KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
-  KOKKOS_LIBS += -lmemkind
+  KOKKOS_LIBS += -lmemkind -lnuma
  tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
 endif

@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

  else
-    # Assume that this is a really a GNU compiler or it could be XL on P8.
-    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
-    KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this is a really a GNU compiler on P8.
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+      endif
+    endif
  endif
 endif

@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
  ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)

  else
-    # Assume that this is a really a GNU compiler or it could be XL on P9.
-    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
-    KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this is a really a GNU compiler on P9
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+      endif
+    endif
  endif
 endif

--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
 }


+#endif
+
+#if defined(KOKKOS_ENABLE_ROCM) 
+
+  template<>
+  class Random_XorShift1024<Kokkos::Experimental::ROCm> {
+  private:
+    int p_;
+    const int state_idx_;
+    uint64_t* state_;
+    const int stride_;
+    friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
+  public:
+
+    typedef Kokkos::Experimental::ROCm device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return frand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+template<>
+inline
+Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim_x*blockDim_y*blockDim_z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
+#else
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
+#ifdef __HCC_ACCELERATOR__
+  state_(state.state_idx_) = state.state_;
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+template<>
+inline
+Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim_x*blockDim_y*blockDim_z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);
+#else
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
+#ifdef __HCC_ACCELERATOR__
+  for(int i=0; i<16; i++)
+    state_(state.state_idx_,i) = state.state_[i];
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
 #endif


--- a/lib/kokkos/algorithms/unit_tests/Makefile
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	TEST_TARGETS += test-cuda
 endif

+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+	OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_ROCm
+	TEST_TARGETS += test-rocm
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Threads
@ -51,6 +57,9 @@ endif
 KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda

+KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
+
 KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads

@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 test-cuda: KokkosAlgorithms_UnitTest_Cuda
 	./KokkosAlgorithms_UnitTest_Cuda

+test-rocm: KokkosAlgorithms_UnitTest_ROCm
+	./KokkosAlgorithms_UnitTest_ROCm
+
 test-threads: KokkosAlgorithms_UnitTest_Threads
 	./KokkosAlgorithms_UnitTest_Threads

--- a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp
+++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp
@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+class rocm : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Experimental::ROCm::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+void rocm_test_random_xorshift64( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+void rocm_test_random_xorshift1024( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+
+#define ROCM_RANDOM_XORSHIFT64( num_draws )  \
+  TEST_F( rocm, Random_XorShift64 ) {        \
+  rocm_test_random_xorshift64(num_draws);    \
+  }
+
+#define ROCM_RANDOM_XORSHIFT1024( num_draws )  \
+  TEST_F( rocm, Random_XorShift1024 ) {        \
+  rocm_test_random_xorshift1024(num_draws);    \
+  }
+
+#define ROCM_SORT_UNSIGNED( size )                                    \
+  TEST_F( rocm, SortUnsigned ) {                                      \
+      Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size);  \
+  }
+
+ROCM_RANDOM_XORSHIFT64(  132141141 )
+ROCM_RANDOM_XORSHIFT1024( 52428813 )
+ROCM_SORT_UNSIGNED(171)
+
+#undef ROCM_RANDOM_XORSHIFT64
+#undef ROCM_RANDOM_XORSHIFT1024
+#undef ROCM_SORT_UNSIGNED
+}
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_ROCM */
+
--- a/lib/kokkos/bin/hpcbind
+++ b/lib/kokkos/bin/hpcbind
@ -27,7 +27,7 @@ fi
 HPCBIND_HWLOC_PARENT_CPUSET=""
 if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
  MY_PID="$BASHPID"
-  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
+  HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
 fi

 ################################################################################
@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
 ################################################################################
 HPCBIND_QUEUE_NAME=""
 declare -i HPCBIND_QUEUE_INDEX=0
-declare -i HPCBIND_QUEUE_GPU_MAPPING=0
+declare -i HPCBIND_QUEUE_MAPPING=0

-if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
-  HPCBIND_QUEUE_NAME="sbatch"
+if [[ ! -z "${PMI_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mpich"
+  HPCBIND_QUEUE_INDEX=${PMI_RANK}
+elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="openmpi"
+  HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
+elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mvapich2"
+  HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
+elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="slurm"
  HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
 elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
+  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="bsub"
  HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
 elif [[ ! -z "${ALPS_APP_PE}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
+  HPCBIND_QUEUE_MAPPING=1
  HPCBIND_QUEUE_NAME="aprun"
  HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
 fi

-
 ################################################################################
 # Show help
 ################################################################################
@ -91,13 +102,14 @@ function show_help {
  echo "  --proc-bind=<LOC>     Set the initial process mask for the script"
  echo "                        LOC can be any valid location argument for"
  echo "                        hwloc-calc  Default: all"
+  echo "  --whole-system        ${cmd} will ignore the its parent process binding"
  echo "  --distribute=N        Distribute the current cpuset into N partitions"
  echo "  --distribute-partition=I"
  echo "                        Use the i'th partition (zero based)"
  echo "  --visible-gpus=<L>    Comma separated list of gpu ids"
  echo "                        Default: CUDA_VISIBLE_DEVICES or all gpus in"
  echo "                        sequential order"
-  echo "  --gpu-ignore-queue    Ignore queue job id when choosing visible GPU"
+  echo "  --ignore-queue        Ignore queue job id when choosing visible GPU and partition"
  echo "  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES"
  echo "  --openmp=M.m          Set env variables for the given OpenMP version"
  echo "                        Default: 4.0"
@ -110,22 +122,30 @@ function show_help {
  echo "  --force-openmp-proc-bind=<OP>"
  echo "                        Override logic for selecting OMP_PROC_BIND"
  echo "  --no-openmp-nested    Set OMP_NESTED to false"
-  echo "  --show-bindings       Show the bindings"
-  echo "  --lstopo              Show bindings in lstopo without executing a command"
-  echo "  -v|--verbose          Show options and relevant environment variables"
+  echo "  --output-prefix=<P>   Save the output to files of the form"
+  echo "                        P-N.log, P-N.out and P-N.err where P is the prefix"
+  echo "                        and N is the queue index or mpi rank (no spaces)"
+  echo "  --output-mode=<Op>    How console output should be handled."
+  echo "                        Options are all, rank0, and none.  Default: rank0" 
+  echo "  --lstopo              Show bindings in lstopo"
+  echo "  -v|--verbose          Print bindings and relevant environment variables"
  echo "  -h|--help             Show this message"
  echo ""
  echo "Sample Usage:"
  echo "  Split the current process cpuset into 4 and use the 3rd partition"
  echo "    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
-  echo "  Bing the process to all even cores"
+  echo "  Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
+  echo "  and save the output to rank specific files"
+  echo "    mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
+  echo "      --distribute=4 -v --output-prefix=output  -- command ..."
+  echo "  Bind the process to all even cores"
  echo "    ${cmd} --proc-bind=core:even -v -- command ..."
-  echo "  Bind to the first 64 cores and split the current process cpuset into 4"
-  echo "    ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
-  echo "  skip GPU 0 when mapping visible devices"
+  echo "  Bind the the even cores of socket 0 and the odd cores of socket 1"
+  echo "    ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
+  echo "  Skip GPU 0 when mapping visible devices"
  echo "    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
  echo "  Display the current bindings"
-  echo "    ${cmd} --proc-bind=numa:0 --show-bindings -- command"
+  echo "    ${cmd} --proc-bind=numa:0 -- command"
  echo "  Display the current bindings using lstopo"
  echo "    ${cmd} --proc-bind=numa:0.core:odd --lstopo"
  echo ""
@ -144,7 +164,7 @@ fi
 declare -a UNKNOWN_ARGS=()
 declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
 declare -i HPCBIND_DISTRIBUTE=1
-declare -i HPCBIND_PARTITION=0
+declare -i HPCBIND_PARTITION=-1
 HPCBIND_PROC_BIND="all"
 HPCBIND_OPENMP_VERSION=4.0
 declare -i HPCBIND_OPENMP_PERCENT=100
@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
 HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
 declare -i HPCBIND_VERBOSE=0

-declare -i HPCBIND_SHOW_BINDINGS=0
 declare -i HPCBIND_LSTOPO=0

-for i in $@; do
-  case $i in
+HPCBIND_OUTPUT_PREFIX=""
+HPCBIND_OUTPUT_MODE="rank0"
+
+declare -i HPCBIND_HAS_COMMAND=0
+
+for i in "$@"; do
+  case "$i" in
    # number of partitions to create
    --no-hwloc-bind)
      HPCBIND_ENABLE_HWLOC_BIND=0
@ -169,6 +193,10 @@ for i in $@; do
      HPCBIND_PROC_BIND="${i#*=}"
      shift
      ;;
+    --whole-system)
+      HPCBIND_HWLOC_PARENT_CPUSET=""
+      shift
+      ;;
    --distribute=*)
      HPCBIND_DISTRIBUTE="${i#*=}"
      shift
@ -182,8 +210,8 @@ for i in $@; do
      HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
      shift
      ;;
-    --gpu-ignore-queue)
-      HPCBIND_QUEUE_GPU_MAPPING=0
+    --ignore-queue)
+      HPCBIND_QUEUE_MAPPING=0
      shift
      ;;
    --no-gpu-mapping)
@ -218,14 +246,18 @@ for i in $@; do
      HPCBIND_OPENMP_NESTED="false"
      shift
      ;;
-    --show-bindings)
-      HPCBIND_VERBOSE=1
-      HPCBIND_SHOW_BINDINGS=1
+    --output-prefix=*)
+      HPCBIND_OUTPUT_PREFIX="${i#*=}"
+      shift
+      ;;
+    --output-mode=*)
+      HPCBIND_OUTPUT_MODE="${i#*=}"
+      #convert to lower case
+      HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
      shift
      ;;
    --lstopo)
      HPCBIND_VERBOSE=1
-      HPCBIND_SHOW_BINDINGS=0
      HPCBIND_LSTOPO=1
      shift
      ;;
@ -239,6 +271,7 @@ for i in $@; do
      ;;
    # ignore remaining arguments
    --)
+      HPCBIND_HAS_COMMAND=1
      shift
      break
      ;;
@ -250,16 +283,41 @@ for i in $@; do
  esac
 done

+################################################################################
+# Check output mode
+################################################################################
+declare -i HPCBIND_TEE=0
+
+if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
+  HPCBIND_TEE=0
+elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
+  HPCBIND_TEE=1
+elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
+  #default to rank0 printing to screen
+  HPCBIND_TEE=1
+fi
+
+
+if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
+  HPCBIND_LOG=/dev/null
+  HPCBIND_ERR=/dev/null
+  HPCBIND_OUT=/dev/null
+else
+  HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
+  HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
+  HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
+  > ${HPCBIND_LOG}
+fi
+

 ################################################################################
 # Check unknown arguments
 ################################################################################
 if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
-  echo "Uknown options: ${UNKNOWN_ARGS[*]}"
+  echo "HPCBIND Uknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
  exit 1
 fi

-
 ################################################################################
 # Check that visible gpus are valid
 ################################################################################
@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
  for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
    if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
      ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
-      echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
+      echo "HPCBIND Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
      HPCBIND_VISIBLE_GPUS[$i]=0;
    fi
  done
  NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
 fi

-
 ################################################################################
 # Check OpenMP percent
 ################################################################################
 if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
-  echo "OpenMP percent < 1, setting to 1"
  HPCBIND_OPENMP_PERCENT=1
 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
-  echo "OpenMP percent > 100, setting to 100"
  HPCBIND_OPENMP_PERCENT=100
 fi

@ -291,15 +346,21 @@ fi
 # Check distribute
 ################################################################################
 if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
-  echo "Invalid input for distribute, changing distribute to 1"
  HPCBIND_DISTRIBUTE=1
 fi

-if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
-  echo "Invalid input for distribute-partition, changing to 0"
+################################################################################
+#choose the correct partition
+################################################################################
+if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
+  HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
+elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
  HPCBIND_PARTITION=0
 fi

+if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
+  HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
+fi

 ################################################################################
 # Find cpuset and num threads
@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0

 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
  if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
-    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
+    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
  else
-    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
+    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
  fi

-  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
-  HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
+  if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
+    CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
+    HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
+  else
+    HPCBIND_HWLOC_CPUSET="${BINDING}"
+  fi
  HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
 else
  HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
 ################################################################################

 if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
-  if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
+  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
    declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
  else
    declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
    declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
  fi
 fi

@ -389,22 +454,22 @@ fi
 export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
 export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
 export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
-export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
+export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
 export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
 export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
 if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
  export HPCBIND_HWLOC_PARENT_CPUSET="all"
 else
-  export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
+  export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
 fi
-export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
+export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
 export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
 export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
-export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
+export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
 if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
  export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
-  export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
-  export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
+  export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
+  export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
 fi


@ -412,43 +477,63 @@ fi
 # Print verbose
 ################################################################################

-if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
-  MY_ENV=$(env | sort)
-  echo "[HPCBIND]"
-  echo "${MY_ENV}" | grep -E "^HPCBIND_"
-  echo "[CUDA]"
-  echo "${MY_ENV}" | grep -E "^CUDA_"
-  echo "[OPENMP]"
-  echo "${MY_ENV}" | grep -E "^OMP_"
-fi
+TMP_ENV=$(env | sort)
+if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
+  echo "[HOST]" >> ${HPCBIND_LOG}
+  hostname -s >> ${HPCBIND_LOG}
+  echo "[HPCBIND]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
+  echo "[CUDA]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
+  echo "[OPENMP]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}

-if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
-  echo "[BINDINGS]"
-  hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
-elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
-  echo "Unable to show bindings, hwloc not available."
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" >> ${HPCBIND_LOG}
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
+  else
+    echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
+  fi
+else
+  echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
+  hostname -s > >(tee -a ${HPCBIND_LOG})
+  echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
+  echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
+  echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
+
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
+  else
+    echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
+  fi
 fi

 ################################################################################
 # Run command
 ################################################################################

-if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
-  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
-  else
-    eval $@
-  fi
-else
-  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
-    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
-      echo "[BINDINGS]"
-      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
-      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
+# must be the last executed command so that the return value is correct
+if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
+  hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
+elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
+  # clear output files
+  > ${HPCBIND_ERR}
+  > ${HPCBIND_OUT}
+  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
    else
-      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
+      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
    fi
  else
-    echo "Unable to show bindings, hwloc not available."
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    else
+      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    fi
  fi
 fi
--- a/lib/kokkos/bin/kokkos-bind
+++ b/lib/kokkos/bin/kokkos-bind
@ -1,221 +0,0 @@
-#!/usr/bin/env bash
-
-# check if hwloc commands exist
-declare -i HAS_HWLOC=0
-type hwloc-bind >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-distrib >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-ls >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-calc >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-ps >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-
-#parse args
-declare -a UNKNOWN_ARGS=()
-declare -i DISTRIBUTE=1
-declare -i INDEX=0
-PROC_BIND="all"
-CURRENT_CPUSET=""
-OPENMP_VERSION=4.0
-OPENMP_PROC_BIND=True
-OPENMP_NESTED=True
-VERBOSE=False
-
-#get the current process cpuset
-if [[ ${HAS_HWLOC} -eq 0 ]]; then
-  MY_PID="$BASHPID"
-  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
-  echo "$CURRENT_CPUSET"
-fi
-
-function show_help {
-  local cmd=$(basename "$0")
-  echo "Usage: ${cmd} <options> -- command ..." 
-  echo "  Uses hwloc to divide the node into the given number of groups,"
-  echo "  set the appropriate OMP_NUM_THREADS and execute the command on the"
-  echo "  selected group."
-  echo ""
-  echo "  NOTE: This command assumes it has exclusive use of the node"
-  echo ""
-  echo "Options:"
-  echo "  --proc-bind=<LOC>     Set the initial process mask for the script.  "
-  echo "                        LOC can be any valid location argumnet for"
-  echo "                        hwloc-calc.  Defaults to the entire machine"
-  echo "  --distribute=N        Distribute the current proc-bind into N groups" 
-  echo "  --index=I             Use the i'th group (zero based)" 
-  echo "  --openmp=M.m          Set env variables for the given OpenMP version"
-  echo "                        (default 4.0)"
-  echo "  --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"    
-  echo "  --no-openmp-nested    Set OMP_NESTED to false"
-  echo "  -v|--verbose" 
-  echo "  -h|--help" 
-  echo ""
-  echo "Sample Usage:"
-  echo "  ${cmd} --distribute=4 --index=2 -v -- command ..."
-  echo ""
-}
-
-if [[ "$#" -eq 0 ]]; then
-  show_help 
-  exit 0
-fi
-
-
-for i in $@; do
-  case $i in
-    # number of partitions to create
-    --proc-bind=*)
-      PROC_BIND="${i#*=}"
-      shift
-      ;;
-    --distribute=*)
-      DISTRIBUTE="${i#*=}"
-      shift
-      ;;
-    # which group to use
-    --index=*)
-      INDEX="${i#*=}"
-      shift
-      ;;
-    --openmp=*)
-      OPENMP_VERSION="${i#*=}"
-      shift
-      ;;
-    --no-openmp-proc-bind)
-      OPENMP_PROC_BIND=False
-      shift
-      ;;
-    --no-openmp-nested)
-      OPENMP_NESTED=False
-      shift
-      ;;
-    -v|--verbose)
-      VERBOSE=True
-      shift
-      ;;
-    -h|--help)
-      show_help
-      exit 0
-      ;;
-    # ignore remaining arguments
-    --)
-      shift
-      break
-      ;;
-    # unknown option
-    *)
-      UNKNOWN_ARGS+=("$i")
-      shift
-      ;;
-  esac
-done
-
-if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
-  echo "Uknown options: ${UNKNOWN_ARGS[*]}"
-  exit 1
-fi
-
-if [[ ${DISTRIBUTE} -le 0 ]]; then
-  echo "Invalid input for distribute, changing distribute to 1"
-  DISTRIBUTE=1
-fi
-
-if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
-  echo "Invalid input for index, changing index to 0"
-  INDEX=0
-fi
-
-if [[ ${HAS_HWLOC} -ne 0 ]]; then
-  echo "hwloc not found, no process binding will occur"
-  DISTRIBUTE=1
-  INDEX=0
-fi
-
-if [[ ${HAS_HWLOC} -eq 0 ]]; then
-
-  if [[ "${CURRENT_CPUSET}" == "" ]]; then
-    BINDING=$(hwloc-calc ${PROC_BIND})
-  else 
-    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
-  fi
-
-  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
-  CPUSET=${CPUSETS[${INDEX}]}
-  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
-
-  if [[ "${VERBOSE}" == "True" ]]; then
-    echo "hwloc:         true"
-    echo "  proc_bind:     ${PROC_BIND}"
-    echo "  distribute:    ${DISTRIBUTE}"
-    echo "  index:         ${INDEX}"
-    echo "  parent_cpuset: ${CURRENT_CPUSET}"
-    echo "  cpuset:        ${CPUSET}"
-    echo "omp_num_threads: ${NUM_THREADS}"
-    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
-    echo "omp_nested:      ${OPENMP_NESTED}"
-    echo "OpenMP:          ${OPENMP_VERSION}"
-  fi
-
-  # set OMP env
-  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
-    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
-      export OMP_PLACES="threads"
-      export OMP_PROC_BIND="spread"
-    else
-      export OMP_PROC_BIND="true"
-      unset OMP_PLACES
-    fi
-  else
-    unset OMP_PLACES
-    unset OMP_PROC_BIND
-  fi
-  if [[ "${OPENMP_NESTED}" == "True" ]]; then
-    export OMP_NESTED="true"
-  else
-    export OMP_NESTED="false"
-  fi
-  export OMP_NUM_THREADS="${NUM_THREADS}"
-
-  hwloc-bind ${CPUSET} -- $@
-else
-  NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
-
-  if [[ "${VERBOSE}" == "True" ]]; then
-    echo "hwloc:           false"
-    echo "omp_num_threads: ${NUM_THREADS}"
-    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
-    echo "omp_nested:      ${OPENMP_NESTED}"
-    echo "OpenMP:          ${OPENMP_VERSION}"
-  fi
-    
-  # set OMP env
-  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
-    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
-      export OMP_PLACES="threads"
-      export OMP_PROC_BIND="spread"
-    else
-      export OMP_PROC_BIND="true"
-      unset OMP_PLACES
-    fi
-  else
-    unset OMP_PLACES
-    unset OMP_PROC_BIND
-  fi
-  if [[ "${OPENMP_NESTED}" == "True" ]]; then
-    export OMP_NESTED="true"
-  else
-    export OMP_NESTED="false"
-  fi
-  export OMP_NUM_THREADS="${NUM_THREADS}"
-
-  eval $@
-fi
-
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
 # Check if we have an optimization argument already
 optimization_applied=0

+# Check if we have -std=c++X  or --std=c++X already
+stdcxx_applied=0
+
 #echo "Arguments: $# $@"

 while [ $# -gt 0 ]
@ -130,10 +133,16 @@ do
    cuda_args="$cuda_args $1 $2"
    shift
    ;;
-  #Handle c++11 setting
-  --std=c++11|-std=c++11)
-    shared_args="$shared_args $1"
+  #Handle c++11
+  --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
+    if [ $stdcxx_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
+    else
+       shared_args="$shared_args $1"
+       stdcxx_applied=1
+    fi
    ;;
+
  #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
  -std=c++98|--std=c++98)
    ;;
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@ -9,3 +9,4 @@ tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641
 tag:  2.03.05    date: 05:27:2017    master: 36b92f43    develop: 79073186
 tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
 tag:  2.04.00    date: 08:16:2017    master: 54eb75c0    develop: 32fb8ee1
+tag:  2.04.04    date: 09:11:2017    master: 2b7e9c20    develop: 51e7b25a
--- a/lib/kokkos/config/trilinos-integration/checkin-test
+++ b/lib/kokkos/config/trilinos-integration/checkin-test
@ -1,4 +1,4 @@
 module purge
-module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
+module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu

 #Run Trilinos CheckinTest
--- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@ -125,6 +125,123 @@ namespace Impl {
  };
 }

+/// \class GraphRowViewConst
+/// \brief View of a row of a sparse graph.
+/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
+///
+/// This class provides a generic view of a row of a sparse graph.
+/// We intended this class to view a row of a StaticCrsGraph, but
+/// GraphType need not necessarily be CrsMatrix.
+///
+/// The row view is suited for computational kernels like sparse
+/// matrix-vector multiply, as well as for modifying entries in the
+/// sparse matrix.  The view is always const as it does not allow graph modification.
+///
+/// Here is an example loop over the entries in the row:
+/// \code
+/// typedef typename GraphRowViewConst<MatrixType>::ordinal_type ordinal_type;
+///
+/// GraphRowView<GraphType> G_i = ...;
+/// const ordinal_type numEntries = G_i.length;
+/// for (ordinal_type k = 0; k < numEntries; ++k) {
+///   ordinal_type j = G_i.colidx (k);
+///   // ... do something with A_ij and j ...
+/// }
+/// \endcode
+///
+/// GraphType must provide the \c data_type
+/// typedefs. In addition, it must make sense to use GraphRowViewConst to
+/// view a row of GraphType. In particular, column
+/// indices of a row must be accessible using the <tt>entries</tt>
+/// resp. <tt>colidx</tt> arrays given to the constructor of this
+/// class, with a constant <tt>stride</tt> between successive entries.
+/// The stride is one for the compressed sparse row storage format (as
+/// is used by CrsMatrix), but may be greater than one for other
+/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
+template<class GraphType>
+struct GraphRowViewConst {
+  //! The type of the column indices in the row.
+  typedef const typename GraphType::data_type ordinal_type;
+
+private:
+  //! Array of (local) column indices in the row.
+  ordinal_type* colidx_;
+  /// \brief Stride between successive entries in the row.
+  ///
+  /// For compressed sparse row (CSR) storage, this is always one.
+  /// This might be greater than one for storage formats like ELLPACK
+  /// or Jagged Diagonal.  Nevertheless, the stride can never be
+  /// greater than the number of rows or columns in the matrix.  Thus,
+  /// \c ordinal_type is the correct type.
+  const ordinal_type stride_;
+
+public:
+  /// \brief Constructor
+  ///
+  /// \param values [in] Array of the row's values.
+  /// \param colidx [in] Array of the row's column indices.
+  /// \param stride [in] (Constant) stride between matrix entries in
+  ///   each of the above arrays.
+  /// \param count [in] Number of entries in the row.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( ordinal_type* const colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count) :
+    colidx_ (colidx_in), stride_ (stride), length (count)
+  {}
+
+  /// \brief Constructor with offset into \c colidx array
+  ///
+  /// \param colidx [in] Array of the row's column indices.
+  /// \param stride [in] (Constant) stride between matrix entries in
+  ///   each of the above arrays.
+  /// \param count [in] Number of entries in the row.
+  /// \param idx [in] Start offset into \c colidx array
+  ///
+  /// \tparam OffsetType The type of \c idx (see above).  Must be a
+  ///   built-in integer type.  This may differ from ordinal_type.
+  ///   For example, the matrix may have dimensions that fit in int,
+  ///   but a number of entries that does not fit in int.
+  template<class OffsetType>
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count,
+                      const OffsetType& idx,
+                      const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
+    colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
+  {}
+
+  /// \brief Number of entries in the row.
+  ///
+  /// This is a public const field rather than a public const method,
+  /// in order to avoid possible overhead of a method call if the
+  /// compiler is unable to inline that method call.
+  ///
+  /// We assume that rows contain no duplicate entries (i.e., entries
+  /// with the same column index).  Thus, a row may have up to
+  /// A.numCols() entries.  This means that the correct type of
+  /// 'length' is ordinal_type.
+  const ordinal_type length;
+
+  /// \brief (Const) reference to the column index of entry i in this
+  ///   row of the sparse matrix.
+  ///
+  /// "Entry i" is not necessarily the entry with column index i, nor
+  /// does i necessarily correspond to the (local) row index.
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& colidx (const ordinal_type& i) const {
+    return colidx_[i*stride_];
+  }
+
+  /// \brief An alias for colidx
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& operator()(const ordinal_type& i) const {
+    return colidx(i);
+  }
+};
+
+
 /// \class StaticCrsGraph
 /// \brief Compressed row storage array.
 ///
@ -218,6 +335,38 @@ public:
      static_cast<size_type> (0);
  }

+  /// \brief Return a const view of row i of the graph.
+  ///
+  /// If row i does not belong to the graph, return an empty view.
+  ///
+  /// The returned object \c view implements the following interface:
+  /// <ul>
+  /// <li> \c view.length is the number of entries in the row </li>
+  /// <li> \c view.colidx(k) returns a const reference to the
+  ///      column index of the k-th entry in the row </li>
+  /// </ul>
+  /// k is not a column index; it just counts from 0 to
+  /// <tt>view.length - 1</tt>.
+  ///
+  /// Users should not rely on the return type of this method.  They
+  /// should instead assign to 'auto'.  That allows compile-time
+  /// polymorphism for different kinds of sparse matrix formats (e.g.,
+  /// ELLPACK or Jagged Diagonal) that we may wish to support in the
+  /// future.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
+    const size_type start = row_map(i);
+    // count is guaranteed to fit in ordinal_type, as long as no row
+    // has duplicate entries.
+    const data_type count = static_cast<data_type> (row_map(i+1) - start);
+
+    if (count == 0) {
+      return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
+    } else {
+      return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
+    }
+  }
+
  /**  \brief  Create a row partitioning into a given number of blocks
   *           balancing non-zeros + a fixed cost per row.
   */
--- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
    // LL
    if (RP::inner_direction == RP::Left) {
      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
              m_func(offset_0 , offset_1);
            }
@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
    // LR
    else {
      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
              m_func(offset_0 , offset_1);
            }
@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
    if (RP::inner_direction == RP::Left) {
      // Loop over size maxnumblocks until full range covered
      for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
        if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

          for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
            if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
              m_func(Tag(), offset_0 , offset_1);
            }
@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
    }
    else {
      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
              m_func(Tag(), offset_0 , offset_1);
            }
@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
    // LL
    if (RP::inner_direction == RP::Left) {
      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {

          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
                  m_func(offset_0 , offset_1 , offset_2);
                }
@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
    // LR
    else {
      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
                  m_func(offset_0 , offset_1 , offset_2);
                }
@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
  {
    if (RP::inner_direction == RP::Left) {
      for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
        if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {

          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

              for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
                if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
                  m_func(Tag(), offset_0 , offset_1 , offset_2);
                }
@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
    }
    else {
      for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {

          for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {

              for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
                  m_func(Tag(), offset_0 , offset_1 , offset_2);
                }
@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];

      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {

          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

              for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                  for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
                    }
@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];

      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                      m_func(offset_0 , offset_1 , offset_2 , offset_3);
                    }
@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
      const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];

      for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
        if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {

          for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
            if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

              for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                  for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                    if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                      m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
                    }
@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
      const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];

      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

              for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {

                  for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                    if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                      m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
                    }
@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];

      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {

          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                          m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                        }
@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];

      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                          m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                        }
@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
      const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];

      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {

          for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
            if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                  for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                    if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                      for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                        }
@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
      const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];

      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                      for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                        if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                          m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                        }
@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];

      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {

          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                              m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                            }
@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];

      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                              m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                            }
@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
      const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];

      for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
        if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {

          for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
            if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

              for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                  for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                    if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                      for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                        if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

                          for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                            if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                            }
@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
      const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];

      for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
        if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {

          for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
            if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {

              for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {

                  for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                    if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {

                      for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                        if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {

                          for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                            if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                              m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                            }
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()

 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM)
 static void cuda_parallel_launch_constant_memory()
 {
  const DriverType & driver =
@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )

 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM)
 static void cuda_parallel_launch_local_memory( const DriverType driver )
 {
  driver();
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@ -242,45 +242,89 @@ public:
    re_ = v;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const complex<RealType>& src) {
+  complex<RealType>&
+  operator += (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ += src.re_;
    im_ += src.im_;
    return *this;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile complex<RealType>& src) volatile {
+  void
+  operator += (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ += src.re_;
    im_ += src.im_;
  }

  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const RealType& src) {
+  complex<RealType>&
+  operator += (const std::complex<RealType>& src) {
+    re_ += src.real();
+    im_ += src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ += src;
    return *this;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile RealType& src) volatile {
+  void
+  operator += (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ += src;
  }
-
+  
+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator -= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ -= src.re_;
    im_ -= src.im_;
    return *this;
  }

  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const RealType& src) {
+  complex<RealType>&
+  operator -= (const std::complex<RealType>& src) {
+    re_ -= src.real();
+    im_ -= src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ -= src;
    return *this;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator *= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    const RealType realPart = re_ * src.re_ - im_ * src.im_;
    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
    re_ = realPart;
@ -288,8 +332,12 @@ public:
    return *this;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile complex<RealType>& src) volatile {
+  void
+  operator *= (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    const RealType realPart = re_ * src.re_ - im_ * src.im_;
    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
    re_ = realPart;
@ -297,20 +345,70 @@ public:
  }

  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const RealType& src) {
+  complex<RealType>&
+  operator *= (const std::complex<RealType>& src) {
+    const RealType realPart = re_ * src.real() - im_ * src.imag();
+    const RealType imagPart = re_ * src.imag() + im_ * src.real();
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ *= src;
    im_ *= src;
    return *this;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile RealType& src) volatile {
+  void
+  operator *= (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
    re_ *= src;
    im_ *= src;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const complex<RealType>& y) {
+  complex<RealType>&
+  operator /= (const complex<InputRealType>& y) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+    // If the real part is +/-Inf and the imaginary part is -/+Inf,
+    // this won't change the result.
+    const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
+
+    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+    // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+    // because y/s is NaN.
+    if (s == 0.0) {
+      this->re_ /= s;
+      this->im_ /= s;
+    }
+    else {
+      const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
+      const complex<RealType> y_conj_scaled (y.re_ / s, -(y.im_) / s);
+      const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
+        y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
+      *this = x_scaled * y_conj_scaled;
+      *this /= y_scaled_abs;
+    }
+    return *this;
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const std::complex<RealType>& y) {
+
    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
    // If the real part is +/-Inf and the imaginary part is -/+Inf,
    // this won't change the result.
@ -334,57 +432,95 @@ public:
    return *this;
  }

+
+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const RealType& src) {
+  complex<RealType>&
+  operator /= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
    re_ /= src;
    im_ /= src;
    return *this;
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  bool operator == (const complex<RealType>& src) {
-    return (re_ == src.re_) && (im_ == src.im_);
+  bool
+  operator == (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
  }

  KOKKOS_INLINE_FUNCTION
-  bool operator == (const RealType src) {
-    return (re_ == src) && (im_ == RealType(0));
+  bool
+  operator == (const std::complex<RealType>& src) {
+    return (re_ == src.real()) && (im_ == src.imag());
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
  }

  KOKKOS_INLINE_FUNCTION
-  bool operator != (const complex<RealType>& src) {
-    return (re_ != src.re_) || (im_ != src.im_);
+  bool
+  operator != (const std::complex<RealType>& src) {
+    return (re_ != src.real()) || (im_ != src.imag());
  }

+  template<typename InputRealType>
  KOKKOS_INLINE_FUNCTION
-  bool operator != (const RealType src) {
-    return (re_ != src) || (im_ != RealType(0));
-  }
+  bool
+  operator != (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");

+    return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
+  }
+  
 };

 //! Binary + operator for complex complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
 }

 //! Binary + operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () + y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
 }

 //! Binary + operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x + y.real (), y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
 }

 //! Unary + operator for complex.
@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
 }

 //! Binary - operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
 }

 //! Binary - operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () - y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
 }

 //! Binary - operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x - y.real (), - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
 }

 //! Unary - operator for complex.
@ -428,12 +564,12 @@ operator - (const complex<RealType>& x) {
 }

 //! Binary * operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }

 /// \brief Binary * operator for std::complex and complex.
@ -446,33 +582,34 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
 /// This function cannot be called in a CUDA device function, because
 /// std::complex's methods and nonmember functions are not marked as
 /// CUDA device functions.
-template<class RealType>
-complex<RealType>
-operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+template<class RealType1, class RealType2>
+inline
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }

 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }

 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& y, const RealType& x) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& y, const RealType2& x) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }

 //! Imaginary part of a complex number.
@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const complex<RealType1>& x, const RealType2& y) {
-  return complex<RealType1> (real (x) / y, imag (x) / y);
+  return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
 }

 //! Binary operator / for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator / (const complex<RealType>& x, const complex<RealType>& y) {
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
  // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
  // If the real part is +/-Inf and the imaginary part is -/+Inf,
  // this won't change the result.
-  const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));

  // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
  // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
  // because y/s is NaN.
  if (s == 0.0) {
-    return complex<RealType> (real (x) / s, imag (x) / s);
+    return complex<common_real_type> (real (x) / s, imag (x) / s);
  }
  else {
-    const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
-    const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
-    const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
+    const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
+    const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
+    const RealType1 y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
      imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
-    complex<RealType> result = x_scaled * y_conj_scaled;
+    complex<common_real_type> result = x_scaled * y_conj_scaled;
    result /= y_scaled_abs;
    return result;
  }
@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const RealType1& x, const complex<RealType2>& y) {
-  return complex<RealType1> (x)/y;
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
 }

 //! Equality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) == real (y) && imag (x) == imag (y);
+bool
+operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
 }

 /// \brief Equality operator for std::complex and Kokkos::complex.
@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
 /// Otherwise, CUDA builds will give compiler warnings ("warning:
 /// calling a constexpr __host__ function("real") from a __host__
 /// __device__ function("operator==") is not allowed").
-template<class RealType>
-bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) == real (y) && std::imag (x) == imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
 }
-
+  
 //! Equality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
+bool
+operator == (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
 }

 //! Equality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const RealType& x, const complex<RealType>& y) {
+bool
+operator == (const RealType1& x, const complex<RealType2>& y) {
  return y == x;
 }

 //! Inequality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) != real (y) || imag (x) != imag (y);
+bool
+operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
 }

 //! Inequality operator for std::complex and Kokkos::complex.
-template<class RealType>
-KOKKOS_INLINE_FUNCTION
-bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) != real (y) || std::imag (x) != imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
 }

 //! Inequality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
+bool
+operator != (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
 }

 //! Inequality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const RealType& x, const complex<RealType>& y) {
+bool
+operator != (const RealType1& x, const complex<RealType2>& y) {
  return y != x;
 }

--- a/lib/kokkos/core/src/Kokkos_Crs.hpp
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@ -353,7 +353,14 @@ struct CountAndFill {
  struct Fill {};
  KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
    auto j = m_crs.row_map(i);
-    data_type* fill = &(m_crs.entries(j));
+    /* we don't want to access entries(entries.size()), even if its just to get its
+       address and never use it.
+       this can happen when row (i) is empty and all rows after it are also empty.
+       we could compare to row_map(i + 1), but that is a read from global memory,
+       whereas dimension_0() should be part of the View in registers (or constant memory) */
+    data_type* fill =
+      (j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
+      nullptr : (&(m_crs.entries(j)));
    m_functor(i, fill);
  }
  using self_type = CountAndFill<CrsType, Functor>;
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@ -147,12 +147,11 @@ public:
                 , const size_t arg_alloc_size ) const;

  /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return "HBW"; }

 private:

  AllocationMechanism  m_alloc_mech;
-  static constexpr const char* m_name = "HBW";
  friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };

--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@ -192,7 +192,7 @@ template<>
 struct reduction_identity<float> {
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  {return static_cast<float>(0.0f);}
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return FLT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return -FLT_MAX;}
  KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  {return FLT_MAX;}
 };

@ -200,7 +200,7 @@ template<>
 struct reduction_identity<double> {
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  {return static_cast<double>(0.0);}
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return DBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return -DBL_MAX;}
  KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  {return DBL_MAX;}
 };

@ -208,7 +208,7 @@ template<>
 struct reduction_identity<long double> {
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  {return static_cast<long double>(0.0);}
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return LDBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return -LDBL_MAX;}
  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  {return LDBL_MAX;}
 };

--- a/lib/kokkos/core/src/Kokkos_ROCm.hpp
+++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp
@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
 } // namespace Kokkos


+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x  (hc_get_group_id(0))
+#define blockIdx_y  (hc_get_group_id(1))
+#define blockIdx_z  (hc_get_group_id(2))
+
+#define blockDim_x  (hc_get_group_size(0))
+#define blockDim_y  (hc_get_group_size(1))
+#define blockDim_z  (hc_get_group_size(2))
+
+#define gridDim_x   (hc_get_num_groups(0))
+#define gridDim_y   (hc_get_num_groups(1))
+#define gridDim_z   (hc_get_num_groups(2))
+
+
 #include <ROCm/Kokkos_ROCm_Parallel.hpp>
 #include <ROCm/Kokkos_ROCm_Task.hpp>

--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@ -88,6 +88,7 @@ build-makefile-kokkos:
 	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
 	echo "#Variables used in application Makefiles" >> Makefile.kokkos
+	echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
 	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
 	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
 	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
                                                 , thread_local_bytes
                                                 );

+      omp_set_num_threads(partition_size);
      f( omp_get_thread_num(), omp_get_num_threads() );

      Impl::t_openmp_instance->~Exec();
--- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
@ -113,7 +113,6 @@ void reduce_enqueue(

  if (output_length < 1) return;

-  assert(output_result != nullptr);
  const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);

  // allocate host and device memory for the results from each team
@ -176,14 +175,17 @@ void reduce_enqueue(
      }
      
  });
-  ValueInit::init(ReducerConditional::select(f, reducer), output_result);
+  if (output_result != nullptr)
+     ValueInit::init(ReducerConditional::select(f, reducer), output_result);
  fut.wait();

  copy(result,result_cpu.data());
-  for(std::size_t i=0;i<td.num_tiles;i++)
-    ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
+  if (output_result != nullptr) {
+    for(std::size_t i=0;i<td.num_tiles;i++)
+       ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);

-  ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+    ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+  }

 }

--- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
@ -67,7 +67,7 @@ void scan_enqueue(
    hc::array<value_type> result(td.num_tiles);
    hc::array<value_type> scratch(len);

-    tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
+    tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
    {
        const auto local = t_idx.local[0];
        const auto global = t_idx.global[0];
@ -135,7 +135,7 @@ void scan_enqueue(
      ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);

    copy(result_cpu.data(),result);
-    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] 
+    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] 
    {
 //        const auto local = t_idx.local[0];
        const auto global = t_idx.global[0];
--- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
  return full != i ? _bit_scan_forward( ~i ) : -1 ;
 #elif defined( KOKKOS_COMPILER_IBM )
  return full != i ? __cnttz4( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
  return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
 #else
@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
  return _bit_scan_forward(i);
 #elif defined( KOKKOS_COMPILER_IBM )
  return __cnttz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? _popcnt(~i & (i-1)) : -1;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
  return __builtin_ffs(i) - 1;
 #else
-  unsigned t = 1u;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t << 1;
-    ++r;
+  int offset = -1;
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
  }
-  return r;
+  return offset;
 #endif
 }

@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
  return _bit_scan_reverse(i);
 #elif defined( KOKKOS_COMPILER_IBM )
  return shift - __cntlz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? shift - _leadz32(i) : 0 ;
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
  return shift - __builtin_clz(i);
 #else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t >> 1;
-    ++r;
+  int offset = 0;
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
  }
-  return r;
+  return offset;
 #endif
 }

@ -142,6 +142,8 @@ int bit_count( unsigned i )
  return _popcnt32(i);
 #elif defined( KOKKOS_COMPILER_IBM )
  return __popcnt4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return _popcnt(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
  return __builtin_popcount(i);
 #else
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
  }
 }

-constexpr const char* HBWSpace::name() {
-  return m_name;
-}
-
 } // namespace Experimental
 } // namespace Kokkos

--- a/lib/kokkos/core/unit_test/TestComplex.hpp
+++ b/lib/kokkos/core/unit_test/TestComplex.hpp
@ -114,7 +114,7 @@ struct TestComplexBasicMath {
  typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;

  void testit () {
-    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
    h_results = Kokkos::create_mirror_view(d_results);

    Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@ -125,6 +125,7 @@ struct TestComplexBasicMath {
    std::complex<double> b(3.25,5.75);
    std::complex<double> d(1.0,2.0);
    double c = 9.3;
+    int e = 2;

    std::complex<double> r;
    r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(),  r.imag());
@ -147,6 +148,12 @@ struct TestComplexBasicMath {
    r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
    r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
    r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a; 
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(),  r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(),  r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(),  r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(),  r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(),  r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(),  r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(),  r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(),  r.imag()/e);
  }

  KOKKOS_INLINE_FUNCTION
@ -190,6 +197,12 @@ struct TestComplexBasicMath {
    d_results(17) = c-a;
    d_results(18) = c*a;
    d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = a-e;
+    d_results(22) = a*e;
+    d_results(23) = a/e;
  }
 };

--- a/lib/kokkos/core/unit_test/TestMDRange.hpp
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@ -286,7 +286,9 @@ struct TestMDRange_2D {
    // Test with reducers - scalar
    {
      typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
-      range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
+      int s0 = 1;
+      int s1 = 1;
+      range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );

      TestMDRange_2D functor( N0, N1 );

@ -297,7 +299,7 @@ struct TestMDRange_2D {

      parallel_reduce( range, functor, reducer_scalar );

-      ASSERT_EQ( sum, 2 * N0 * N1 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
    }
    // Test with reducers - scalar view
    {
@ -445,7 +447,9 @@ struct TestMDRange_2D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      const int s0 = 1;
+      const int s1 = 1;
+      range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
      TestMDRange_2D functor( N0, N1 );

      parallel_for( range, functor );
@ -454,8 +458,8 @@ struct TestMDRange_2D {
      Kokkos::deep_copy( h_view, functor.input_view );

      int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
      {
        if ( h_view( i, j ) != 3 ) {
          ++counter;
@ -463,7 +467,7 @@ struct TestMDRange_2D {
      }

      if ( counter != 0 ) {
-        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
      }

      ASSERT_EQ( counter, 0 );
@ -699,6 +703,7 @@ struct TestMDRange_2D {

      ASSERT_EQ( counter, 0 );
    }
+
  } // end test_for2
 }; // MDRange_2D

@ -749,7 +754,10 @@ struct TestMDRange_3D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );

      TestMDRange_3D functor( N0, N1, N2 );

@ -757,7 +765,7 @@ struct TestMDRange_3D {
      double sum = 0.0;
      parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
    }

    // Test with reducers - scalar
@ -952,7 +960,10 @@ struct TestMDRange_3D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
      TestMDRange_3D functor( N0, N1, N2 );

      parallel_for( range, functor );
@ -961,9 +972,9 @@ struct TestMDRange_3D {
      Kokkos::deep_copy( h_view, functor.input_view );

      int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
      {
        if ( h_view( i, j, k ) != 3 ) {
          ++counter;
@ -971,7 +982,7 @@ struct TestMDRange_3D {
      }

      if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
      }

      ASSERT_EQ( counter, 0 );
@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );

      TestMDRange_4D functor( N0, N1, N2, N3 );

@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
      double sum = 0.0;
      parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
    }

    // Test with reducers - scalar
@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
      TestMDRange_4D functor( N0, N1, N2, N3 );

      parallel_for( range, functor );
@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
      Kokkos::deep_copy( h_view, functor.input_view );

      int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
      {
        if ( h_view( i, j, k, l ) != 3 ) {
          ++counter;
@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
      }

      if ( counter != 0 ) {
-        printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+        printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
      }

      ASSERT_EQ( counter, 0 );
@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );

      TestMDRange_5D functor( N0, N1, N2, N3, N4 );

@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
      double sum = 0.0;
      parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
    }

    // Test with reducers - scalar
@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
      TestMDRange_5D functor( N0, N1, N2, N3, N4 );

      parallel_for( range, functor );
@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
      Kokkos::deep_copy( h_view, functor.input_view );

      int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
      {
        if ( h_view( i, j, k, l, m ) != 3 ) {
          ++counter;
@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
      }

      if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
      }

      ASSERT_EQ( counter, 0 );
@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );

      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );

@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
      double sum = 0.0;
      parallel_reduce( range, functor, sum );

-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
    }

    // Test with reducers - scalar
@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
      typedef typename range_type::tile_type tile_type;
      typedef typename range_type::point_type point_type;

-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
      TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );

      parallel_for( range, functor );
@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
      Kokkos::deep_copy( h_view, functor.input_view );

      int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
-      for ( int n = 0; n < N5; ++n )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      for ( int n = s5; n < N5; ++n )
      {
        if ( h_view( i, j, k, l, m, n ) != 3 ) {
          ++counter;
@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
      }

      if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
      }

      ASSERT_EQ( counter, 0 );
--- a/lib/latte/Install.py
+++ b/lib/latte/Install.py
@ -159,13 +159,13 @@ if buildflag or pathflag:
    os.remove("includelink")
  if os.path.isfile("liblink") or os.path.islink("liblink"):
    os.remove("liblink")
-  if os.path.isfile("filelink") or os.path.islink("filelink"):
-    os.remove("filelink")
+  if os.path.isfile("filelink.o") or os.path.islink("filelink.o"):
+    os.remove("filelink.o")
  cmd = 'ln -s "%s/src" includelink' % lattedir
  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
  cmd = 'ln -s "%s" liblink' % lattedir
  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
-  cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir
+  cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir
  subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)

 # copy Makefile.lammps.suffix to Makefile.lammps
--- a/lib/latte/Makefile.lammps.gfortran
+++ b/lib/latte/Makefile.lammps.gfortran
@ -3,5 +3,5 @@
 # GNU Fortran settings

 latte_SYSINC = 
-latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas
+latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas
 latte_SYSPATH = -fopenmp
--- a/lib/latte/Makefile.lammps.ifort
+++ b/lib/latte/Makefile.lammps.ifort
@ -3,7 +3,7 @@
 # Intel ifort settings

 latte_SYSINC = 
-latte_SYSLIB = ../../lib/latte/filelink \
+latte_SYSLIB = ../../lib/latte/filelink.o \
               -llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \
               -lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \
               -openmp -O0
--- a/src/.gitignore
+++ b/src/.gitignore
@ -414,6 +414,8 @@
 /fix_lambdah_calc.h
 /fix_langevin_eff.cpp
 /fix_langevin_eff.h
+/fix_latte.cpp
+/fix_latte.h
 /fix_lb_fluid.cpp
 /fix_lb_fluid.h
 /fix_lb_momentum.cpp
--- a/src/Depend.sh
+++ b/src/Depend.sh
@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then
  depend KOKKOS
 fi

+if (test $1 = "USER-DRUDE") then
+  depend USER-OMP
+fi
+
 if (test $1 = "USER-FEP") then
  depend USER-OMP
 fi
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag)

 /* ---------------------------------------------------------------------- */

-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecAtomicKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecAtomicKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecAtomicKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecAtomicKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecAtomicKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecAtomicKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0) {
-    sync(Host,F_MASK);
-    modified(Host,F_MASK);
-  }
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecAtomicKokkos_PackBorder {
  typedef DeviceType device_type;
--- a/src/KOKKOS/atom_vec_atomic_kokkos.h
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.h
@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
  virtual ~AtomVecAtomicKokkos() {}
  void grow(int);
  void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
  int pack_border(int, int *, double *, int, int *);
  int pack_border_vel(int, int *, double *, int, int *);
  void unpack_border(int, int, double *);
@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
  bigint memory_usage();

  void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                         DAT::tdual_xfloat_2d buf,int iswap,
                         int pbc_flag, int *pbc, ExecutionSpace space);
@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
  DAT::t_x_array d_x;
  DAT::t_v_array d_v;
  DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;

  DAT::tdual_int_1d k_count;
 };
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag)

 /* ---------------------------------------------------------------------- */

-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecBondKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecBondKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
-                                        const DAT::tdual_int_2d &list,
-                                        const int & iswap,
-                                        const DAT::tdual_xfloat_2d &buf,
-                                        const int &pbc_flag,
-                                        const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecBondKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecBondKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecBondKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecBondKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                     int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecBondKokkos_PackBorder {
  typedef DeviceType device_type;
--- a/src/KOKKOS/atom_vec_bond_kokkos.h
+++ b/src/KOKKOS/atom_vec_bond_kokkos.h
@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
  virtual ~AtomVecBondKokkos() {}
  void grow(int);
  void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
  int pack_border(int, int *, double *, int, int *);
  int pack_border_vel(int, int *, double *, int, int *);
  int pack_border_hybrid(int, int *, double *);
@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
  bigint memory_usage();

  void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                         DAT::tdual_xfloat_2d buf,int iswap,
                         int pbc_flag, int *pbc, ExecutionSpace space);
@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
  DAT::t_x_array d_x;
  DAT::t_v_array d_v;
  DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;

  DAT::t_tagint_1d d_molecule;
  DAT::t_int_2d d_nspecial;
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm {

 /* ---------------------------------------------------------------------- */

-int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecChargeKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecChargeKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-                                        const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecChargeKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecChargeKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecChargeKokkos_PackBorder {
  typedef DeviceType device_type;
--- a/src/KOKKOS/atom_vec_charge_kokkos.h
+++ b/src/KOKKOS/atom_vec_charge_kokkos.h
@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
  virtual ~AtomVecChargeKokkos() {}
  void grow(int);
  void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
  int pack_border(int, int *, double *, int, int *);
  int pack_border_vel(int, int *, double *, int, int *);
  int pack_border_hybrid(int, int *, double *);
@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
  bigint memory_usage();

  void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                         DAT::tdual_xfloat_2d buf,int iswap,
                         int pbc_flag, int *pbc, ExecutionSpace space);
@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
  DAT::t_x_array d_x;
  DAT::t_v_array d_v;
  DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;

  DAT::t_float_1d d_q;

--- a/src/KOKKOS/atom_vec_dpd_kokkos.h
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos {
  DAT::t_x_array d_x;
  DAT::t_v_array d_v;
  DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;

  DAT::tdual_int_1d k_count;
 };
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag)

 /* ---------------------------------------------------------------------- */

-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecFullKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecFullKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()
-				*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
-                                             const DAT::tdual_int_2d &list,
-                                             const int & iswap,
-                                             const DAT::tdual_xfloat_2d &buf,
-                                             const int &pbc_flag,
-                                             const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,1,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,1,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,0,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,0,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecFullKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecFullKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-    _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),
-    _list(list.view<DeviceType>()),_iswap(iswap),
-    _xprd(xprd),_yprd(yprd),_zprd(zprd),
-    _xy(xy),_xz(xz),_yz(yz) {
-    _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-    _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                                           const int & iswap,
-                                           const int nfirst, const int &pbc_flag,
-                                           const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecFullKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecFullKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf,
-                                      int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                          int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecFullKokkos_PackBorder {
  typedef DeviceType device_type;
--- a/src/KOKKOS/atom_vec_full_kokkos.h
+++ b/src/KOKKOS/atom_vec_full_kokkos.h
@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
  virtual ~AtomVecFullKokkos() {}
  void grow(int);
  void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
  int pack_border(int, int *, double *, int, int *);
  int pack_border_vel(int, int *, double *, int, int *);
  int pack_border_hybrid(int, int *, double *);
@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
  bigint memory_usage();

  void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                         DAT::tdual_xfloat_2d buf,int iswap,
                         int pbc_flag, int *pbc, ExecutionSpace space);
@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
  DAT::t_x_array d_x;
  DAT::t_v_array d_v;
  DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;

  DAT::t_float_1d d_q;
  HAT::t_float_1d h_q;
--- a/src/KOKKOS/atom_vec_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_kokkos.cpp
@ -12,6 +12,10 @@
 ------------------------------------------------------------------------- */

 #include "atom_vec_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "atom_masks.h"

 using namespace LAMMPS_NS;

@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp)
  buffer_size = 0;
 }

+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecKokkos_PackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
+        const size_t elements = 3;
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_xfloat_2d &buf,
+                                          const int &pbc_flag,
+                                          const int* const pbc)
+{
+  // Check whether to always run forward communication on the host
+  // Choose correct forward PackComm kernel
+
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {
+    sync(Device,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+
+	return n*size_forward;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecKokkos_PackCommSelf {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array _xw;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _xw(i+_nfirst,0) = _x(j,0);
+          _xw(i+_nfirst,1) = _x(j,1);
+          _xw(i+_nfirst,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                        const int nfirst, const int &pbc_flag, const int* const pbc) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+	return n*3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnpackComm {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  int _first;
+
+  AtomVecKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first,
+    const DAT::tdual_xfloat_2d &buf ) {
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    struct AtomVecKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
+    Kokkos::parallel_for(n,f);
+  } else {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    struct AtomVecKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
+    Kokkos::parallel_for(n,f);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (atom->mask[i] & deform_groupbit) {
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+      }
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm(int n, int first, double *buf)
+{
+  int i,m,last;
+
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) {
+    h_x(i,0) = buf[m++];
+    h_x(i,1) = buf[m++];
+    h_x(i,2) = buf[m++];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  int i,m,last;
+
+  m = 0;
+  last = first + n;
+  for (i = first; i < last; i++) {
+    h_x(i,0) = buf[m++];
+    h_x(i,1) = buf[m++];
+    h_x(i,2) = buf[m++];
+    h_v(i,0) = buf[m++];
+    h_v(i,1) = buf[m++];
+    h_v(i,2) = buf[m++];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_PackReverse {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_f_array_randomread _f;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d _buf;
+  int _first;
+
+  AtomVecKokkos_PackReverse(
+      const typename DAT::tdual_f_array &f,
+      const typename DAT::tdual_ffloat_2d &buf,
+      const int& first):_f(f.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    _buf(i,0) = _f(i+_first,0);
+    _buf(i,1) = _f(i+_first,1);
+    _buf(i,2) = _f(i+_first,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first,
+    const DAT::tdual_ffloat_2d &buf ) {
+  if(commKK->reverse_comm_on_host) {
+    sync(Host,F_MASK);
+    struct AtomVecKokkos_PackReverse<LMPHostType> f(atomKK->k_f,buf,first);
+    Kokkos::parallel_for(n,f);
+  } else {
+    sync(Device,F_MASK);
+    struct AtomVecKokkos_PackReverse<LMPDeviceType> f(atomKK->k_f,buf,first);
+    Kokkos::parallel_for(n,f);
+  }
+
+  return n*size_reverse;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnPackReverseSelf {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_f_array_randomread _f;
+  typename ArrayTypes<DeviceType>::t_f_array _fw;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+
+  AtomVecKokkos_UnPackReverseSelf(
+      const typename DAT::tdual_f_array &f,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap):
+      _f(f.view<DeviceType>()),_fw(f.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap) {
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);
+    _fw(j,0) += _f(i+_nfirst,0);
+    _fw(j,1) += _f(i+_nfirst,1);
+    _fw(j,2) += _f(i+_nfirst,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                        const int nfirst) {
+  if(commKK->reverse_comm_on_host) {
+    sync(Host,F_MASK);
+    struct AtomVecKokkos_UnPackReverseSelf<LMPHostType> f(atomKK->k_f,nfirst,list,iswap);
+    Kokkos::parallel_for(n,f);
+    modified(Host,F_MASK);
+  } else {
+    sync(Device,F_MASK);
+    struct AtomVecKokkos_UnPackReverseSelf<LMPDeviceType> f(atomKK->k_f,nfirst,list,iswap);
+    Kokkos::parallel_for(n,f);
+    modified(Device,F_MASK);
+  }
+  return n*3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnPackReverse {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_f_array _f;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d_const _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+
+  AtomVecKokkos_UnPackReverse(
+      const typename DAT::tdual_f_array &f,
+      const typename DAT::tdual_ffloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap):
+      _f(f.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
+        const size_t elements = 3;
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);
+    _f(j,0) += _buf(i,0);
+    _f(j,1) += _buf(i,1);
+    _f(j,2) += _buf(i,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_reverse_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_ffloat_2d &buf)
+{
+  // Check whether to always run reverse communication on the host
+  // Choose correct reverse UnPackReverse kernel
+
+  if(commKK->reverse_comm_on_host) {
+    struct AtomVecKokkos_UnPackReverse<LMPHostType> f(atomKK->k_f,buf,list,iswap);
+    Kokkos::parallel_for(n,f);
+    modified(Host,F_MASK);
+  } else {
+    struct AtomVecKokkos_UnPackReverse<LMPDeviceType> f(atomKK->k_f,buf,list,iswap);
+    Kokkos::parallel_for(n,f);
+    modified(Device,F_MASK);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_reverse(int n, int first, double *buf)
+{
+  if(n > 0)
+    sync(Host,F_MASK);
+
+  int m = 0;
+  const int last = first + n;
+  for (int i = first; i < last; i++) {
+    buf[m++] = h_f(i,0);
+    buf[m++] = h_f(i,1);
+    buf[m++] = h_f(i,2);
+  }
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  int m = 0;
+  for (int i = 0; i < n; i++) {
+    const int j = list[i];
+    h_f(j,0) += buf[m++];
+    h_f(j,1) += buf[m++];
+    h_f(j,2) += buf[m++];
+  }
+
+  if(n > 0)
+    modified(Host,F_MASK);
+}
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec {
 public:
  AtomVecKokkos(class LAMMPS *);
  virtual ~AtomVecKokkos() {}
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
+  virtual int pack_reverse(int, int, double *);
+  virtual void unpack_reverse(int, int *, double *);

  virtual void sync(ExecutionSpace space, unsigned int mask) = 0;
  virtual void modified(ExecutionSpace space, unsigned int mask) = 0;
-  virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {};
+  virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0;

  virtual int
    pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
                   const int & iswap, const int nfirst,
-                   const int &pbc_flag, const int pbc[]) = 0;
-  //{return 0;}
+                   const int &pbc_flag, const int pbc[]);
+
  virtual int
    pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
                     const int & iswap, const DAT::tdual_xfloat_2d &buf,
-                     const int &pbc_flag, const int pbc[]) = 0;
-  //{return 0;}
+                     const int &pbc_flag, const int pbc[]);
+
  virtual void
    unpack_comm_kokkos(const int &n, const int &nfirst,
-                       const DAT::tdual_xfloat_2d &buf) = 0;
+                       const DAT::tdual_xfloat_2d &buf);
+
+  virtual int
+    unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list,
+                      const int & iswap, const int nfirst);
+
+  virtual int
+    pack_reverse_kokkos(const int &n, const int &nfirst,
+                        const DAT::tdual_ffloat_2d &buf);
+
+  virtual void
+    unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list,
+                          const int & iswap, const DAT::tdual_ffloat_2d &buf);
+
  virtual int
    pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                       DAT::tdual_xfloat_2d buf,int iswap,
                       int pbc_flag, int *pbc, ExecutionSpace space) = 0;
-  //{return 0;};
+
  virtual void
    unpack_border_kokkos(const int &n, const int &nfirst,
                         const DAT::tdual_xfloat_2d &buf,
@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec {
                         DAT::tdual_int_1d k_sendlist,
                         DAT::tdual_int_1d k_copylist,
                         ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0;
-  //{return 0;};
+
  virtual int
    unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
                           int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
                           ExecutionSpace space) = 0;
-  //{return 0;};
+

 protected:

+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
  class CommKokkos *commKK;
  size_t buffer_size;
  void* buffer;
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
  if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
  memory->sfree(sendlist);
  sendlist = NULL;
-  k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d();
+  k_sendlist = DAT::tdual_int_2d();
+  k_total_send = DAT::tdual_int_scalar("comm::k_total_send");

  // error check for disallow of OpenMP threads?

@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
  memory->destroy(buf_recv);
  buf_recv = NULL;

-  k_exchange_sendlist = ArrayTypes<LMPDeviceType>::
+  k_exchange_sendlist = DAT::
    tdual_int_1d("comm:k_exchange_sendlist",100);
-  k_exchange_copylist = ArrayTypes<LMPDeviceType>::
+  k_exchange_copylist = DAT::
    tdual_int_1d("comm:k_exchange_copylist",100);
-  k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1);
-  k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100);
+  k_count = DAT::tdual_int_scalar("comm:k_count");
+  k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);

  memory->destroy(maxsendlist);
  maxsendlist = NULL;
@ -102,8 +103,10 @@ void CommKokkos::init()
  atomKK = (AtomKokkos *) atom;
  exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
  forward_comm_classic = lmp->kokkos->forward_comm_classic;
+  reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
  exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
  forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
+  reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host;

  CommBrick::init();

@ -132,8 +135,11 @@ void CommKokkos::init()
  if (force->newton == 0) check_reverse = 0;
  if (force->pair) check_reverse += force->pair->comm_reverse_off;

-  if(check_reverse || check_forward)
+  if (ghost_velocity)
    forward_comm_classic = true;
+
+  if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
+    reverse_comm_classic = true;
 }

 /* ----------------------------------------------------------------------
@ -173,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy)
  int n;
  MPI_Request request;
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
-  double **x = atom->x;
  double *buf;

  // exchange data with another proc
@ -181,32 +186,29 @@ void CommKokkos::forward_comm_device(int dummy)
  // if comm_x_only set, exchange or copy directly to x, don't unpack

  k_sendlist.sync<DeviceType>();
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);

  for (int iswap = 0; iswap < nswap; iswap++) {
-
    if (sendproc[iswap] != me) {
      if (comm_x_only) {
-        atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
-        if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
-        else buf = NULL;
-
        if (size_forward_recv[iswap]) {
            buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
              firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
            MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
-                    recvproc[iswap],0,world,&request);
+                      recvproc[iswap],0,world,&request);
        }
        n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
                                   iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
-
        if (n) {
          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
                   n,MPI_DOUBLE,sendproc[iswap],0,world);
        }

-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
-                         space,X_MASK);
+        if (size_forward_recv[iswap]) {
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                           space,X_MASK);
+        }
      } else if (ghost_velocity) {
        error->all(FLERR,"Ghost velocity forward comm not yet "
                   "implemented with Kokkos");
@ -248,21 +250,93 @@ void CommKokkos::forward_comm_device(int dummy)
    }
  }
 }
+
+/* ----------------------------------------------------------------------
+   reverse communication of forces on atoms every timestep
+   other per-atom attributes may also be sent via pack/unpack routines
+------------------------------------------------------------------------- */
+
 void CommKokkos::reverse_comm()
 {
+  if (!reverse_comm_classic) {
+    if (reverse_comm_on_host) reverse_comm_device<LMPHostType>();
+    else reverse_comm_device<LMPDeviceType>();
+    return;
+  }
+
  k_sendlist.sync<LMPHostType>();
+
  if (comm_f_only)
    atomKK->sync(Host,F_MASK);
  else
    atomKK->sync(Host,ALL_MASK);
+
  CommBrick::reverse_comm();
+
  if (comm_f_only)
    atomKK->modified(Host,F_MASK);
  else
    atomKK->modified(Host,ALL_MASK);
-  atomKK->sync(Device,ALL_MASK);
+
+  //atomKK->sync(Device,ALL_MASK); // is this needed?
 }

+template<class DeviceType>
+void CommKokkos::reverse_comm_device()
+{
+  int n;
+  MPI_Request request;
+  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
+  double *buf;
+  
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_f_only set, exchange or copy directly from f, don't pack
+
+  k_sendlist.sync<DeviceType>();
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
+
+  for (int iswap = nswap-1; iswap >= 0; iswap--) {
+    if (sendproc[iswap] != me) {
+      if (comm_f_only) {
+        if (size_reverse_recv[iswap])
+            MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE,
+                    sendproc[iswap],0,world,&request);
+        if (size_reverse_send[iswap]) {
+          buf = atomKK->k_f.view<DeviceType>().ptr_on_device() +
+            firstrecv[iswap]*atomKK->k_f.view<DeviceType>().dimension_1();
+  
+          MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
+                   recvproc[iswap],0,world);
+        }
+        if (size_reverse_recv[iswap]) {
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                           space,F_MASK);
+        }
+      } else {
+        if (size_reverse_recv[iswap])
+          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
+                    size_reverse_recv[iswap],MPI_DOUBLE,
+                    sendproc[iswap],0,world,&request);
+        n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
+        if (n)
+          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
+                   MPI_DOUBLE,recvproc[iswap],0,world);
+        if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+      }
+      avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                k_buf_recv);
+    } else {
+      if (sendnum[iswap])
+        n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap,
+                                 firstrecv[iswap]);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 void CommKokkos::forward_comm_fix(Fix *fix, int size)
 {
  k_sendlist.sync<LMPHostType>();
@ -408,7 +482,7 @@ struct BuildExchangeListFunctor {
  typename AT::t_x_array _x;

  int _nlocal,_dim;
-  typename AT::t_int_1d _nsend;
+  typename AT::t_int_scalar _nsend;
  typename AT::t_int_1d _sendlist;
  typename AT::t_int_1d _sendflag;

@ -416,7 +490,7 @@ struct BuildExchangeListFunctor {
  BuildExchangeListFunctor(
      const typename AT::tdual_x_array x,
      const typename AT::tdual_int_1d sendlist,
-      typename AT::tdual_int_1d nsend,
+      typename AT::tdual_int_scalar nsend,
      typename AT::tdual_int_1d sendflag,int nlocal, int dim,
                X_FLOAT lo, X_FLOAT hi):
                _x(x.template view<DeviceType>()),
@ -430,7 +504,7 @@ struct BuildExchangeListFunctor {
  KOKKOS_INLINE_FUNCTION
  void operator() (int i) const {
    if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) {
-      const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1);
+      const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1);
      if(mysend<_sendlist.dimension_0()) {
        _sendlist(mysend) = i;
        _sendflag(i) = 1;
@ -489,9 +563,9 @@ void CommKokkos::exchange_device()
    if (true) {
      if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal);
      k_sendflag.sync<DeviceType>();
-      k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
-      while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
-        k_count.h_view(0) = 0;
+      k_count.h_view() = k_exchange_sendlist.h_view.dimension_0();
+      while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
+        k_count.h_view() = 0;
        k_count.modify<LMPHostType>();
        k_count.sync<DeviceType>();

@ -504,10 +578,10 @@ void CommKokkos::exchange_device()
        k_count.modify<DeviceType>();

        k_count.sync<LMPHostType>();
-        if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
-          k_exchange_sendlist.resize(k_count.h_view(0)*1.1);
-          k_exchange_copylist.resize(k_count.h_view(0)*1.1);
-          k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0();
+        if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
+          k_exchange_sendlist.resize(k_count.h_view()*1.1);
+          k_exchange_copylist.resize(k_count.h_view()*1.1);
+          k_count.h_view()=k_exchange_sendlist.h_view.dimension_0();
        }
      }
      k_exchange_copylist.sync<LMPHostType>();
@ -515,22 +589,22 @@ void CommKokkos::exchange_device()
      k_sendflag.sync<LMPHostType>();

      int sendpos = nlocal-1;
-      nlocal -= k_count.h_view(0);
-      for(int i = 0; i < k_count.h_view(0); i++) {
+      nlocal -= k_count.h_view();
+      for(int i = 0; i < k_count.h_view(); i++) {
        if (k_exchange_sendlist.h_view(i)<nlocal) {
          while (k_sendflag.h_view(sendpos)) sendpos--;
          k_exchange_copylist.h_view(i) = sendpos;
          sendpos--;
        } else
-        k_exchange_copylist.h_view(i) = -1;
+          k_exchange_copylist.h_view(i) = -1;
      }

      k_exchange_copylist.modify<LMPHostType>();
      k_exchange_copylist.sync<DeviceType>();
-      nsend = k_count.h_view(0);
+      nsend = k_count.h_view();
      if (nsend > maxsend) grow_send_kokkos(nsend,1);
      nsend =
-        avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send,
+        avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
                                   k_exchange_sendlist,k_exchange_copylist,
                                   ExecutionSpaceFromDevice<DeviceType>::
                                   space,dim,lo,hi);
@ -640,9 +714,7 @@ void CommKokkos::borders()
  }

  atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
  k_sendlist.sync<LMPHostType>();
-  k_sendlist.modify<LMPHostType>();
  CommBrick::borders();
  k_sendlist.modify<LMPHostType>();
  atomKK->modified(Host,ALL_MASK);
@ -659,11 +731,11 @@ struct BuildBorderListFunctor {
  int iswap,maxsendlist;
  int nfirst,nlast,dim;
  typename AT::t_int_2d sendlist;
-  typename AT::t_int_1d nsend;
+  typename AT::t_int_scalar nsend;

  BuildBorderListFunctor(typename AT::tdual_x_array _x,
                         typename AT::tdual_int_2d _sendlist,
-                         typename AT::tdual_int_1d _nsend,int _nfirst,
+                         typename AT::tdual_int_scalar _nsend,int _nfirst,
                         int _nlast, int _dim,
                         X_FLOAT _lo, X_FLOAT _hi, int _iswap,
                         int _maxsendlist):
@ -684,7 +756,7 @@ struct BuildBorderListFunctor {
    for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) {
      if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++;
    }
-    const int my_store_pos = dev.team_scan(mysend,&nsend(0));
+    const int my_store_pos = dev.team_scan(mysend,&nsend());

    if (my_store_pos+mysend < maxsendlist) {
    mysend = my_store_pos;
@ -713,7 +785,7 @@ void CommKokkos::borders_device() {
  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;

  ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  k_sendlist.modify<DeviceType>();
+  k_sendlist.sync<DeviceType>();
  atomKK->sync(exec_space,ALL_MASK);

  // do swaps over all 3 dimensions
@ -763,37 +835,38 @@ void CommKokkos::borders_device() {
      if (sendflag) {
        if (!bordergroup || ineed >= 2) {
          if (style == SINGLE) {
-            typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1);
-            total_send.h_view(0) = 0;
-            if(exec_space == Device) {
-              total_send.template modify<DeviceType>();
-              total_send.template sync<LMPDeviceType>();
-            }
+            k_total_send.h_view() = 0;
+            k_total_send.template modify<LMPHostType>();
+            k_total_send.template sync<LMPDeviceType>();

            BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
-                total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
+                k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
            Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
            Kokkos::parallel_for(config,f);

-            total_send.template modify<DeviceType>();
-            total_send.template sync<LMPHostType>();
+            k_total_send.template modify<DeviceType>();
+            k_total_send.template sync<LMPHostType>();
+
+            k_sendlist.modify<DeviceType>();
+
+            if(k_total_send.h_view() >= maxsendlist[iswap]) {
+              grow_list(iswap,k_total_send.h_view());
+
+              k_total_send.h_view() = 0;
+              k_total_send.template modify<LMPHostType>();
+              k_total_send.template sync<LMPDeviceType>();

-            if(total_send.h_view(0) >= maxsendlist[iswap]) {
-              grow_list(iswap,total_send.h_view(0));
-              k_sendlist.modify<DeviceType>();
-              total_send.h_view(0) = 0;
-              if(exec_space == Device) {
-                total_send.template modify<LMPHostType>();
-                total_send.template sync<LMPDeviceType>();
-              }
              BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
-                  total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
+                  k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
              Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
              Kokkos::parallel_for(config,f);
-              total_send.template modify<DeviceType>();
-              total_send.template sync<LMPHostType>();
+
+              k_total_send.template modify<DeviceType>();
+              k_total_send.template sync<LMPHostType>();
+
+              k_sendlist.modify<DeviceType>();
            }
-            nsend = total_send.h_view(0);
+            nsend = k_total_send.h_view();
          } else {
            error->all(FLERR,"Required border comm not yet "
                       "implemented with Kokkos");
@ -916,10 +989,11 @@ void CommKokkos::borders_device() {

  // reset global->local map

-  if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
  atomKK->modified(exec_space,ALL_MASK);
-  atomKK->sync(Host,TAG_MASK);
-  if (map_style) atom->map_set();
+  if (map_style) {
+    atomKK->sync(Host,TAG_MASK);
+    atom->map_set();
+  }
 }
 /* ----------------------------------------------------------------------
   realloc the size of the send buffer as needed with BUFFACTOR and bufextra
@ -961,7 +1035,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
    buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
  }
  else {
-    k_buf_send = ArrayTypes<LMPDeviceType>::
+    k_buf_send = DAT::
      tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border);
    buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
  }
@ -975,7 +1049,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space)
 {
  maxrecv = static_cast<int> (BUFFACTOR * n);
  int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2;
-  k_buf_recv = ArrayTypes<LMPDeviceType>::
+  k_buf_recv = DAT::
    tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border);
  buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device();
 }
@ -988,6 +1062,11 @@ void CommKokkos::grow_list(int iswap, int n)
 {
  int size = static_cast<int> (BUFFACTOR * n);

+  if (exchange_comm_classic) { // force realloc on Host
+    k_sendlist.sync<LMPHostType>();
+    k_sendlist.modify<LMPHostType>();
+  }
+
  memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");

  for(int i=0;i<maxswap;i++) {
@ -1011,6 +1090,11 @@ void CommKokkos::grow_swap(int n)
  maxswap = n;
  int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN);

+  if (exchange_comm_classic) { // force realloc on Host
+    k_sendlist.sync<LMPHostType>();
+    k_sendlist.modify<LMPHostType>();
+  }
+
  memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");

  memory->grow(maxsendlist,n,"comm:maxsendlist");
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@ -25,15 +25,17 @@ class CommKokkos : public CommBrick {

  bool exchange_comm_classic;
  bool forward_comm_classic;
+  bool reverse_comm_classic;
  bool exchange_comm_on_host;
  bool forward_comm_on_host;
+  bool reverse_comm_on_host;

  CommKokkos(class LAMMPS *);
  ~CommKokkos();
  void init();

  void forward_comm(int dummy = 0);    // forward comm of atom coords
-  void reverse_comm();              // reverse comm of atom coords
+  void reverse_comm();                 // reverse comm of atom coords
  void exchange();                     // move atoms to new procs
  void borders();                      // setup list of atoms to comm

@ -47,15 +49,17 @@ class CommKokkos : public CommBrick {
  void reverse_comm_dump(class Dump *);    // reverse comm from a Dump

  template<class DeviceType> void forward_comm_device(int dummy);
+  template<class DeviceType> void reverse_comm_device();
  template<class DeviceType> void forward_comm_pair_device(Pair *pair);
  template<class DeviceType> void exchange_device();
  template<class DeviceType> void borders_device();

 protected:
  DAT::tdual_int_2d k_sendlist;
+  DAT::tdual_int_scalar k_total_send;
  DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
  DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
-  DAT::tdual_int_1d k_count;
+  DAT::tdual_int_scalar k_count;
  //double *buf_send;                 // send buffer for all comm
  //double *buf_recv;                 // recv buffer for all comm

--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :

  nmax = nmax = m_cap = 0;
  allocated_flag = 0;
+  nprev = 4;
 }

 /* ---------------------------------------------------------------------- */
@ -158,15 +159,15 @@ void FixQEqReaxKokkos<DeviceType>::init_hist()
 {
  int i,j;

-  k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5);
+  k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev);
  d_s_hist = k_s_hist.template view<DeviceType>();
  h_s_hist = k_s_hist.h_view;
-  k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5);
+  k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev);
  d_t_hist = k_t_hist.template view<DeviceType>();
  h_t_hist = k_t_hist.h_view;

  for( i = 0; i < atom->nmax; i++ )
-    for( j = 0; j < 5; j++ )
+    for( j = 0; j < nprev; j++ )
      k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0;

  k_s_hist.template modify<LMPHostType>();
@ -334,11 +335,11 @@ void FixQEqReaxKokkos<DeviceType>::allocate_array()
    d_d = k_d.template view<DeviceType>();
    h_d = k_d.h_view;

-    k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5);
+    k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev);
    d_s_hist = k_s_hist.template view<DeviceType>();
    h_s_hist = k_s_hist.h_view;

-    k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5);
+    k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev);
    d_t_hist = k_t_hist.template view<DeviceType>();
    h_t_hist = k_t_hist.h_view;
  }
@ -368,7 +369,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
    d_o[i] = 0.0;
    d_r[i] = 0.0;
    d_d[i] = 0.0;
-    //for( int j = 0; j < 5; j++ )
+    //for( int j = 0; j < nprev; j++ )
      //d_s_hist(i,j) = d_t_hist(i,j) = 0.0;
  }

@ -1087,7 +1088,7 @@ void FixQEqReaxKokkos<DeviceType>::calculate_q_item(int ii) const
  if (mask[i] & groupbit) {
    q(i) = d_s[i] - delta * d_t[i];

-    for (int k = 4; k > 0; --k) {
+    for (int k = nprev-1; k > 0; --k) {
      d_s_hist(i,k) = d_s_hist(i,k-1);
      d_t_hist(i,k) = d_t_hist(i,k-1);
    }
@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos<DeviceType>::memory_usage()
 {
  double bytes;

-  bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist
+  bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist
  bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage
  bytes += n_cap*2 * sizeof(int); // matrix...
  bytes += m_cap * sizeof(int);
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
  neighflag_qeq_set = 0;
  exchange_comm_classic = 0;
  forward_comm_classic = 0;
+  reverse_comm_classic = 0;
  exchange_comm_on_host = 0;
  forward_comm_on_host = 0;
+  reverse_comm_on_host = 0;

 #ifdef KILL_KOKKOS_ON_SIGSEGV
  signal(SIGSEGV, my_signal_handler);
@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
  neighflag_qeq_set = 0;
  int newtonflag = 0;
  double binsize = 0.0;
-  exchange_comm_classic = forward_comm_classic = 0;
-  exchange_comm_on_host = forward_comm_on_host = 0;
+  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;

  int iarg = 0;
  while (iarg < narg) {
@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg)
    } else if (strcmp(arg[iarg],"comm") == 0) {
      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
      if (strcmp(arg[iarg+1],"no") == 0) {
-        exchange_comm_classic = forward_comm_classic = 1;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
      } else if (strcmp(arg[iarg+1],"host") == 0) {
-        exchange_comm_classic = forward_comm_classic = 0;
-        exchange_comm_on_host = forward_comm_on_host = 1;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
      } else if (strcmp(arg[iarg+1],"device") == 0) {
-        exchange_comm_classic = forward_comm_classic = 0;
-        exchange_comm_on_host = forward_comm_on_host = 0;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
      } else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
    } else if (strcmp(arg[iarg],"comm/exchange") == 0) {
@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
        forward_comm_on_host = 0;
      } else error->all(FLERR,"Illegal package kokkos command");
      iarg += 2;
+    } else if (strcmp(arg[iarg],"comm/reverse") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"host") == 0) {
+        reverse_comm_classic = 0;
+        reverse_comm_on_host = 1;
+      } else if (strcmp(arg[iarg+1],"device") == 0) {
+        reverse_comm_classic = 0;
+        reverse_comm_on_host = 0;
+      } else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
    } else error->all(FLERR,"Illegal package kokkos command");
  }

--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers {
  int neighflag_qeq_set;
  int exchange_comm_classic;
  int forward_comm_classic;
+  int reverse_comm_classic;
  int exchange_comm_on_host;
  int forward_comm_on_host;
+  int reverse_comm_on_host;
  int num_threads,ngpu;
  int numa;
  int auto_sync;
--- a/src/KOKKOS/nbin_kokkos.cpp
+++ b/src/KOKKOS/nbin_kokkos.cpp
@ -75,6 +75,10 @@ void NBinKokkos<DeviceType>::bin_atoms_setup(int nall)
    k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins);
    bincount = k_bincount.view<DeviceType>();
  }
+  if (nall > k_atom2bin.d_view.dimension_0()) {
+    k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall);
+    atom2bin = k_atom2bin.view<DeviceType>();
+  }
 }

 /* ----------------------------------------------------------------------
@ -86,6 +90,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
 {
  last_bin = update->ntimestep;

+  k_bins.template sync<DeviceType>();
+  k_bincount.template sync<DeviceType>();
+  k_atom2bin.template sync<DeviceType>();
+
  h_resize() = 1;

  while(h_resize() > 0) {
@ -115,6 +123,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
      c_bins = bins;
    }
  }
+
+  k_bins.template modify<DeviceType>();
+  k_bincount.template modify<DeviceType>();
+  k_atom2bin.template modify<DeviceType>();
 }

 /* ---------------------------------------------------------------------- */
@ -125,6 +137,7 @@ void NBinKokkos<DeviceType>::binatomsItem(const int &i) const
 {
  const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2));

+  atom2bin(i) = ibin;
  const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1);
  if(ac < bins.dimension_1()) {
    bins(ibin, ac) = i;
--- a/src/KOKKOS/nbin_kokkos.h
+++ b/src/KOKKOS/nbin_kokkos.h
@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard {
  int atoms_per_bin;
  DAT::tdual_int_1d k_bincount;
  DAT::tdual_int_2d k_bins;
+  DAT::tdual_int_1d k_atom2bin;

  typename AT::t_int_1d bincount;
  const typename AT::t_int_1d_const c_bincount;
  typename AT::t_int_2d bins;
  typename AT::t_int_2d_const c_bins;
+  typename AT::t_int_1d atom2bin;
  typename AT::t_int_scalar d_resize;
  typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
  typename AT::t_x_array_randomread x;
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag)
  // build pairwise lists for all perpetual NPair/NeighList
  // grow() with nlocal/nall args so that only realloc if have to

-  atomKK->sync(Host,ALL_MASK);
  for (i = 0; i < npair_perpetual; i++) {
    m = plist[i];
+    if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK);
    if (!lists[m]->copy) lists[m]->grow(nlocal,nall);
    neigh_pair[m]->build_setup();
    neigh_pair[m]->build(lists[m]);
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@ -73,6 +73,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
  atoms_per_bin = nbKK->atoms_per_bin;
  k_bincount = nbKK->k_bincount;
  k_bins = nbKK->k_bins;
+  k_atom2bin = nbKK->k_atom2bin;
 }

 /* ----------------------------------------------------------------------
@ -88,13 +89,15 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()

  int maxstencil = ns->get_maxstencil();

-  k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
+  if (maxstencil > k_stencil.dimension_0())
+    k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
  for (int k = 0; k < maxstencil; k++)
    k_stencil.h_view(k) = ns->stencil[k];
    k_stencil.modify<LMPHostType>();
    k_stencil.sync<DeviceType>();
  if (GHOST) {
-    k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
+    if (maxstencil > k_stencilxyz.dimension_0())
+      k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
    for (int k = 0; k < maxstencil; k++) {
      k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
      k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];
@ -122,6 +125,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
         k_cutneighsq.view<DeviceType>(),
         k_bincount.view<DeviceType>(),
         k_bins.view<DeviceType>(),
+         k_atom2bin.view<DeviceType>(),
         nstencil,
         k_stencil.view<DeviceType>(),
         k_stencilxyz.view<DeviceType>(),
@ -164,8 +168,9 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
  k_ex_mol_group.sync<DeviceType>();
  k_ex_mol_bit.sync<DeviceType>();
  k_ex_mol_intra.sync<DeviceType>();
-  k_bincount.sync<DeviceType>(),
-  k_bins.sync<DeviceType>(),
+  k_bincount.sync<DeviceType>();
+  k_bins.sync<DeviceType>();
+  k_atom2bin.sync<DeviceType>();
  atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);

  data.special_flag[0] = special_flag[0];
@ -317,7 +322,7 @@ void NeighborKokkosExecute<DeviceType>::
  const X_FLOAT ztmp = x(i, 2);
  const int itype = type(i);

-  const int ibin = coord2bin(xtmp, ytmp, ztmp);
+  const int ibin = c_atom2bin(i);

  const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
    = d_stencil;
@ -431,7 +436,7 @@ void NeighborKokkosExecute<DeviceType>::
  if(n > neigh_list.maxneighs) {
    resize() = 1;

-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
  }

  neigh_list.d_ilist(i) = i;
@ -641,7 +646,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
  if(n > neigh_list.maxneighs) {
    resize() = 1;

-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
  }
  }
 }
@ -678,7 +683,7 @@ void NeighborKokkosExecute<DeviceType>::
  // no molecular test when i = ghost atom

  if (i < nlocal) {
-    const int ibin = coord2bin(xtmp, ytmp, ztmp);
+    const int ibin = c_atom2bin(i);
    for (int k = 0; k < nstencil; k++) {
      const int jbin = ibin + stencil[k];
      for(int m = 0; m < c_bincount(jbin); m++) {
@ -764,7 +769,7 @@ void NeighborKokkosExecute<DeviceType>::
  if(n > neigh_list.maxneighs) {
    resize() = 1;

-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
  }
  neigh_list.d_ilist(i) = i;
 }
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@ -105,6 +105,7 @@ class NPairKokkos : public NPair {
  int atoms_per_bin;
  DAT::tdual_int_1d k_bincount;
  DAT::tdual_int_2d k_bins;
+  DAT::tdual_int_1d k_atom2bin;

  // data from NStencil class

@ -148,6 +149,8 @@ class NeighborKokkosExecute
  const typename AT::t_int_1d_const c_bincount;
  typename AT::t_int_2d bins;
  typename AT::t_int_2d_const c_bins;
+  const typename AT::t_int_1d atom2bin;
+  const typename AT::t_int_1d_const c_atom2bin;


  // data from NStencil class
@ -190,6 +193,7 @@ class NeighborKokkosExecute
                        const typename AT::t_xfloat_2d_randomread &_cutneighsq,
                        const typename AT::t_int_1d &_bincount,
                        const typename AT::t_int_2d &_bins,
+                        const typename AT::t_int_1d &_atom2bin,
                        const int _nstencil,
                        const typename AT::t_int_1d &_d_stencil,
                        const typename AT::t_int_1d_3 &_d_stencilxyz,
@ -224,6 +228,7 @@ class NeighborKokkosExecute
                        const int & _xprd_half, const int & _yprd_half, const int & _zprd_half):
    neigh_list(_neigh_list), cutneighsq(_cutneighsq),
    bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
+    atom2bin(_atom2bin),c_atom2bin(_atom2bin),
    nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
    nlocal(_nlocal),
    x(_x),type(_type),mask(_mask),molecule(_molecule),
@ -281,38 +286,6 @@ class NeighborKokkosExecute
  void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
 #endif

-  KOKKOS_INLINE_FUNCTION
-  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
-  {
-    int ix,iy,iz;
-
-    if (x >= bboxhi[0])
-      ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
-    else if (x >= bboxlo[0]) {
-      ix = static_cast<int> ((x-bboxlo[0])*bininvx);
-      ix = MIN(ix,nbinx-1);
-    } else
-      ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
-
-    if (y >= bboxhi[1])
-      iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
-    else if (y >= bboxlo[1]) {
-      iy = static_cast<int> ((y-bboxlo[1])*bininvy);
-      iy = MIN(iy,nbiny-1);
-    } else
-      iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
-
-    if (z >= bboxhi[2])
-      iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
-    else if (z >= bboxlo[2]) {
-      iz = static_cast<int> ((z-bboxlo[2])*bininvz);
-      iz = MIN(iz,nbinz-1);
-    } else
-      iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
-
-    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
-  }
-
  KOKKOS_INLINE_FUNCTION
  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
  {
--- a/src/KOKKOS/pair_reaxc_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxc_kokkos.cpp
@ -131,6 +131,8 @@ template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::init_style()
 {
  PairReaxC::init_style();
+  if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version
+  fix_reax = NULL;

  // irequest = neigh request made by parent class

@ -555,8 +557,8 @@ void PairReaxCKokkos<DeviceType>::Deallocate_Lookup_Tables()

  ntypes = atom->ntypes;

-  for( i = 0; i < ntypes; ++i ) {
-    for( j = i; j < ntypes; ++j )
+  for( i = 0; i <= ntypes; ++i ) {
+    for( j = i; j <= ntypes; ++j )
      if( LR[i][j].n ) {
        sfree( LR[i][j].y, "LR[i,j].y" );
        sfree( LR[i][j].H, "LR[i,j].H" );
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@ -294,6 +294,7 @@ void VerletKokkos::run(int n)
  int n_pre_exchange = modify->n_pre_exchange;
  int n_pre_neighbor = modify->n_pre_neighbor;
  int n_pre_force = modify->n_pre_force;
+  int n_pre_reverse = modify->n_pre_reverse;
  int n_post_force = modify->n_post_force;
  int n_end_of_step = modify->n_end_of_step;

@ -304,9 +305,9 @@ void VerletKokkos::run(int n)

  f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0());

-  static double time = 0.0;
  atomKK->sync(Device,ALL_MASK);
-  Kokkos::Impl::Timer ktimer;
+  //static double time = 0.0;
+  //Kokkos::Impl::Timer ktimer;

  timer->init_timeout();
  for (int i = 0; i < n; i++) {
@ -320,10 +321,10 @@ void VerletKokkos::run(int n)

    // initial time integration

-    ktimer.reset();
+    //ktimer.reset();
    timer->stamp();
    modify->initial_integrate(vflag);
-    time += ktimer.seconds();
+    //time += ktimer.seconds();
    if (n_post_integrate) modify->post_integrate();
    timer->stamp(Timer::MODIFY);

@ -523,11 +524,18 @@ void VerletKokkos::run(int n)
      atomKK->k_f.modify<LMPDeviceType>();
    }

+    if (n_pre_reverse) {
+      modify->pre_reverse(eflag,vflag);
+      timer->stamp(Timer::MODIFY);
+    }

    // reverse communication of forces

-    if (force->newton) comm->reverse_comm();
-    timer->stamp(Timer::COMM);
+    if (force->newton) {
+      Kokkos::fence();
+      comm->reverse_comm();
+      timer->stamp(Timer::COMM);
+    }

    // force modifications, final time integration, diagnostics

--- a/src/MAKE/MACHINES/Makefile.cori2
+++ b/src/MAKE/MACHINES/Makefile.cori2
@ -15,13 +15,14 @@ SHELL = /bin/sh

 CC =		CC
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \
+                $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

 LINK =		CC
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           
 SIZE =		size

--- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor
+++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor
@ -10,7 +10,7 @@ CC =		mpiicpc
 MIC_OPT =       -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
 CCFLAGS =	-g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
                -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \
-                -qoverride-limits $(MIC_OPT)
+                -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

--- a/src/MAKE/OPTIONS/Makefile.intel_cpu
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu
@ -8,14 +8,14 @@ SHELL = /bin/sh

 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           -ltbbmalloc -ltbbmalloc_proxy
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size

 ARCHIVE =	ar
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@ -8,8 +8,8 @@ SHELL = /bin/sh

 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@ -8,14 +8,14 @@ SHELL = /bin/sh

 CC =		mpicxx -cxx=icc
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

 LINK =		mpicxx -cxx=icc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size

 ARCHIVE =	ar
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@ -9,14 +9,14 @@ SHELL = /bin/sh
 export OMPI_CXX = icc
 CC =		mpicxx
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

 LINK =		mpicxx
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           -ltbbmalloc -ltbbmalloc_proxy
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size

 ARCHIVE =	ar
--- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
+++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
@ -1,123 +0,0 @@
-# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT
-
-SHELL = /bin/sh
-
-# ---------------------------------------------------------------------
-# compiler/linker settings
-# specify flags and libraries needed for your compiler
-
-CC =		mpiicpc 
-MIC_OPT =       -qoffload-arch=mic-avx512 -fp-model fast=2
-CCFLAGS =	-O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-                -xHost -fno-alias -ansi-alias -restrict \
-                -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT
-SHFLAGS =	-fPIC
-DEPFLAGS =	-M
-
-LINK =		mpiicpc
-LINKFLAGS =	-g -O3 -xHost -qopenmp -qoffload $(MIC_OPT)
-LIB =           -ltbbmalloc
-SIZE =		size
-
-ARCHIVE =	ar
-ARFLAGS =	-rc
-SHLIBFLAGS =	-shared
-
-# ---------------------------------------------------------------------
-# LAMMPS-specific settings, all OPTIONAL
-# specify settings for LAMMPS features you will use
-# if you change any -D setting, do full re-compile after "make clean"
-
-# LAMMPS ifdef settings
-# see possible settings in Section 2.2 (step 4) of manual
-
-LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
-
-# MPI library
-# see discussion in Section 2.2 (step 5) of manual
-# MPI wrapper compiler/linker can provide this info
-# can point to dummy MPI library in src/STUBS as in Makefile.serial
-# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
-# INC = path for mpi.h, MPI compiler settings
-# PATH = path for MPI library
-# LIB = name of MPI library
-
-MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
-MPI_PATH = 
-MPI_LIB =
-
-# FFT library
-# see discussion in Section 2.2 (step 6) of manaul
-# can be left blank to use provided KISS FFT library
-# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
-# PATH = path for FFT library
-# LIB = name of FFT library
-
-FFT_INC =      -DFFT_MKL -DFFT_SINGLE
-FFT_PATH = 
-FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
-
-# JPEG and/or PNG library
-# see discussion in Section 2.2 (step 7) of manual
-# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
-# INC = path(s) for jpeglib.h and/or png.h
-# PATH = path(s) for JPEG library and/or PNG library
-# LIB = name(s) of JPEG library and/or PNG library
-
-JPG_INC =       
-JPG_PATH = 	
-JPG_LIB =	-ljpeg
-
-# ---------------------------------------------------------------------
-# build rules and dependencies
-# do not edit this section
-
-include	Makefile.package.settings
-include	Makefile.package
-
-EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
-EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
-EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
-EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
-EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
-
-# Path to src files
-
-vpath %.cpp ..
-vpath %.h ..
-
-# Link target
-
-$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
-	$(SIZE) $(EXE)
-
-# Library targets
-
-lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
-
-shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
-        $(OBJ) $(EXTRA_LIB) $(LIB)
-
-# Compilation rules
-
-%.o:%.cpp $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-%.d:%.cpp $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
-
-%.o:%.cu $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-# Individual dependencies
-
-depend : fastdep.exe $(SRC)
-	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
-
-fastdep.exe: ../DEPEND/fastdep.c
-	cc -O -o $@ $<
-
-sinclude .depend
--- a/src/MAKE/OPTIONS/Makefile.knl
+++ b/src/MAKE/OPTIONS/Makefile.knl
@ -8,13 +8,13 @@ SHELL = /bin/sh

 CC =		mpiicpc
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M

 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size

--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg)
  time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0;
  bigint clock = 0;

+  timer->init();
  timer->barrier_start();
  time_start = timer->get_wall(Timer::TOTAL);

--- a/src/REPLICA/tad.cpp
+++ b/src/REPLICA/tad.cpp
@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg)
  nbuild = ndanger = 0;
  time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0;

+  timer->init();
  timer->barrier_start();
  time_start = timer->get_wall(Timer::TOTAL);

--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@ -46,7 +46,7 @@ action nbin_intel.h
 action nbin_intel.cpp
 action npair_intel.h
 action npair_intel.cpp
-action intel_simd.h pair_sw_intel.cpp
+action intel_simd.h
 action intel_intrinsics.h pair_tersoff_intel.cpp
 action intel_intrinsics_airebo.h pair_airebo_intel.cpp

--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version:

 2017 update 2         - No changes needed
 2017 updates 3 or 4   - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
-2018 or newer         - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
+2018 inital release   - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
+2018u1 or newer       - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 

 -----------------------------------------------------------------------------

 When using the suffix command with "intel", intel styles will be used if they
 exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP 
-USER-OMP styles will be used whenever USER-INTEL styles are not available. This
-allow for running most styles in LAMMPS with threading.
+is installed, USER-OMP styles will be used whenever USER-INTEL styles are not
+available. This allow for running most styles in LAMMPS with threading.

 -----------------------------------------------------------------------------

-The Long-Range Thread mode (LRT) in the Intel package currently uses
-pthreads by default. If pthreads are not supported in the build environment,
-the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for 
-builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
-build with compilers that support threads using the C++11 standard. When using
+The Long-Range Thread mode (LRT) in the Intel package is enabled through the
+-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles
+include this define. This feature will use pthreads by default.
+Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that
+support threads intrinsically using the C++11 standard. When using
 LRT mode, you might need to disable OpenMP affinity settings (e.g.
 export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
 need to be changed.

 -----------------------------------------------------------------------------

+Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG
+should be added to the compile flags. This will enable using the MKL Mersenne
+Twister random number generator (RNG) for Dissipative Particle Dynamics 
+(DPD). This RNG can allow significantly faster performance and it also has a 
+significantly longer period than the standard RNG for DPD.
+
+-----------------------------------------------------------------------------
+
 In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag 
 -DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of 
 Intel compilers.
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@ -9,6 +9,7 @@
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
 # in.intel.airebo -     Polyethelene benchmark with AIREBO
+# in.intel.dpd -        Dissipative Particle Dynamics
 #
 #############################################################################

@ -16,16 +17,17 @@
 # Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
 #  - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
 #
-#                     Xeon E5-2697v4     Xeon Phi 7250
+#                     Xeon E5-2697v4     Xeon Phi 7250    Xeon Gold 6148
 #                    
-# in.intel.lj -            199.5               282.3
-# in.intel.rhodo -          12.4                17.5
-# in.intel.lc -	            19.0                25.7
-# in.intel.eam -            59.4                92.8
-# in.intel.sw -	           132.4               161.9
-# in.intel.tersoff -        83.3               101.1
-# in.intel.water -          53.4                90.3
-# in.intel.airebo -          7.3                11.8
+# in.intel.lj -            199.5               282.3            317.3
+# in.intel.rhodo -          12.4                17.5             24.4
+# in.intel.lc -	            19.0                25.7             26.8
+# in.intel.eam -            59.4                92.8            105.6 
+# in.intel.sw -	           132.4               161.9            213.8
+# in.intel.tersoff -        83.3               101.1            109.6
+# in.intel.water -          53.4                90.3            105.5
+# in.intel.airebo -          7.3                11.8             17.6
+# in.intel.dpd -            74.5               100.4            148.1
 #
 #############################################################################

--- a/src/USER-INTEL/TEST/in.intel.dpd
+++ b/src/USER-INTEL/TEST/in.intel.dpd
@ -0,0 +1,48 @@
+# DPD benchmark
+
+variable        N index on      # Newton Setting
+variable	w index 10	# Warmup Timesteps
+variable	t index 4000	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+variable	p index 0	# Use Power Measurement
+
+variable	x index 4
+variable	y index 2
+variable	z index 2
+
+variable	xx equal 20*$x
+variable	yy equal 20*$y
+variable	zz equal 20*$z
+variable	rr equal floor($t*$m)
+
+newton          $N
+if "$n > 0"	then "processors * * * grid numa"
+
+units		lj
+atom_style	atomic
+comm_modify     mode single vel yes
+
+lattice		fcc 3.0
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+create_box	1 box
+create_atoms	1 box
+mass		1 1.0
+
+velocity	all create 1.0 87287 loop geom
+
+pair_style	dpd 1.0 1.0 928948
+pair_coeff	1 1 25.0 4.5
+
+neighbor	0.5 bin
+neigh_modify    delay 0 every 1
+
+fix		1 all nve
+timestep	0.04
+
+thermo			1000
+
+if "$p > 0"		then "run_style verlet/power"
+
+if "$w > 0"		then "run $w"
+run    	 ${rr}
--- a/src/USER-INTEL/dihedral_fourier_intel.cpp
+++ b/src/USER-INTEL/dihedral_fourier_intel.cpp
@ -0,0 +1,441 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include "dihedral_fourier_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define PTOLERANCE (flt_t)1.05
+#define MTOLERANCE (flt_t)-1.05
+typedef struct { int a,b,c,d,t;  } int5_t;
+
+/* ---------------------------------------------------------------------- */
+
+DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
+  : DihedralFourier(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralFourierIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    DihedralFourier::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void DihedralFourierIntel::compute(int eflag, int vflag,
+				   IntelBuffers<flt_t,acc_t> *buffers,
+				   const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  if (evflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+        eval<1,1,1>(vflag, buffers, fc);
+      else
+        eval<1,1,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void DihedralFourierIntel::eval(const int vflag,
+				IntelBuffers<flt_t,acc_t> *buffers,
+				const ForceConst<flt_t> &fc)
+
+{
+  const int inum = neighbor->ndihedrallist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)           \
+    reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int5_t * _noalias const dihedrallist =
+      (int5_t *) neighbor->dihedrallist[0];
+
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
+      const int i1 = dihedrallist[n].a;
+      const int i2 = dihedrallist[n].b;
+      const int i3 = dihedrallist[n].c;
+      const int i4 = dihedrallist[n].d;
+      const int type = dihedrallist[n].t;
+
+      // 1st bond
+
+      const flt_t vb1x = x[i1].x - x[i2].x;
+      const flt_t vb1y = x[i1].y - x[i2].y;
+      const flt_t vb1z = x[i1].z - x[i2].z;
+
+      // 2nd bond
+
+      const flt_t vb2xm = x[i2].x - x[i3].x;
+      const flt_t vb2ym = x[i2].y - x[i3].y;
+      const flt_t vb2zm = x[i2].z - x[i3].z;
+
+      // 3rd bond
+
+      const flt_t vb3x = x[i4].x - x[i3].x;
+      const flt_t vb3y = x[i4].y - x[i3].y;
+      const flt_t vb3z = x[i4].z - x[i3].z;
+
+      // c,s calculation
+
+      const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
+      const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
+      const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
+      const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
+      const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
+      const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
+
+      const flt_t rasq = ax*ax + ay*ay + az*az;
+      const flt_t rbsq = bx*bx + by*by + bz*bz;
+      const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+      const flt_t rg = sqrt(rgsq);
+
+      flt_t rginv, ra2inv, rb2inv;
+      rginv = ra2inv = rb2inv = (flt_t)0.0;
+      if (rg > 0) rginv = (flt_t)1.0/rg;
+      if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
+      if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
+      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+
+      flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
+      const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+      // error check
+      #ifndef LMP_INTEL_USE_SIMDOFF
+      if (c > PTOLERANCE || c < MTOLERANCE) {
+        int me = comm->me;
+
+        if (screen) {
+          char str[128];
+          sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,tid,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
+      }
+      #endif
+
+      if (c > (flt_t)1.0) c = (flt_t)1.0;
+      if (c < (flt_t)-1.0) c = (flt_t)-1.0;
+
+      flt_t deng;
+      flt_t df = (flt_t)0.0;
+      if (EFLAG) deng = (flt_t)0.0;
+      
+      for (int j = 0; j < nterms[type]; j++) {
+	const flt_t tcos_shift = fc.bp[j][type].cos_shift;
+	const flt_t tsin_shift = fc.bp[j][type].sin_shift;
+	const flt_t tk = fc.bp[j][type].k;
+	const int m = fc.bp[j][type].multiplicity;
+
+	flt_t p = (flt_t)1.0;
+	flt_t ddf1, df1;
+	ddf1 = df1 = (flt_t)0.0;
+
+	for (int i = 0; i < m; i++) {
+	  ddf1 = p*c - df1*s;
+	  df1 = p*s + df1*c;
+	  p = ddf1;
+	}
+
+	p = p*tcos_shift + df1*tsin_shift;
+	df1 = df1*tcos_shift - ddf1*tsin_shift;
+	df1 *= -m;
+	p += (flt_t)1.0;
+	
+	if (m == 0) {
+	  p = (flt_t)1.0 + tcos_shift;
+	  df1 = (flt_t)0.0;
+	}
+
+        if (EFLAG) deng += tk * p;
+        df -= tk * df1;
+      }
+
+      const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+      const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+      const flt_t fga = fg*ra2inv*rginv;
+      const flt_t hgb = hg*rb2inv*rginv;
+      const flt_t gaa = -ra2inv*rg;
+      const flt_t gbb = rb2inv*rg;
+
+      const flt_t dtfx = gaa*ax;
+      const flt_t dtfy = gaa*ay;
+      const flt_t dtfz = gaa*az;
+      const flt_t dtgx = fga*ax - hgb*bx;
+      const flt_t dtgy = fga*ay - hgb*by;
+      const flt_t dtgz = fga*az - hgb*bz;
+      const flt_t dthx = gbb*bx;
+      const flt_t dthy = gbb*by;
+      const flt_t dthz = gbb*bz;
+
+      const flt_t sx2 = df*dtgx;
+      const flt_t sy2 = df*dtgy;
+      const flt_t sz2 = df*dtgz;
+
+      flt_t f1x = df*dtfx;
+      flt_t f1y = df*dtfy;
+      flt_t f1z = df*dtfz;
+
+      const flt_t f2x = sx2 - f1x;
+      const flt_t f2y = sy2 - f1y;
+      const flt_t f2z = sz2 - f1z;
+
+      flt_t f4x = df*dthx;
+      flt_t f4y = df*dthy;
+      flt_t f4z = df*dthz;
+
+      const flt_t f3x = -sx2 - f4x;
+      const flt_t f3y = -sy2 - f4y;
+      const flt_t f3z = -sz2 - f4z;
+
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
+        #else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
+        #endif
+      }
+
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
+        }
+
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
+        }
+      }
+    } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oedihedral += sedihedral;
+    if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
+  } // omp parallel
+
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralFourierIntel::init_style()
+{
+  DihedralFourier::init_style();
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
+					    IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->ndihedraltypes + 1;
+  fc.set_ntypes(bp1, setflag, nterms, memory);
+
+  for (int i = 1; i < bp1; i++) {
+    if (setflag[i]) {
+      for (int j = 0; j < nterms[i]; j++) {
+        fc.bp[j][i].cos_shift = cos_shift[i][j];
+	fc.bp[j][i].sin_shift = sin_shift[i][j];
+	fc.bp[j][i].k = k[i][j];
+	fc.bp[j][i].multiplicity = multiplicity[i][j];
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
+                                                         int *setflag,
+							 int *nterms,
+							 Memory *memory) {
+  if (nbondtypes != _nbondtypes) {
+    if (_nbondtypes > 0)
+      _memory->destroy(bp);
+
+    if (nbondtypes > 0) {
+      _maxnterms = 1;
+      for (int i = 1; i <= nbondtypes; i++)
+        if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
+
+      _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
+    }
+  }
+  _nbondtypes = nbondtypes;
+  _memory = memory;
+}
--- a/src/USER-INTEL/dihedral_fourier_intel.h
+++ b/src/USER-INTEL/dihedral_fourier_intel.h
@ -0,0 +1,82 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(fourier/intel,DihedralFourierIntel)
+
+#else
+
+#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
+#define LMP_DIHEDRAL_FOURIER_INTEL_H
+
+#include "dihedral_fourier.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class DihedralFourierIntel : public DihedralFourier {
+
+ public:
+  DihedralFourierIntel(class LAMMPS *lmp);
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  FixIntel *fix;
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EVFLAG, int EFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t cos_shift, sin_shift, k; 
+      int multiplicity; } fc_packed1;
+
+    fc_packed1 **bp;
+
+    ForceConst() : _nbondtypes(0)  {}
+    ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
+
+    void set_ntypes(const int nbondtypes, int *setflag, int *nterms, 
+		    Memory *memory);
+
+   private:
+    int _nbondtypes, _maxnterms;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@ -285,6 +285,7 @@ int FixIntel::setmask()
 {
  int mask = 0;
  mask |= PRE_REVERSE;
+  mask |= MIN_PRE_REVERSE;
  #ifdef _LMP_INTEL_OFFLOAD
  mask |= POST_FORCE;
  mask |= MIN_POST_FORCE;
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@ -43,6 +43,7 @@ class FixIntel : public Fix {
  virtual int setmask();
  virtual void init();
  virtual void setup(int);
+  inline void min_setup(int in) { setup(in); }
  void setup_pre_reverse(int eflag = 0, int vflag = 0);

  void pair_init_check(const bool cdmessage=false);
@ -50,6 +51,8 @@ class FixIntel : public Fix {
  void kspace_init_check();

  void pre_reverse(int eflag = 0, int vflag = 0);
+  inline void min_pre_reverse(int eflag = 0, int vflag = 0)
+    { pre_reverse(eflag, vflag); }

  // Get all forces, calculation results from coprocesser
  void sync_coprocessor();
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
  IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
  lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
  #endif
+  memset(_ccachei, 0, vsize * sizeof(int));
  memset(_ccachej, 0, vsize * sizeof(int));

  #ifdef _LMP_INTEL_OFFLOAD
@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
      #pragma offload_transfer target(mic:_cop) \
        nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
        nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
-        nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
+        in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
        in(ccachej:length(vsize) alloc_if(1) free_if(0))
    }
    #ifdef LMP_USE_AVXCD
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
    ito = inum;                                                 \
  }

+#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
+                                 nthr, vecsize)                 \
+  {                                                             \
+    tid = 0;							\
+    ifrom = 0;							\
+    ip = 1;							\
+    ito = inum;							\
+  }
+
 #endif

 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
 	      const int bstart = binhead[ibin + binstart[k]];
 	      const int bend = binhead[ibin + binend[k]];
              #if defined(LMP_SIMD_COMPILER)
-              #pragma vector aligned
              #pragma simd
              #endif
              for (int jj = bstart; jj < bend; jj++)
@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
 	      const int bstart = binhead[ibin + stencil[k]];
 	      const int bend = binhead[ibin + stencil[k] + 1];
              #if defined(LMP_SIMD_COMPILER)
-              #pragma vector aligned
              #pragma simd
              #endif
              for (int jj = bstart; jj < bend; jj++)
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
            const int bstart = binhead[ibin + binstart[k]];
            const int bend = binhead[ibin + binend[k]];
            #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
            #pragma simd
            #endif
            for (int jj = bstart; jj < bend; jj++)
@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
            const int bstart = binhead[ibin];
            const int bend = binhead[ibin + 1];
            #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
            #pragma simd
            #endif
            for (int jj = bstart; jj < bend; jj++) {
--- a/src/USER-INTEL/pair_dpd_intel.cpp
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@ -0,0 +1,617 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+                        Shun Xu (Computer Network Information Center, CAS)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_dpd_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LMP_MKL_RNG VSL_BRNG_MT19937
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
+#define IEPSILON 1.0e10
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
+  PairDPD(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  random_thread = NULL;
+  _nrandom_thread = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDIntel::~PairDPDIntel()
+{
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_USE_MKL_RNG
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #endif
+  }
+  #endif
+  delete []random_thread;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairDPDIntel::compute(int eflag, int vflag,
+                           IntelBuffers<flt_t,acc_t> *buffers,
+                           const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag, vflag);
+  } else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (_onetype) {
+    if (eflag) {
+      if (force->newton_pair) {
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  } else {
+    if (eflag) {
+      if (force->newton_pair) {
+        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  }
+}
+
+template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairDPDIntel::eval(const int offload, const int vflag,
+                        IntelBuffers<flt_t,acc_t> *buffers,
+                        const ForceConst<flt_t> &fc,
+                        const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  typedef struct { double x, y, z; } lmp_vt;
+  lmp_vt *v = (lmp_vt *)atom->v[0];
+  const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
+
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const FC_PACKED1_T * _noalias const param = fc.param[0];
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  int * _noalias const rngi_thread = fc.rngi;
+  const int rng_size = buffers->get_max_nbors();
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+  int *overflow = fix->get_off_overflow_flag();
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      #ifdef LMP_USE_MKL_RNG
+      VSLStreamStatePtr *my_random = &(random_thread[tid]);
+      #else
+      RanMars *my_random = random_thread[tid];
+      #endif
+      flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
+      int rngi = rngi_thread[tid];
+
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      flt_t icut, a0, gamma, sigma;
+      if (ONETYPE) {
+        icut = param[3].icut;
+        a0 = param[3].a0;
+        gamma = param[3].gamma;
+        sigma = param[3].sigma;
+      }
+      for (int i = iifrom; i < iito; i += iip) {
+        int itype, ptr_off;
+        const FC_PACKED1_T * _noalias parami;
+        if (!ONETYPE) {
+          itype = x[i].w;
+          ptr_off = itype * ntypes;
+          parami = param + ptr_off;
+        }
+
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+	const flt_t vxtmp = v[i].x;
+	const flt_t vytmp = v[i].y;
+	const flt_t vztmp = v[i].z;
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+
+	if (rngi + jnum > rng_size) {
+          #ifdef LMP_USE_MKL_RNG
+	  if (sizeof(flt_t) == sizeof(float))
+	    vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
+			  (float*)my_rand_buffer, (float)0.0, (float)1.0 );
+	  else
+	    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
+	  		  (double*)my_rand_buffer, 0.0, 1.0 );
+          #else
+          for (int jj = 0; jj < rngi; jj++)
+            my_rand_buffer[jj] = my_random->gaussian();
+          #endif
+	  rngi = 0;
+	}
+
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+	                         sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcelj, evdwl;
+          forcelj = evdwl = (flt_t)0.0;
+
+          int j, jtype, sbindex;
+          if (!ONETYPE) {
+            sbindex = jlist[jj] >> SBBITS & 3;
+            j = jlist[jj] & NEIGHMASK;
+          } else
+            j = jlist[jj];
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          if (!ONETYPE) {
+            jtype = x[j].w;
+            icut = parami[jtype].icut;
+          }
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+	  const flt_t rinv = (flt_t)1.0/sqrt(rsq);
+
+          if (rinv > icut) {
+            flt_t factor_dpd;
+            if (!ONETYPE) factor_dpd = special_lj[sbindex];
+
+	    flt_t delvx = vxtmp - v[j].x;
+	    flt_t delvy = vytmp - v[j].y;
+	    flt_t delvz = vztmp - v[j].z;
+	    flt_t dot = delx*delvx + dely*delvy + delz*delvz;
+	    flt_t randnum = my_rand_buffer[jj];
+
+	    flt_t iwd = rinv - icut;
+	    if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
+
+	    if (!ONETYPE) {
+	      a0 = parami[jtype].a0;
+	      gamma = parami[jtype].gamma;
+	      sigma = parami[jtype].sigma;
+	    }
+	    flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
+	    if (!ONETYPE) fpair *= factor_dpd;
+	    fpair *= iwd;
+
+            const flt_t fpx = fpair * delx;
+            fxtmp += fpx;
+            if (NEWTON_PAIR) f[j].x -= fpx;
+            const flt_t fpy = fpair * dely;
+            fytmp += fpy;
+            if (NEWTON_PAIR) f[j].y -= fpy;
+            const flt_t fpz = fpair * delz;
+            fztmp += fpz;
+            if (NEWTON_PAIR) f[j].z -= fpz;
+
+            if (EFLAG) {
+	      flt_t cut = (flt_t)1.0/icut;
+	      flt_t r = (flt_t)1.0/rinv;
+	      evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
+	      if (!ONETYPE) evdwl *= factor_dpd;
+              sevdwl += evdwl;
+              if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl;
+                if (NEWTON_PAIR)
+                  f[j].w += (flt_t)0.5 * evdwl;
+              }
+            }
+
+            if (NEWTON_PAIR == 0)
+              IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
+          } // if rsq
+        } // for jj
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+
+        IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+	rngi += jnum;
+      } // for ii
+
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+      rngi_thread[tid] = rngi;
+    } // end omp
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+   ------------------------------------------------------------------------- */
+
+void PairDPDIntel::settings(int narg, char **arg) {
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_USE_MKL_RNG
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #endif
+  }
+  delete []random_thread;
+  #endif
+  PairDPD::settings(narg,arg);
+  _nrandom_thread = comm->nthreads;
+
+  #ifdef LMP_USE_MKL_RNG
+
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
+		 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+
+  #else
+
+  random_thread =new RanMars*[comm->nthreads];
+  random_thread[0] = random;
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    if (tid > 0)
+      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
+  }
+  #endif
+
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDIntel::init_style()
+{
+  PairDPD::init_style();
+  if (force->newton_pair == 0) {
+    neighbor->requests[neighbor->nrequest-1]->half = 0;
+    neighbor->requests[neighbor->nrequest-1]->full = 1;
+  }
+  neighbor->requests[neighbor->nrequest-1]->intel = 1;
+
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_balance() != 0.0)
+    error->all(FLERR,
+          "Offload for dpd/intel is not yet available. Set balance to 0.");
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                    IntelBuffers<flt_t,acc_t> *buffers)
+{
+  _onetype = 0;
+  if (atom->ntypes == 1 && !atom->molecular) _onetype = 1;
+
+  int tp1 = atom->ntypes + 1;
+  fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  double cut, cutneigh;
+  for (int i = 1; i <= atom->ntypes; i++) {
+    for (int j = i; j <= atom->ntypes; j++) {
+      if (setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0)) {
+        cut = init_one(i,j);
+        cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+        double icut = 1.0 / cut;
+        fc.param[i][j].icut = fc.param[j][i].icut = icut;
+      } else {
+        cut = init_one(i,j);
+        double icut = 1.0 / cut;
+        fc.param[i][j].icut = fc.param[j][i].icut = icut;
+      }
+    }
+  }
+
+  for (int i = 0; i < 4; i++) {
+    fc.special_lj[i] = force->special_lj[i];
+    fc.special_lj[0] = 1.0;
+  }
+
+  for (int i = 0; i < tp1; i++) {
+    for (int j = 0; j < tp1; j++) {
+      fc.param[i][j].a0 = a0[i][j];
+      fc.param[i][j].gamma = gamma[i][j];
+      fc.param[i][j].sigma = sigma[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                 const int nthreads,
+						 const int max_nbors,
+                                                 Memory *memory,
+                                                 const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {
+      _memory->destroy(param);
+      _memory->destroy(rand_buffer_thread);
+      _memory->destroy(rngi);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(param,ntypes,ntypes,"fc.param");
+      memory->create(rand_buffer_thread, nthreads, max_nbors, 
+		     "fc.rand_buffer_thread");
+      memory->create(rngi,nthreads,"fc.param");
+      for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
+    }
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+   ------------------------------------------------------------------------- */
+
+void PairDPDIntel::read_restart_settings(FILE *fp)
+{
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_USE_MKL_RNG
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #endif
+  }
+  delete []random_thread;
+  #endif
+  PairDPD::read_restart_settings(fp);
+  _nrandom_thread = comm->nthreads;
+
+  #ifdef LMP_USE_MKL_RNG
+
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
+		 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+
+  #else
+
+  random_thread =new RanMars*[comm->nthreads];
+  random_thread[0] = random;
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    if (tid > 0)
+      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
+  }
+  #endif
+
+  #endif
+}
--- a/src/USER-INTEL/pair_dpd_intel.h
+++ b/src/USER-INTEL/pair_dpd_intel.h
@ -0,0 +1,110 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+                        Shun Xu (Computer Network Information Center, CAS)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/intel,PairDPDIntel)
+
+#else
+
+#ifndef LMP_PAIR_DPD_INTEL_H
+#define LMP_PAIR_DPD_INTEL_H
+
+#include "pair_dpd.h"
+#include "fix_intel.h"
+
+#ifdef LMP_USE_MKL_RNG
+#include "mkl_vsl.h"
+#else
+#include "random_mars.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class PairDPDIntel : public PairDPD {
+
+ public:
+  PairDPDIntel(class LAMMPS *);
+  ~PairDPDIntel();
+
+  virtual void compute(int, int);
+  void settings(int, char **);
+  void init_style();
+  void read_restart_settings(FILE *);
+
+ private:
+  FixIntel *fix;
+  int _cop, _onetype, _nrandom_thread;
+
+  #ifdef LMP_USE_MKL_RNG
+  VSLStreamStatePtr *random_thread;
+  #else
+  RanMars **random_thread;
+  #endif
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
+
+    _alignvar(flt_t special_lj[4],64);
+    fc_packed1 **param;
+    flt_t **rand_buffer_thread;
+    int *rngi;
+
+    ForceConst() : _ntypes(0)  {}
+    ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, 
+                    Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@ -68,7 +68,7 @@ void VerletLRTIntel::init()

  _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));

-  #ifdef LMP_INTEL_NOLRT
+  #ifndef LMP_INTEL_USELRT
  error->all(FLERR,
             "LRT otion for Intel package disabled at compile time");
  #endif
--- a/src/USER-INTEL/verlet_lrt_intel.h
+++ b/src/USER-INTEL/verlet_lrt_intel.h
@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
 #include "verlet.h"
 #include "pppm_intel.h"

-#ifndef LMP_INTEL_USELRT
-#define LMP_INTEL_NOLRT
-#else
-
+#ifdef LMP_INTEL_USELRT
 #ifdef LMP_INTEL_LRT11
 #define _LMP_INTEL_LRT_11
 #include <thread>
--- a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
+++ b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
@ -134,7 +134,7 @@ public:
 // Manifold itself:
 manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp,
                                               int narg, char **arg)
-	: manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
+        : manifold(lmp), lut_z(NULL), lut_zp(NULL) {}


 manifold_gaussian_bump::~manifold_gaussian_bump()
@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut()
    n( x, nn );
    double taper_z;
    if( xx <= rc1 ){
-	    taper_z = gaussian_bump(xx);
+            taper_z = gaussian_bump(xx);
    }else if( xx < rc2 ){
-	    taper_z = lut_get_z( xx );
+            taper_z = lut_get_z( xx );
    }else{
-	    taper_z = 0.0;
+            taper_z = 0.0;
    }
-    fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
+    fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
             gg, nn[0], nn[1], nn[2] );
  }
  fclose(fp);
--- a/src/USER-MISC/fix_srp.cpp
+++ b/src/USER-MISC/fix_srp.cpp
@ -98,7 +98,7 @@ int FixSRP::setmask()

 void FixSRP::init()
 {
-  if (force->pair_match("hybrid",1) == NULL)
+  if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL)
    error->all(FLERR,"Cannot use pair srp without pair_style hybrid");

  int has_rigid = 0;
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :

  if (multiproc)
    error->all(FLERR,"Multi-processor writes are not supported.");
-  if (multifile)
-    error->all(FLERR,"Multiple files are not supported.");
+  if (append_flag && multifile)
+    error->all(FLERR,"Cannot append when writing to multiple files.");

  perat = new nc_perat_t[nfield];

@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF()

 void DumpNetCDF::openfile()
 {
+  char *filecurrent = filename;
+  if (multifile && !singlefile_opened) {
+    char *filestar = filecurrent;
+    filecurrent = new char[strlen(filestar) + 16];
+    char *ptr = strchr(filestar,'*');
+    *ptr = '\0';
+    if (padflag == 0)
+      sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
+              filestar,update->ntimestep,ptr+1);
+    else {
+      char bif[8],pad[16];
+      strcpy(bif,BIGINT_FORMAT);
+      sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]);
+      sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1);
+    }
+    *ptr = '*';
+  }
+
  if (thermo && !singlefile_opened) {
    if (thermovar)  delete [] thermovar;
    thermovar = new int[output->thermo->nfield];
@ -268,14 +286,14 @@ void DumpNetCDF::openfile()
  ntotalgr = group->count(igroup);

  if (filewriter) {
-    if (append_flag && access(filename, F_OK) != -1) {
+    if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
      // Fixme! Perform checks if dimensions and variables conform with
      // data structure standard.

      if (singlefile_opened) return;
      singlefile_opened = 1;

-      NCERRX( nc_open(filename, NC_WRITE, &ncid), filename );
+      NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent );

      // dimensions
      NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
@ -348,8 +366,8 @@ void DumpNetCDF::openfile()
      if (singlefile_opened) return;
      singlefile_opened = 1;

-      NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid),
-          filename );
+      NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid),
+              filecurrent );

      // dimensions
      NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
@ -598,15 +616,39 @@ void DumpNetCDF::closefile()
  if (filewriter && singlefile_opened) {
    NCERR( nc_close(ncid) );
    singlefile_opened = 0;
-    // append next time DumpNetCDF::openfile is called
-    append_flag = 1;
    // write to next frame upon next open
-    framei++;
+    if (multifile)
+      framei = 1;
+    else {
+      // append next time DumpNetCDF::openfile is called
+      append_flag = 1;
+      framei++;
+    }
  }
 }

 /* ---------------------------------------------------------------------- */

+template <typename T>
+int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp)
+{
+  return nc_put_var1_int(ncid, varid, index, tp);
+}
+
+template <>
+int nc_put_var1_bigint<long>(int ncid, int varid, const size_t index[],
+                        const long* tp)
+{
+  return nc_put_var1_long(ncid, varid, index, tp);
+}
+
+template <>
+int nc_put_var1_bigint<long long>(int ncid, int varid, const size_t index[],
+                             const long long* tp)
+{
+  return nc_put_var1_longlong(ncid, varid, index, tp);
+}
+
 void DumpNetCDF::write()
 {
  // open file
@ -638,13 +680,8 @@ void DumpNetCDF::write()
                  th->keyword[i] );
        }
        else if (th->vtype[i] == BIGINT) {
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-          NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
                  th->keyword[i] );
-#else
-          NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
-                  th->keyword[i] );
-#endif
        }
      }
    }
@ -887,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg)
    return 2;
  }
  else if (strcmp(arg[iarg],"at") == 0) {
+    if (!append_flag)
+      error->all(FLERR,"expected 'append yes' before 'at' keyword");
    iarg++;
    framei = force->inumeric(FLERR,arg[iarg]);
    if (framei < 0)  framei--;
@ -911,68 +950,6 @@ int DumpNetCDF::modify_param(int narg, char **arg)

 /* ---------------------------------------------------------------------- */

-void DumpNetCDF::write_prmtop()
-{
-  char fn[1024];
-  char tmp[81];
-  FILE *f;
-
-  strcpy(fn, filename);
-  strcat(fn, ".prmtop");
-
-  f = fopen(fn, "w");
-  fprintf(f, "%%VERSION  LAMMPS\n");
-  fprintf(f, "%%FLAG TITLE\n");
-  fprintf(f, "%%FORMAT(20a4)\n");
-  memset(tmp, ' ', 76);
-  tmp[76] = '\0';
-  fprintf(f, "NASN%s\n", tmp);
-
-  fprintf(f, "%%FLAG POINTERS\n");
-  fprintf(f, "%%FORMAT(10I8)\n");
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-  fprintf(f, "%8li", ntotalgr);
-#else
-  fprintf(f, "%8i", ntotalgr);
-#endif
-  for (int i = 0; i < 11; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-  for (int i = 0; i < 12; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-  for (int i = 0; i < 6; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-
-  fprintf(f, "%%FLAG ATOM_NAME\n");
-  fprintf(f, "%%FORMAT(20a4)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%4s", "He");
-    if ((i+1) % 20 == 0)
-      fprintf(f, "\n");
-  }
-
-  fprintf(f, "%%FLAG CHARGE\n");
-  fprintf(f, "%%FORMAT(5E16.5)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%16.5e", 0.0);
-    if ((i+1) % 5 == 0)
-      fprintf(f, "\n");
-  }
-
-  fprintf(f, "%%FLAG MASS\n");
-  fprintf(f, "%%FORMAT(5E16.5)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%16.5e", 1.0);
-    if ((i+1) % 5 == 0)
-        fprintf(f, "\n");
-  }
-  fclose(f);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void DumpNetCDF::ncerr(int err, const char *descr, int line)
 {
  if (err != NC_NOERR) {
--- a/src/USER-NETCDF/dump_netcdf.h
+++ b/src/USER-NETCDF/dump_netcdf.h
@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom {
  void closefile();
  virtual void write_header(bigint);
  virtual void write_data(int, double *);
-  void write_prmtop();

  virtual int modify_param(int, char **);

--- a/Show More
+++ b/Show More