diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bc33da60de..2cc11d4ecb 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -37,6 +37,10 @@ enable_language(CXX)
 #####################################################################
 include(CheckCCompilerFlag)
 
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")  # pass the variable name: an unquoted ${...} breaks if() when the ID is empty
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")  # Intel compilers need -restrict to accept the restrict keyword in C++
+endif()
+
 ########################################################################
 # User input options                                                   #
 ########################################################################
@@ -76,7 +80,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
 option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
 if(LAMMPS_EXCEPTIONS)
   add_definitions(-DLAMMPS_EXCEPTIONS)
-  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS")
+  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS")
 endif()
 
 set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically")
@@ -665,7 +669,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR})
 ############################################
 add_library(lammps ${LIB_SOURCES})
 target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
-add_dependencies(lammps ${LAMMPS_DEPS})
+if(LAMMPS_DEPS)
+  add_dependencies(lammps ${LAMMPS_DEPS})
+endif()
 set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE})
 if(BUILD_SHARED_LIBS)
   set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})
diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png
index 7ec83b3207..5061f1af2e 100755
Binary files a/doc/src/JPG/user_intel.png and b/doc/src/JPG/user_intel.png differ
diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index 6451581d3a..fa7db1d266 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -706,7 +706,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding
 quantum forces calculated by LATTE.
 
 More information on LATTE can be found at this web site:
-"https://github.com/lanl/LATTE"_#latte_home.  A brief technical
+"https://github.com/lanl/LATTE"_latte_home.  A brief technical
 description is given with the "fix latte"_fix_latte.html command.
 
 :link(latte_home,https://github.com/lanl/LATTE)
@@ -729,6 +729,7 @@ make lib-latte args="-b"                # download and build in lib/latte/LATTE-
 make lib-latte args="-p $HOME/latte"    # use existing LATTE installation in $HOME/latte
 make lib-latte args="-b -m gfortran"    # download and build in lib/latte and 
                                         #   copy Makefile.lammps.gfortran to Makefile.lammps
+:pre
 
 Note that 3 symbolic (soft) links, "includelink" and "liblink" and
 "filelink", are created in lib/latte to point into the LATTE home dir.
diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index 83e17b4f27..aaa38d7de2 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -25,14 +25,14 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
 [Currently Available USER-INTEL Styles:]
 
 Angle Styles: charmm, harmonic :ulb,l
-Bond Styles: fene, harmonic :l
+Bond Styles: fene, fourier, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
-Fixes: nve, npt, nvt, nvt/sllod :l
+Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l
 Improper Styles: cvff, harmonic :l
 Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
-buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
-lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
-sw, tersoff :l
+buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
+lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, 
+rebo, sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule
 
@@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks).
 :c,image(JPG/user_intel.png)
 
 Results are speedups obtained on Intel Xeon E5-2697v4 processors
-(code-named Broadwell) and Intel Xeon Phi 7250 processors
-(code-named Knights Landing) with "June 2017" LAMMPS built with
-Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
-per physical core. See {src/USER-INTEL/TEST/README} for the raw
-simulation rates and instructions to reproduce.
+(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named
+Knights Landing), and Intel Xeon Gold 6148 processors (code-named
+Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio
+2017 update 2. Results are with 1 MPI task per physical core. See
+{src/USER-INTEL/TEST/README} for the raw simulation rates and
+instructions to reproduce.
 
 :line
 
@@ -82,6 +83,11 @@ this order :l
 The {newton} setting applies to all atoms, not just atoms shared
 between MPI tasks :l
 Vectorization can change the order for adding pairwise forces :l
+When using the -DLMP_USE_MKL_RNG define (all included intel optimized
+makefiles do) at build time, the random number generator for
+dissipative particle dynamics (pair style dpd/intel) uses the Mersenne
+Twister generator included in the Intel MKL library (that should be
+more robust than the default Marsaglia random number generator) :l
 :ule
 
 The precision mode (described below) used with the USER-INTEL
@@ -108,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
-For simulations on higher node counts, add "processors * * * grid 
+For simulations on higher node counts, add "processors * * * grid
 numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
@@ -119,8 +125,8 @@ For Intel Xeon Phi CPUs:
 Runs should be performed using MCDRAM. :ulb,l
 :ule
 
-For simulations using {kspace_style pppm} on Intel CPUs
-supporting AVX-512:
+For simulations using {kspace_style pppm} on Intel CPUs supporting
+AVX-512:
 
 Add "kspace_modify diff ad" to the input script :ulb,l
 The command-line option should be changed to
@@ -237,14 +243,17 @@ However, if you do not have coprocessors on your system, building
 without offload support will produce a smaller binary.
 
 The general requirements for Makefiles with the USER-INTEL package
-are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
-using Intel compilers, "-restrict" is required and "-qopenmp" is
-highly recommended for CCFLAGS and LINKFLAGS. LIB should include
-"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
-is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
-Other recommended CCFLAG options for best performance are
-"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2
--no-prec-div".
+are as follows. When using Intel compilers, "-restrict" is required 
+and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. 
+CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads
+are not supported in the build environment) and "-DLMP_USE_MKL_RNG"
+(unless Intel Math Kernel Library (MKL) is not available in the build
+environment). For Intel compilers, LIB should include "-ltbbmalloc" 
+or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added
+to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is
+required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
+recommended CCFLAG options for best performance are "-O2 -fno-alias
+-ansi-alias -qoverride-limits -fp-model fast=2 -no-prec-div".
 
 NOTE: The vectorization and math capabilities can differ depending on
 the CPU. For Intel compilers, the "-x" flag specifies the type of
diff --git a/doc/src/atom_modify.txt b/doc/src/atom_modify.txt
index d5c82f16ac..1dc0fa6bfb 100644
--- a/doc/src/atom_modify.txt
+++ b/doc/src/atom_modify.txt
@@ -16,7 +16,7 @@ atom_modify keyword values ... :pre
 one or more keyword/value pairs may be appended :ulb,l
 keyword = {id} or {map} or {first} or {sort} :l
    {id} value = {yes} or {no}
-   {map} value = {array} or {hash}
+   {map} value = {yes} or {array} or {hash}
    {first} value = group-ID = group whose atoms will appear first in internal atom lists
    {sort} values = Nfreq binsize
      Nfreq = sort atoms spatially every this many time steps
@@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l
 
 [Examples:]
 
-atom_modify map hash
-atom_modify map array sort 10000 2.0
+atom_modify map yes
+atom_modify map hash sort 10000 2.0
 atom_modify first colloid :pre
 
 [Description:]
@@ -62,29 +62,33 @@ switch.  This is described in "Section 2.2"_Section_start.html#start_2
 of the manual.  If atom IDs are not used, they must be specified as 0
 for all atoms, e.g. in a data or restart file.
 
-The {map} keyword determines how atom ID lookup is done for molecular
-atom styles.  Lookups are performed by bond (angle, etc) routines in
-LAMMPS to find the local atom index associated with a global atom ID.
+The {map} keyword determines how atoms with specific IDs are found
+when required.  Examples are the bond (angle, etc) methods which
+need to find the local index of an atom with a specific global ID
+which is a bond (angle, etc) partner.  LAMMPS performs this operation
+efficiently by creating a "map", which is either an {array} or {hash}
+table, as described below.
 
-When the {array} value is used, each processor stores a lookup table
-of length N, where N is the largest atom ID in the system.  This is a
+When the {map} keyword is not specified in your input script, LAMMPS
+only creates a map for "atom_styles"_atom_style.html for molecular
+systems which have permanent bonds (angles, etc).  No map is created
+for atomic systems, since it is normally not needed.  However some
+LAMMPS commands require a map, even for atomic systems, and will
+generate an error if one does not exist.  The {map} keyword thus
+allows you to force the creation of a map.  The {yes} value will
+create either an {array} or {hash} style map, as explained in the next
+paragraph.  The {array} and {hash} values create an array-style or
+hash-style map respectively.
+
+For an {array}-style map, each processor stores a lookup table of
+length N, where N is the largest atom ID in the system.  This is a
 fast, simple method for many simulations, but requires too much memory
-for large simulations.  The {hash} value uses a hash table to perform
-the lookups.  This can be slightly slower than the {array} method, but
-its memory cost is proportional to the number of atoms owned by a
-processor, i.e. N/P when N is the total number of atoms in the system
-and P is the number of processors.
-
-When this setting is not specified in your input script, LAMMPS
-creates a map, if one is needed, as an array or hash.  See the
-discussion of default values below for how LAMMPS chooses which kind
-of map to build.  Note that atomic systems do not normally need to
-create a map.  However, even in this case some LAMMPS commands will
-create a map to find atoms (and then destroy it), or require a
-permanent map.  An example of the former is the "velocity loop
-all"_velocity.html command, which uses a map when looping over all
-atoms and insuring the same velocity values are assigned to an atom
-ID, no matter which processor owns it.
+for large simulations.  For a {hash}-style map, a hash table is
+created on each processor, which finds an atom ID in constant time
+(independent of the global number of atom IDs).  It can be slightly
+slower than the {array} map, but its memory cost is proportional to
+the number of atoms owned by a processor, i.e. N/P when N is the total
+number of atoms in the system and P is the number of processors.
 
 The {first} keyword allows a "group"_group.html to be specified whose
 atoms will be maintained as the first atoms in each processor's list
diff --git a/doc/src/dihedral_fourier.txt b/doc/src/dihedral_fourier.txt
index da892b59da..0accbb22bf 100644
--- a/doc/src/dihedral_fourier.txt
+++ b/doc/src/dihedral_fourier.txt
@@ -7,6 +7,7 @@
 :line
 
 dihedral_style fourier command :h3
+dihedral_style fourier/intel command :h3
 dihedral_style fourier/omp command :h3
 
 [Syntax:]
diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt
index 2ea1da3db3..38d9aad4d9 100644
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... :pre
 dump-ID = ID of dump to modify :ulb,l
 one or more keyword/value pairs may be appended :l
 these keywords apply to various dump styles :l
-keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
-  {append} arg = {yes} or {no} or {at} N
+keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
+  {append} arg = {yes} or {no}
+  {at} arg = N
     N = index of frame written upon first dump
+    only available after "append yes"
   {buffer} arg = {yes} or {no}
   {element} args = E1 E2 ... EN, where N = # of atom types
     E1,...,EN = element name, e.g. C or Fe or Ga
diff --git a/doc/src/dump_netcdf.txt b/doc/src/dump_netcdf.txt
index 63568137a6..70111a36a8 100644
--- a/doc/src/dump_netcdf.txt
+++ b/doc/src/dump_netcdf.txt
@@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule
 
 dump 1 all netcdf 100 traj.nc type x y z vx vy vz
 dump_modify 1 append yes at -1 thermo yes
-dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre
+dump 1 all netcdf/mpiio 1000 traj.nc id type x y z
+dump 1 all netcdf 1000 traj.*.nc id type x y z :pre
 
 [Description:]
 
@@ -73,4 +74,3 @@ section for more info.
 [Related commands:]
 
 "dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html
-
diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt
index f78e13b866..4edd610546 100644
--- a/doc/src/fix_latte.txt
+++ b/doc/src/fix_latte.txt
@@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond
 integrals are parameterized using a Slater-Koster tight-binding
 approach. This procedure, which usually is referred to as the DFTB
 method has been described in detail by ("Elstner"_#Elstner) and
-("Finnis"_#Finnis) and coworkers. 
+("Finnis"_#Finnis2) and coworkers. 
 
 The work of the LATTE developers follows that of Elstner closely with
 respect to the physical model.  However, the development of LATTE is
@@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
 M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
 7260 (1998).
 
-:link(Finnis)
+:link(Finnis2)
 [(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van
 Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998).
 
@@ -197,11 +197,11 @@ J. Sci. Comput. 36 (2), 147-170, (2014).
 [(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys.,
 141, 164123, (2014).
 
-:link(Niklasson2014)
+:link(Niklasson2017)
 [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017).
 
-:link(Niklasson2012)
-[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
+:link(Cawkwell2012)
+[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
 (17), 174308 (2012).
 
 :link(Negre2016)
diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt
index 52d8a7df84..73b3e31266 100644
--- a/doc/src/fix_neb.txt
+++ b/doc/src/fix_neb.txt
@@ -93,7 +93,7 @@ intermediate replica with the previous and the next image:
 
 Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
 
-Note that in this case the specified {Kspring) is in force/distance
+Note that in this case the specified {Kspring} is in force/distance
 units.
 
 With a value of {ideal}, the spring force is computed as suggested in
@@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and
 RDideal is the ideal RD for which all the images are equally spaced.
 I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
 I is the replica number).  The meanDist is the average distance
-between replicas.  Note that in this case the specified {Kspring) is
+between replicas.  Note that in this case the specified {Kspring} is
 in force units.
 
 Note that the {ideal} form of nudging can often be more effective at
diff --git a/doc/src/fix_nh.txt b/doc/src/fix_nh.txt
index 8fa30ac222..41d0e6438f 100644
--- a/doc/src/fix_nh.txt
+++ b/doc/src/fix_nh.txt
@@ -393,32 +393,36 @@ thermostatting and barostatting.
 :line
 
 These fixes compute a temperature and pressure each timestep.  To do
-this, the fix creates its own computes of style "temp" and "pressure",
-as if one of these two sets of commands had been issued:
+this, the thermostat and barostat fixes create their own computes of
+style "temp" and "pressure", as if one of these sets of commands had
+been issued:
 
+For fix nvt:
 compute fix-ID_temp group-ID temp
-compute fix-ID_press group-ID pressure fix-ID_temp :pre
 
+For fix npt and fix nph:
 compute fix-ID_temp all temp
 compute fix-ID_press all pressure fix-ID_temp :pre
 
-See the "compute temp"_compute_temp.html and "compute
-pressure"_compute_pressure.html commands for details.  Note that the
-IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID
-+ underscore + "press".  For fix nvt, the group for the new computes
-is the same as the fix group.  For fix nph and fix npt, the group for
-the new computes is "all" since pressure is computed for the entire
-system.
+For fix nvt, the group for the new temperature compute is the same as
+the fix group.  For fix npt and fix nph, the group for both the new
+temperature and pressure compute is "all" since pressure is computed
+for the entire system.  In the case of fix nph, the temperature
+compute is not used for thermostatting, but just for a kinetic-energy
+contribution to the pressure.  See the "compute
+temp"_compute_temp.html and "compute pressure"_compute_pressure.html
+commands for details.  Note that the IDs of the new computes are the
+fix-ID + underscore + "temp" or fix-ID + underscore + "press".
 
 Note that these are NOT the computes used by thermodynamic output (see
 the "thermo_style"_thermo_style.html command) with ID = {thermo_temp}
-and {thermo_press}.  This means you can change the attributes of this
+and {thermo_press}.  This means you can change the attributes of these
 fix's temperature or pressure via the
-"compute_modify"_compute_modify.html command or print this temperature
-or pressure during thermodynamic output via the "thermo_style
-custom"_thermo_style.html command using the appropriate compute-ID.
-It also means that changing attributes of {thermo_temp} or
-{thermo_press} will have no effect on this fix.
+"compute_modify"_compute_modify.html command.  Or you can print this
+temperature or pressure during thermodynamic output via the
+"thermo_style custom"_thermo_style.html command using the appropriate
+compute-ID.  It also means that changing attributes of {thermo_temp}
+or {thermo_press} will have no effect on this fix.
 
 Like other fixes that perform thermostatting, fix nvt and fix npt can
 be used with "compute commands"_compute.html that calculate a
diff --git a/doc/src/fixes.txt b/doc/src/fixes.txt
index b93bb9d0a2..97a7b58050 100644
--- a/doc/src/fixes.txt
+++ b/doc/src/fixes.txt
@@ -59,6 +59,7 @@ Fixes :h1
    fix_langevin
    fix_langevin_drude
    fix_langevin_eff
+   fix_latte
    fix_lb_fluid
    fix_lb_momentum
    fix_lb_pc
diff --git a/doc/src/lammps.book b/doc/src/lammps.book
index 77c70775d7..6a2c422a83 100644
--- a/doc/src/lammps.book
+++ b/doc/src/lammps.book
@@ -188,6 +188,7 @@ fix_ipi.html
 fix_langevin.html
 fix_langevin_drude.html
 fix_langevin_eff.html
+fix_latte.html
 fix_lb_fluid.html
 fix_lb_momentum.html
 fix_lb_pc.html
diff --git a/doc/src/package.txt b/doc/src/package.txt
index 58f6a5e34d..5c698934e8 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -62,7 +62,7 @@ args = arguments specific to the style :l
       {no_affinity} values = none
   {kokkos} args = keyword value ...
     zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward}
+    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
       {neigh} value = {full} or {half}
         full = full neighbor list
         half = half neighbor list built in thread-safe manner
@@ -75,9 +75,10 @@ args = arguments specific to the style :l
       {binsize} value = size
         size = bin size for neighbor list construction (distance units)
       {comm} value = {no} or {host} or {device}
-        use value for both comm/exchange and comm/forward
+        use value for comm/exchange and comm/forward and comm/reverse
       {comm/exchange} value = {no} or {host} or {device}
       {comm/forward} value = {no} or {host} or {device}
+      {comm/reverse} value = {no} or {host} or {device}
         no = perform communication pack/unpack in non-KOKKOS mode
         host = perform pack/unpack on host (e.g. with OpenMP threading)
         device = perform pack/unpack on device (e.g. on GPU)
@@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at
 performing pairwise interactions, then this rule of thumb may give too
 large a binsize.
 
-The {comm} and {comm/exchange} and {comm/forward} keywords determine
+The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
 whether the host or device performs the packing and unpacking of data
 when communicating per-atom data between processors.  "Exchange"
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
-"Forward" communication happens every timestep.  The data is for atom
+"Forward" communication happens every timestep. "Reverse" communication
+happens every timestep if the {newton} option is on.  The data is for atom
 coordinates and any other atom properties that needs to be updated for
 ghost atoms owned by each processor.
 
 The {comm} keyword is simply a short-cut to set the same value
-for both the {comm/exchange} and {comm/forward} keywords.
+for the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
 
 The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
diff --git a/doc/src/pair_dpd.txt b/doc/src/pair_dpd.txt
index 8d194bb092..9e29e93430 100644
--- a/doc/src/pair_dpd.txt
+++ b/doc/src/pair_dpd.txt
@@ -8,6 +8,7 @@
 
 pair_style dpd command :h3
 pair_style dpd/gpu command :h3
+pair_style dpd/intel command :h3
 pair_style dpd/omp command :h3
 pair_style dpd/tstat command :h3
 pair_style dpd/tstat/gpu command :h3
diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt
index a0026432ec..03e77f53ab 100644
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix.
 
 Style {eam/fs} computes pairwise interactions for metals and metal
 alloys using a generalized form of EAM potentials due to Finnis and
-Sinclair "(Finnis)"_#Finnis.  The total energy Ei of an atom I is
+Sinclair "(Finnis)"_#Finnis1.  The total energy Ei of an atom I is
 given by
 
 :c,image(Eqs/pair_eam_fs.jpg)
@@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004).
 [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983).
 Daw, Baskes, Phys Rev B, 29, 6443 (1984).
 
-:link(Finnis)
+:link(Finnis1)
 [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).
 
 :link(Stukowski)
diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index 43d3f17d63..d414056187 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,24 @@
 # Change Log
 
+## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
+
+**Implemented enhancements:**
+
+- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
+- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
+- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
+- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
+- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
+- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
+
+**Fixed bugs:**
+
+- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
+- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
+- (Experimental) HBWSpace  Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
+- (Experimental) ROCm:  algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
+
 ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
 
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index b8236e8fd1..4641232a1f 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -443,7 +443,7 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
   KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
   KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
-  KOKKOS_LIBS += -lmemkind
+  KOKKOS_LIBS += -lmemkind -lnuma
   tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
 endif
 
@@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
   else
-    # Assume that this is a really a GNU compiler or it could be XL on P8.
-    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
-    KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this is really a GNU compiler on P8.
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+      endif
+    endif
   endif
 endif
 
@@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
   else
-    # Assume that this is a really a GNU compiler or it could be XL on P9.
-    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
-    KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this is really a GNU compiler on P9
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+      endif
+    endif
   endif
 endif
 
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index 9082e47052..3db9a145d7 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
 }
 
 
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCM) 
+
+  template<>
+  class Random_XorShift1024<Kokkos::Experimental::ROCm> {  // ROCm specialization; reads its state in place through a pointer + stride
+  private:
+    int p_;                // index of the current word in the 16-word state (advanced modulo 16)
+    const int state_idx_;  // pool entry this generator was constructed from
+    uint64_t* state_;      // address of word 0 of that pool entry
+    const int stride_;     // element distance between consecutive state words (state.stride_1())
+    friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
+  public:
+
+    typedef Kokkos::Experimental::ROCm device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {  // advance the generator one step and return a 32-bit value
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {  // advance the generator one step and return a 64-bit value
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {  // value in [0,range); range must be non-zero
+      const uint32_t max_val = (MAX_URAND/range)*range;  // largest multiple of range; samples at or above it are rejected
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();  // keep the redrawn sample; discarding it made this loop spin forever
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {  // value in [start,end)
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {  // value in [0,range); range must be non-zero
+      const uint64_t max_val = (MAX_URAND64/range)*range;  // largest multiple of range; samples at or above it are rejected
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();  // keep the redrawn sample; discarding it made this loop spin forever
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {  // value in [start,end)
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {  // urand() halved and returned as int
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {  // value in [0,range); range must be non-zero
+      const int max_val = (MAX_RAND/range)*range;  // largest multiple of range; samples at or above it are rejected
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();  // keep the redrawn sample; discarding it made this loop spin forever
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {  // value in [start,end)
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {  // urand64() halved and returned as int64_t
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {  // value in [0,range); range must be non-zero
+      const int64_t max_val = (MAX_RAND64/range)*range;  // largest multiple of range; samples at or above it are rejected
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();  // keep the redrawn sample; discarding it made this loop spin forever
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {  // value in [start,end)
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {  // urand64() scaled by 1/MAX_URAND64
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {  // urand64() scaled by range/MAX_URAND64
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {  // frand(end-start) shifted by start
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {  // urand64() scaled by 1/MAX_URAND64
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {  // urand64() scaled by range/MAX_URAND64
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {  // drand(end-start) shifted by start
+      return drand(end-start)+start;  // was frand(), which truncated the draw to float precision
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;  // any value >= 1.0 so the rejection loop runs at least once
+      double U;
+      while(S>=1.0) {  // redraw until (U,V) lies strictly inside the unit circle
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {  // standard normal draw scaled by std_dev and shifted by mean
+      return mean + normal()*std_dev;
+    }
+  };
+
+template<>
+inline
+Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {  // seeded constructor, ROCm specialization
+  num_states_ = 0;  // start with a zero state count before calling init()
+  init(seed,4*32768);  // second argument is presumably the number of pool states -- TODO confirm against init()
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {  // hand out a generator built from one pool entry
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;  // flattened thread index within the block
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;  // flattened global thread index, wrapped into the pool size
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {  // repeat while the exchange returns non-zero (presumably: entry already locked -- confirm)
+      i+=blockDim_x*blockDim_y*blockDim_z;  // try the entry one block-size further on
+      if(i>=num_states_) {i = i_offset;}  // past the end of the pool: restart at this thread's in-block offset
+  }
+
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
+#else
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);  // without __HCC_ACCELERATOR__: always entry 0, no locking
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {  // hand a generator back to the pool; empty body without __HCC_ACCELERATOR__
+#ifdef __HCC_ACCELERATOR__
+  state_(state.state_idx_) = state.state_;  // store the generator's state into the pool entry it names
+  locks_(state.state_idx_) = 0;  // reset that entry's lock flag to 0
+  return;
+#endif
+}
+
+
+template<>
+inline
+Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {  // seeded constructor, ROCm specialization
+  num_states_ = 0;  // start with a zero state count before calling init()
+  init(seed,4*32768);  // second argument is presumably the number of pool states -- TODO confirm against init()
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {  // hand out a generator that views one pool entry
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;  // flattened thread index within the block
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;  // flattened global thread index, wrapped into the pool size
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {  // repeat while the exchange returns non-zero (presumably: entry already locked -- confirm)
+      i+=blockDim_x*blockDim_y*blockDim_z;  // try the entry one block-size further on
+      if(i>=num_states_) {i = i_offset;}  // past the end of the pool: restart at this thread's in-block offset
+  }
+
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);  // passes the whole state view plus the entry's saved position p_(i)
+#else
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);  // without __HCC_ACCELERATOR__: always entry 0, no locking
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {  // hand a generator back to the pool; empty body without __HCC_ACCELERATOR__
+#ifdef __HCC_ACCELERATOR__
+  for(int i=0; i<16; i++)  // copy all 16 state words into the pool entry
+    state_(state.state_idx_,i) = state.state_[i];  // NOTE(review): indexes state.state_ without stride_, unlike urand(); confirm for non-unit strides
+  locks_(state.state_idx_) = 0;  // reset that entry's lock flag to 0
+  return;
+#endif
+}
+
+
 #endif
 
 
diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile
index b74192ef18..a5a10c82ee 100644
--- a/lib/kokkos/algorithms/unit_tests/Makefile
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	TEST_TARGETS += test-cuda
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+	OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_ROCm
+	TEST_TARGETS += test-rocm
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Threads
@@ -51,6 +57,9 @@ endif
 KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
 
+KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
+
 KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
 
@@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 test-cuda: KokkosAlgorithms_UnitTest_Cuda
 	./KokkosAlgorithms_UnitTest_Cuda
 
+test-rocm: KokkosAlgorithms_UnitTest_ROCm
+	./KokkosAlgorithms_UnitTest_ROCm
+
 test-threads: KokkosAlgorithms_UnitTest_Threads
 	./KokkosAlgorithms_UnitTest_Threads
 
diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp
new file mode 100644
index 0000000000..720b377ed2
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {  // gtest cases running the random-pool and sort tests on Kokkos::Experimental::ROCm
+
+class rocm : public ::testing::Test {  // fixture shared by every TEST_F( rocm, ... ) below
+protected:
+  static void SetUpTestCase()  // configures std::cout formatting and initializes both execution spaces
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();  // host execution space is initialized before ROCm
+    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );  // device id 0 is hard-coded
+  }
+  static void TearDownTestCase()  // finalizes in the reverse order of SetUpTestCase
+  {
+    Kokkos::Experimental::ROCm::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+void rocm_test_random_xorshift64( int num_draws  )  // forwards num_draws to Impl::test_random with the XorShift64 ROCm pool
+{
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+void rocm_test_random_xorshift1024( int num_draws  )  // forwards num_draws to Impl::test_random with the XorShift1024 ROCm pool
+{
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+
+#define ROCM_RANDOM_XORSHIFT64( num_draws )  \
+  TEST_F( rocm, Random_XorShift64 ) {        \
+  rocm_test_random_xorshift64(num_draws);    \
+  }
+
+#define ROCM_RANDOM_XORSHIFT1024( num_draws )  \
+  TEST_F( rocm, Random_XorShift1024 ) {        \
+  rocm_test_random_xorshift1024(num_draws);    \
+  }
+
+#define ROCM_SORT_UNSIGNED( size )                                    \
+  TEST_F( rocm, SortUnsigned ) {                                      \
+      Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size);  \
+  }
+
+ROCM_RANDOM_XORSHIFT64(  132141141 )  // defines TEST_F( rocm, Random_XorShift64 ) with this num_draws
+ROCM_RANDOM_XORSHIFT1024( 52428813 )  // defines TEST_F( rocm, Random_XorShift1024 ) with this num_draws
+ROCM_SORT_UNSIGNED(171)               // defines TEST_F( rocm, SortUnsigned ) with size 171
+
+#undef ROCM_RANDOM_XORSHIFT64
+#undef ROCM_RANDOM_XORSHIFT1024
+#undef ROCM_SORT_UNSIGNED
+}
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_ROCM */
+
diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind
index ca34648780..b88b334f8b 100755
--- a/lib/kokkos/bin/hpcbind
+++ b/lib/kokkos/bin/hpcbind
@@ -27,7 +27,7 @@ fi
 HPCBIND_HWLOC_PARENT_CPUSET=""
 if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
   MY_PID="$BASHPID"
-  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
+  HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | awk -v pid="${MY_PID}" '$1 == pid { print $2 }')"  # exact match on the PID column; a plain grep also matches PIDs/names that merely contain the digits
 fi
 
 ################################################################################
@@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
 ################################################################################
 HPCBIND_QUEUE_NAME=""
 declare -i HPCBIND_QUEUE_INDEX=0
-declare -i HPCBIND_QUEUE_GPU_MAPPING=0
+declare -i HPCBIND_QUEUE_MAPPING=0
 
-if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
-  HPCBIND_QUEUE_NAME="sbatch"
+if [[ ! -z "${PMI_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mpich"
+  HPCBIND_QUEUE_INDEX=${PMI_RANK}
+elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="openmpi"
+  HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
+elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mvapich2"
+  HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
+elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="slurm"
   HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
 elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
+  HPCBIND_QUEUE_MAPPING=1
   HPCBIND_QUEUE_NAME="bsub"
   HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
 elif [[ ! -z "${ALPS_APP_PE}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
+  HPCBIND_QUEUE_MAPPING=1
   HPCBIND_QUEUE_NAME="aprun"
   HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
 fi
 
-
 ################################################################################
 # Show help
 ################################################################################
@@ -91,13 +102,14 @@ function show_help {
   echo "  --proc-bind=<LOC>     Set the initial process mask for the script"
   echo "                        LOC can be any valid location argument for"
   echo "                        hwloc-calc  Default: all"
+  echo "  --whole-system        ${cmd} will ignore its parent process binding"
   echo "  --distribute=N        Distribute the current cpuset into N partitions"
   echo "  --distribute-partition=I"
   echo "                        Use the i'th partition (zero based)"
   echo "  --visible-gpus=<L>    Comma separated list of gpu ids"
   echo "                        Default: CUDA_VISIBLE_DEVICES or all gpus in"
   echo "                        sequential order"
-  echo "  --gpu-ignore-queue    Ignore queue job id when choosing visible GPU"
+  echo "  --ignore-queue        Ignore queue job id when choosing visible GPU and partition"
   echo "  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES"
   echo "  --openmp=M.m          Set env variables for the given OpenMP version"
   echo "                        Default: 4.0"
@@ -110,22 +122,30 @@ function show_help {
   echo "  --force-openmp-proc-bind=<OP>"
   echo "                        Override logic for selecting OMP_PROC_BIND"
   echo "  --no-openmp-nested    Set OMP_NESTED to false"
-  echo "  --show-bindings       Show the bindings"
-  echo "  --lstopo              Show bindings in lstopo without executing a command"
-  echo "  -v|--verbose          Show options and relevant environment variables"
+  echo "  --output-prefix=<P>   Save the output to files of the form"
+  echo "                        P-N.hpc.log, P-N.out and P-N.err where P is the"  # names match HPCBIND_LOG/OUT/ERR
+  echo "                        prefix and N is the queue index or mpi rank (no spaces)"
+  echo "  --output-mode=<Op>    How console output should be handled."
+  echo "                        Options are all, rank0, and none.  Default: rank0"
+  echo "  --lstopo              Show bindings in lstopo"
+  echo "  -v|--verbose          Print bindings and relevant environment variables"
   echo "  -h|--help             Show this message"
   echo ""
   echo "Sample Usage:"
   echo "  Split the current process cpuset into 4 and use the 3rd partition"
   echo "    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
-  echo "  Bing the process to all even cores"
+  echo "  Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
+  echo "  and save the output to rank specific files"
+  echo "    mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
+  echo "      --distribute=4 -v --output-prefix=output  -- command ..."
+  echo "  Bind the process to all even cores"
   echo "    ${cmd} --proc-bind=core:even -v -- command ..."
-  echo "  Bind to the first 64 cores and split the current process cpuset into 4"
-  echo "    ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
-  echo "  skip GPU 0 when mapping visible devices"
+  echo "  Bind to the even cores of socket 0 and the odd cores of socket 1"
+  echo "    ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
+  echo "  Skip GPU 0 when mapping visible devices"
   echo "    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
   echo "  Display the current bindings"
-  echo "    ${cmd} --proc-bind=numa:0 --show-bindings -- command"
+  echo "    ${cmd} --proc-bind=numa:0 -v -- command"  # -v is required: bindings are only printed to the screen in verbose mode
   echo "  Display the current bindings using lstopo"
   echo "    ${cmd} --proc-bind=numa:0.core:odd --lstopo"
   echo ""
@@ -144,7 +164,7 @@ fi
 declare -a UNKNOWN_ARGS=()
 declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
 declare -i HPCBIND_DISTRIBUTE=1
-declare -i HPCBIND_PARTITION=0
+declare -i HPCBIND_PARTITION=-1
 HPCBIND_PROC_BIND="all"
 HPCBIND_OPENMP_VERSION=4.0
 declare -i HPCBIND_OPENMP_PERCENT=100
@@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
 HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
 declare -i HPCBIND_VERBOSE=0
 
-declare -i HPCBIND_SHOW_BINDINGS=0
 declare -i HPCBIND_LSTOPO=0
 
-for i in $@; do
-  case $i in
+HPCBIND_OUTPUT_PREFIX=""
+HPCBIND_OUTPUT_MODE="rank0"
+
+declare -i HPCBIND_HAS_COMMAND=0
+
+for i in "$@"; do
+  case "$i" in
     # number of partitions to create
     --no-hwloc-bind)
       HPCBIND_ENABLE_HWLOC_BIND=0
@@ -169,6 +193,10 @@ for i in $@; do
       HPCBIND_PROC_BIND="${i#*=}"
       shift
       ;;
+    --whole-system)
+      HPCBIND_HWLOC_PARENT_CPUSET=""
+      shift
+      ;;
     --distribute=*)
       HPCBIND_DISTRIBUTE="${i#*=}"
       shift
@@ -182,8 +210,8 @@ for i in $@; do
       HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
       shift
       ;;
-    --gpu-ignore-queue)
-      HPCBIND_QUEUE_GPU_MAPPING=0
+    --ignore-queue)
+      HPCBIND_QUEUE_MAPPING=0
       shift
       ;;
     --no-gpu-mapping)
@@ -218,14 +246,18 @@ for i in $@; do
       HPCBIND_OPENMP_NESTED="false"
       shift
       ;;
-    --show-bindings)
-      HPCBIND_VERBOSE=1
-      HPCBIND_SHOW_BINDINGS=1
+    --output-prefix=*)
+      HPCBIND_OUTPUT_PREFIX="${i#*=}"
+      shift
+      ;;
+    --output-mode=*)
+      HPCBIND_OUTPUT_MODE="${i#*=}"
+      #convert to lower case
+      HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
       shift
       ;;
     --lstopo)
       HPCBIND_VERBOSE=1
-      HPCBIND_SHOW_BINDINGS=0
       HPCBIND_LSTOPO=1
       shift
       ;;
@@ -239,6 +271,7 @@ for i in $@; do
       ;;
     # ignore remaining arguments
     --)
+      HPCBIND_HAS_COMMAND=1
       shift
       break
       ;;
@@ -250,16 +283,41 @@ for i in $@; do
   esac
 done
 
+################################################################################
+# Check output mode
+################################################################################
+declare -i HPCBIND_TEE=0
+
+if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
+  HPCBIND_TEE=0
+elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
+  HPCBIND_TEE=1
+elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
+  #default to rank0 printing to screen
+  HPCBIND_TEE=1
+fi
+
+
+if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
+  HPCBIND_LOG=/dev/null
+  HPCBIND_ERR=/dev/null
+  HPCBIND_OUT=/dev/null
+else
+  HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
+  HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
+  HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
+  > ${HPCBIND_LOG}
+fi
+
 
 ################################################################################
 # Check unknown arguments
 ################################################################################
 if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
-  echo "Uknown options: ${UNKNOWN_ARGS[*]}"
+  echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
   exit 1
 fi
 
-
 ################################################################################
 # Check that visible gpus are valid
 ################################################################################
@@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
   for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
     if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
       ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
-      echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
+      echo "HPCBIND Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
       HPCBIND_VISIBLE_GPUS[$i]=0;
     fi
   done
   NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
 fi
 
-
 ################################################################################
 # Check OpenMP percent
 ################################################################################
 if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
-  echo "OpenMP percent < 1, setting to 1"
   HPCBIND_OPENMP_PERCENT=1
 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
-  echo "OpenMP percent > 100, setting to 100"
   HPCBIND_OPENMP_PERCENT=100
 fi
 
@@ -291,15 +346,21 @@ fi
 # Check distribute
 ################################################################################
 if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
-  echo "Invalid input for distribute, changing distribute to 1"
   HPCBIND_DISTRIBUTE=1
 fi
 
-if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
-  echo "Invalid input for distribute-partition, changing to 0"
+################################################################################
+#choose the correct partition
+################################################################################
+if [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
+  # negative partition: follow the queue index when queue mapping is on, else use 0
+  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
+    HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
+  else
+    HPCBIND_PARTITION=0
+  fi
+fi
+if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE)); fi  # wrap out-of-range partitions
 
 ################################################################################
 # Find cpuset and num threads
@@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0
 
 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
   if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
-    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
+    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
   else
-    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
+    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
   fi
 
-  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
-  HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
+  if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then  # only call hwloc-distrib when more than one partition is requested
+    CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))  # array built from the words of hwloc-distrib's output
+    HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"  # entry selected by this process's partition index
+  else
+    HPCBIND_HWLOC_CPUSET="${BINDING}"  # single partition: use the computed binding unchanged
+  fi
   HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
 else
   HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
@@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
 ################################################################################
 
 if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
-  if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
+  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
     declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
   else
     declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
     declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
   fi
 fi
 
@@ -389,22 +454,22 @@ fi
 export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
 export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
 export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
-export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
+export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
 export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
 export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
 if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
   export HPCBIND_HWLOC_PARENT_CPUSET="all"
 else
-  export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
+  export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
 fi
-export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
+export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
 export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
 export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
-export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
+export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
 if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
   export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
-  export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
-  export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
+  export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
+  export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
 fi
 
 
@@ -412,43 +477,63 @@ fi
 # Print verbose
 ################################################################################
 
-if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
-  MY_ENV=$(env | sort)
-  echo "[HPCBIND]"
-  echo "${MY_ENV}" | grep -E "^HPCBIND_"
-  echo "[CUDA]"
-  echo "${MY_ENV}" | grep -E "^CUDA_"
-  echo "[OPENMP]"
-  echo "${MY_ENV}" | grep -E "^OMP_"
-fi
+TMP_ENV=$(env | sort)
+if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
+  echo "[HOST]" >> ${HPCBIND_LOG}
+  hostname -s >> ${HPCBIND_LOG}
+  echo "[HPCBIND]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
+  echo "[CUDA]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
+  echo "[OPENMP]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
 
-if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
-  echo "[BINDINGS]"
-  hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
-elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
-  echo "Unable to show bindings, hwloc not available."
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" >> ${HPCBIND_LOG}
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
+  else
+    echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
+  fi
+else
+  echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
+  hostname -s > >(tee -a ${HPCBIND_LOG})
+  echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
+  echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
+  echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
+
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
+  else
+    echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
+  fi
 fi
 
 ################################################################################
 # Run command
 ################################################################################
 
-if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
-  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
-  else
-    eval $@
-  fi
-else
-  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
-    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
-      echo "[BINDINGS]"
-      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
-      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
+# must be the last executed command so that the return value is correct
+if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
+  hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
+elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
+  # clear output files
+  > ${HPCBIND_ERR}
+  > ${HPCBIND_OUT}
+  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     else
-      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
+      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     fi
   else
-    echo "Unable to show bindings, hwloc not available."
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    else
+      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    fi
   fi
 fi
diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind
deleted file mode 100755
index b6fe07a1bd..0000000000
--- a/lib/kokkos/bin/kokkos-bind
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env bash
-
-# check if hwloc commands exist
-declare -i HAS_HWLOC=0
-type hwloc-bind >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-distrib >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-ls >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-calc >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-ps >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-
-#parse args
-declare -a UNKNOWN_ARGS=()
-declare -i DISTRIBUTE=1
-declare -i INDEX=0
-PROC_BIND="all"
-CURRENT_CPUSET=""
-OPENMP_VERSION=4.0
-OPENMP_PROC_BIND=True
-OPENMP_NESTED=True
-VERBOSE=False
-
-#get the current process cpuset
-if [[ ${HAS_HWLOC} -eq 0 ]]; then
-  MY_PID="$BASHPID"
-  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
-  echo "$CURRENT_CPUSET"
-fi
-
-function show_help {
-  local cmd=$(basename "$0")
-  echo "Usage: ${cmd} <options> -- command ..." 
-  echo "  Uses hwloc to divide the node into the given number of groups,"
-  echo "  set the appropriate OMP_NUM_THREADS and execute the command on the"
-  echo "  selected group."
-  echo ""
-  echo "  NOTE: This command assumes it has exclusive use of the node"
-  echo ""
-  echo "Options:"
-  echo "  --proc-bind=<LOC>     Set the initial process mask for the script.  "
-  echo "                        LOC can be any valid location argumnet for"
-  echo "                        hwloc-calc.  Defaults to the entire machine"
-  echo "  --distribute=N        Distribute the current proc-bind into N groups" 
-  echo "  --index=I             Use the i'th group (zero based)" 
-  echo "  --openmp=M.m          Set env variables for the given OpenMP version"
-  echo "                        (default 4.0)"
-  echo "  --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"    
-  echo "  --no-openmp-nested    Set OMP_NESTED to false"
-  echo "  -v|--verbose" 
-  echo "  -h|--help" 
-  echo ""
-  echo "Sample Usage:"
-  echo "  ${cmd} --distribute=4 --index=2 -v -- command ..."
-  echo ""
-}
-
-if [[ "$#" -eq 0 ]]; then
-  show_help 
-  exit 0
-fi
-
-
-for i in $@; do
-  case $i in
-    # number of partitions to create
-    --proc-bind=*)
-      PROC_BIND="${i#*=}"
-      shift
-      ;;
-    --distribute=*)
-      DISTRIBUTE="${i#*=}"
-      shift
-      ;;
-    # which group to use
-    --index=*)
-      INDEX="${i#*=}"
-      shift
-      ;;
-    --openmp=*)
-      OPENMP_VERSION="${i#*=}"
-      shift
-      ;;
-    --no-openmp-proc-bind)
-      OPENMP_PROC_BIND=False
-      shift
-      ;;
-    --no-openmp-nested)
-      OPENMP_NESTED=False
-      shift
-      ;;
-    -v|--verbose)
-      VERBOSE=True
-      shift
-      ;;
-    -h|--help)
-      show_help
-      exit 0
-      ;;
-    # ignore remaining arguments
-    --)
-      shift
-      break
-      ;;
-    # unknown option
-    *)
-      UNKNOWN_ARGS+=("$i")
-      shift
-      ;;
-  esac
-done
-
-if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
-  echo "Uknown options: ${UNKNOWN_ARGS[*]}"
-  exit 1
-fi
-
-if [[ ${DISTRIBUTE} -le 0 ]]; then
-  echo "Invalid input for distribute, changing distribute to 1"
-  DISTRIBUTE=1
-fi
-
-if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
-  echo "Invalid input for index, changing index to 0"
-  INDEX=0
-fi
-
-if [[ ${HAS_HWLOC} -ne 0 ]]; then
-  echo "hwloc not found, no process binding will occur"
-  DISTRIBUTE=1
-  INDEX=0
-fi
-
-if [[ ${HAS_HWLOC} -eq 0 ]]; then
-
-  if [[ "${CURRENT_CPUSET}" == "" ]]; then
-    BINDING=$(hwloc-calc ${PROC_BIND})
-  else 
-    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
-  fi
-
-  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
-  CPUSET=${CPUSETS[${INDEX}]}
-  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
-
-  if [[ "${VERBOSE}" == "True" ]]; then
-    echo "hwloc:         true"
-    echo "  proc_bind:     ${PROC_BIND}"
-    echo "  distribute:    ${DISTRIBUTE}"
-    echo "  index:         ${INDEX}"
-    echo "  parent_cpuset: ${CURRENT_CPUSET}"
-    echo "  cpuset:        ${CPUSET}"
-    echo "omp_num_threads: ${NUM_THREADS}"
-    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
-    echo "omp_nested:      ${OPENMP_NESTED}"
-    echo "OpenMP:          ${OPENMP_VERSION}"
-  fi
-
-  # set OMP env
-  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
-    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
-      export OMP_PLACES="threads"
-      export OMP_PROC_BIND="spread"
-    else
-      export OMP_PROC_BIND="true"
-      unset OMP_PLACES
-    fi
-  else
-    unset OMP_PLACES
-    unset OMP_PROC_BIND
-  fi
-  if [[ "${OPENMP_NESTED}" == "True" ]]; then
-    export OMP_NESTED="true"
-  else
-    export OMP_NESTED="false"
-  fi
-  export OMP_NUM_THREADS="${NUM_THREADS}"
-
-  hwloc-bind ${CPUSET} -- $@
-else
-  NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
-
-  if [[ "${VERBOSE}" == "True" ]]; then
-    echo "hwloc:           false"
-    echo "omp_num_threads: ${NUM_THREADS}"
-    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
-    echo "omp_nested:      ${OPENMP_NESTED}"
-    echo "OpenMP:          ${OPENMP_VERSION}"
-  fi
-    
-  # set OMP env
-  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
-    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
-      export OMP_PLACES="threads"
-      export OMP_PROC_BIND="spread"
-    else
-      export OMP_PROC_BIND="true"
-      unset OMP_PLACES
-    fi
-  else
-    unset OMP_PLACES
-    unset OMP_PROC_BIND
-  fi
-  if [[ "${OPENMP_NESTED}" == "True" ]]; then
-    export OMP_NESTED="true"
-  else
-    export OMP_NESTED="false"
-  fi
-  export OMP_NUM_THREADS="${NUM_THREADS}"
-
-  eval $@
-fi
-
diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper
index 09fa5d500a..76e33f3c66 100755
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
 # Check if we have an optimization argument already
 optimization_applied=0
 
+# Check if we have -std=c++X  or --std=c++X already
+stdcxx_applied=0
+
 #echo "Arguments: $# $@"
 
 while [ $# -gt 0 ]
@@ -130,10 +133,16 @@ do
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
-  #Handle c++11 setting
-  --std=c++11|-std=c++11)
-    shared_args="$shared_args $1"
+  #Handle the c++ standard flag (c++11, c++14, c++1z): forward only the first one seen
+  --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
+    if [ $stdcxx_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
+    else
+       shared_args="$shared_args $1"
+       stdcxx_applied=1
+    fi
     ;;
+
   #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
   -std=c++98|--std=c++98)
     ;;
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index 96b05c02e1..6f9ca897d9 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -9,3 +9,4 @@ tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641
 tag:  2.03.05    date: 05:27:2017    master: 36b92f43    develop: 79073186
 tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
 tag:  2.04.00    date: 08:16:2017    master: 54eb75c0    develop: 32fb8ee1
+tag:  2.04.04    date: 09:11:2017    master: 2b7e9c20    develop: 51e7b25a
diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test
index 92a1b1c068..ffb565fcbb 100644
--- a/lib/kokkos/config/trilinos-integration/checkin-test
+++ b/lib/kokkos/config/trilinos-integration/checkin-test
@@ -1,4 +1,4 @@
 module purge
-module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
+module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu
 
 #Run Trilinos CheckinTest
diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
index 0408472c68..996b6b5610 100644
--- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -125,6 +125,123 @@ namespace Impl {
   };
 }
 
+/// \class GraphRowViewConst
+/// \brief View of a row of a sparse graph.
+/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
+///
+/// This class provides a generic view of a row of a sparse graph.
+/// We intended this class to view a row of a StaticCrsGraph, but
+/// GraphType need not necessarily be StaticCrsGraph.
+///
+/// The row view is suited for computational kernels that traverse the
+/// column indices of a row, such as sparse matrix-vector multiply.
+/// The view is always const: it does not allow graph modification.
+///
+/// Here is an example loop over the entries in the row:
+/// \code
+/// typedef typename GraphType::data_type index_type;
+///
+/// GraphRowViewConst<GraphType> G_i = ...;
+/// const index_type numEntries = G_i.length;
+/// for (index_type k = 0; k < numEntries; ++k) {
+///   const index_type j = G_i.colidx (k);
+///   // ... do something with column index j ...
+/// }
+/// \endcode
+///
+/// GraphType must provide the \c data_type and \c entries_type
+/// typedefs. In addition, it must make sense to use GraphRowViewConst to
+/// view a row of GraphType. In particular, the column indices of a
+/// row must be accessible through the <tt>colidx</tt> array given to
+/// the constructor of this class, with a constant <tt>stride</tt>
+/// between successive entries.
+/// The stride is one for the compressed sparse row storage format (as
+/// is used by StaticCrsGraph), but may be greater than one for other
+/// sparse storage formats (e.g., ELLPACK or jagged diagonal).
+template<class GraphType>
+struct GraphRowViewConst {
+  //! The (const-qualified) type of the column indices in the row.
+  typedef const typename GraphType::data_type ordinal_type;
+
+private:
+  //! Array of (local) column indices in the row.
+  ordinal_type* colidx_;
+  /// \brief Stride between successive entries in the row.
+  ///
+  /// For compressed sparse row (CSR) storage, this is always one.
+  /// This might be greater than one for storage formats like ELLPACK
+  /// or Jagged Diagonal.  Nevertheless, the stride can never be
+  /// greater than the number of rows or columns in the graph.  Thus,
+  /// \c ordinal_type is the correct type.
+  const ordinal_type stride_;
+
+public:
+  /// \brief Constructor from a raw pointer to the row's column indices
+  ///
+  /// \param colidx_in [in] Pointer to the row's first column index;
+  ///   stored as given and never dereferenced by the constructor.
+  /// \param stride [in] (Constant) stride between successive entries
+  ///   of the \c colidx_in array.
+  /// \param count [in] Number of entries in the row.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( ordinal_type* const colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count) :
+    colidx_ (colidx_in), stride_ (stride), length (count)
+  {}
+
+  /// \brief Constructor with offset into \c colidx_in array
+  ///
+  /// \param colidx_in [in] Array holding the row's column indices.
+  /// \param stride [in] (Constant) stride between successive entries
+  ///   of the \c colidx_in array.
+  /// \param count [in] Number of entries in the row.
+  /// \param idx [in] Start offset into \c colidx_in array
+  ///
+  /// \tparam OffsetType The type of \c idx (see above).  Must be an
+  ///   integral type; the unnamed trailing std::enable_if parameter
+  ///   enforces this.  It may differ from ordinal_type: a graph may
+  ///   have dimensions that fit in int but more entries than int holds.
+  template<class OffsetType>
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count,
+                      const OffsetType& idx,
+                      const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
+    colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
+  {}
+
+  /// \brief Number of entries in the row.
+  ///
+  /// This is a public const field rather than a public const method,
+  /// in order to avoid possible overhead of a method call if the
+  /// compiler is unable to inline that method call.
+  ///
+  /// We assume that rows contain no duplicate entries (i.e., entries
+  /// with the same column index).  Thus, a row may have at most as
+  /// many entries as the graph has columns.  This means that the
+  /// correct type of 'length' is ordinal_type.
+  const ordinal_type length;
+
+  /// \brief (Const) reference to the column index of entry i in this
+  ///   row of the sparse graph.
+  ///
+  /// "Entry i" is not necessarily the entry with column index i, nor
+  /// does i necessarily correspond to the (local) row index.
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& colidx (const ordinal_type& i) const {
+    return colidx_[i*stride_];
+  }
+
+  /// \brief An alias for colidx
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& operator()(const ordinal_type& i) const {
+    return colidx(i);
+  }
+};
+
+
 /// \class StaticCrsGraph
 /// \brief Compressed row storage array.
 ///
@@ -218,6 +335,38 @@ public:
       static_cast<size_type> (0);
   }
 
+  /// \brief Return a const view of row i of the graph.
+  ///
+  /// If row i has no entries, return an empty view; i is not range-checked.
+  ///
+  /// The returned object \c view implements the following interface:
+  /// <ul>
+  /// <li> \c view.length is the number of entries in the row </li>
+  /// <li> \c view.colidx(k) returns a const reference to the
+  ///      column index of the k-th entry in the row </li>
+  /// </ul>
+  /// k is not a column index; it just counts from 0 to
+  /// <tt>view.length - 1</tt>.
+  ///
+  /// Users should not rely on the return type of this method.  They
+  /// should instead assign to 'auto'.  That allows compile-time
+  /// polymorphism for different kinds of sparse graph formats (e.g.,
+  /// ELLPACK or Jagged Diagonal) that we may wish to support in the
+  /// future.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
+    const size_type start = row_map(i);
+    // count is guaranteed to fit in data_type, as long as no row
+    // has duplicate entries.
+    const data_type count = static_cast<data_type> (row_map(i+1) - start);
+
+    if (count == 0) {
+      return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
+    } else {
+      return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
+    }
+  }
+
   /**  \brief  Create a row partitioning into a given number of blocks
    *           balancing non-zeros + a fixed cost per row.
    */
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
index 46321378d9..c184c14d07 100644
--- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
     // LL
     if (RP::inner_direction == RP::Left) {
       for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
         if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
           for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
             if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
               m_func(offset_0 , offset_1);
             }
@@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
     // LR
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
               m_func(offset_0 , offset_1);
             }
@@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
     if (RP::inner_direction == RP::Left) {
       // Loop over size maxnumblocks until full range covered
       for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
         if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
           for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
             if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
               m_func(Tag(), offset_0 , offset_1);
             }
@@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
     }
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
               m_func(Tag(), offset_0 , offset_1);
             }
@@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
     // LL
     if (RP::inner_direction == RP::Left) {
       for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
         if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
                 if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
                   m_func(offset_0 , offset_1 , offset_2);
                 }
@@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
     // LR
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
                   m_func(offset_0 , offset_1 , offset_2);
                 }
@@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
   {
     if (RP::inner_direction == RP::Left) {
       for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
         if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
                 if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
                   m_func(Tag(), offset_0 , offset_1 , offset_2);
                 }
@@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
     }
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
                   m_func(Tag(), offset_0 , offset_1 , offset_2);
                 }
@@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
       const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
 
       for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
         if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
 
           for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
             if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
               for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                 if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                   for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                     if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                       m_func(offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
       const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
                   for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                       m_func(offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
       const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
 
       for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
         if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
 
           for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
             if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
               for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                 if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                   for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                     if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                       m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
       const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
                   for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                       m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
       const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
 
       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
 
           for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
             if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                     if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                       for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                           m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                         }
@@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
       const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                           m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                         }
@@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
       const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
 
       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
 
           for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
             if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                     if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                       for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                           m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                         }
@@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
       const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                           m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                         }
@@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
       const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
 
       for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
         if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
 
           for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
             if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
               for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                 if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                   for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                     if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                       for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                         if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                           for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                             if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                               m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                             }
@@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
       const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
                           for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                             if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                               m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                             }
@@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
       const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
 
       for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
         if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
 
           for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
             if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
               for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                 if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                   for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                     if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                       for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                         if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                           for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                             if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                               m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                             }
@@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
       const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
                           for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                             if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                               m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                             }
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index cae8ecd489..079d9f0889 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
 
 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM) // NOTE(review): launch bounds disabled by commenting them out; maxTperB/minBperSM are not used in the lines visible here - record the reason or drop this line
 static void cuda_parallel_launch_constant_memory()
 {
   const DriverType & driver =
@@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
 
 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM) // NOTE(review): launch bounds disabled by commenting them out; maxTperB/minBperSM are not used in the lines visible here - record the reason or drop this line
 static void cuda_parallel_launch_local_memory( const DriverType driver )
 {
   driver();
diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp
index 26b47a8b74..f8355f0d06 100644
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@@ -242,45 +242,89 @@ public:
     re_ = v;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const complex<RealType>& src) {
+  complex<RealType>&
+  operator += (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
-    re_ += src.re_;
-    im_ += src.im_;
+    re_ += src.real(); // accessor, not src.re_: src may be a different complex<> instantiation
+    im_ += src.imag();
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile complex<RealType>& src) volatile {
+  void
+  operator += (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType"); // NOTE(review): the body reads src.re_/src.im_ of complex<InputRealType>; confirm those fields are accessible when InputRealType != RealType
     re_ += src.re_;
     im_ += src.im_;
   }
 
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const RealType& src) {
+  complex<RealType>&
+  operator += (const std::complex<RealType>& src) {
+    re_ += src.real();
+    im_ += src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ += src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile RealType& src) volatile {
+  void
+  operator += (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ += src;
   }
-
+  
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator -= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
-    re_ -= src.re_;
-    im_ -= src.im_;
+    re_ -= src.real(); // accessor, not src.re_: src may be a different complex<> instantiation
+    im_ -= src.imag();
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const RealType& src) {
+  complex<RealType>&
+  operator -= (const std::complex<RealType>& src) {
+    re_ -= src.real();
+    im_ -= src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ -= src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator *= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
-    const RealType realPart = re_ * src.re_ - im_ * src.im_;
-    const RealType imagPart = re_ * src.im_ + im_ * src.re_;
+    const RealType realPart = re_ * src.real() - im_ * src.imag(); // accessors: src may be a different complex<> instantiation
+    const RealType imagPart = re_ * src.imag() + im_ * src.real(); // both parts computed before re_ is overwritten
     re_ = realPart;
@@ -288,8 +332,12 @@ public:
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile complex<RealType>& src) volatile {
+  void
+  operator *= (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType"); // NOTE(review): the body reads src.re_/src.im_ of complex<InputRealType>; confirm those fields are accessible when InputRealType != RealType
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
     re_ = realPart;
@@ -297,20 +345,70 @@ public:
   }
 
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const RealType& src) {
+  complex<RealType>&
+  operator *= (const std::complex<RealType>& src) {
+    const RealType realPart = re_ * src.real() - im_ * src.imag();
+    const RealType imagPart = re_ * src.imag() + im_ * src.real();
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ *= src;
     im_ *= src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile RealType& src) volatile {
+  void
+  operator *= (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ *= src;
     im_ *= src;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const complex<RealType>& y) {
+  complex<RealType>&
+  operator /= (const complex<InputRealType>& y) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+    // If the real part is +/-Inf and the imaginary part is -/+Inf,
+    // this won't change the result.
+    const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
+
+    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+    // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+    // because y/s is NaN.
+    if (s == 0.0) {
+      this->re_ /= s;
+      this->im_ /= s;
+    }
+    else {
+      const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
+      const complex<RealType> y_conj_scaled (y.real () / s, -(y.imag ()) / s); // accessors, as for s above: y may be a different complex<> instantiation
+      const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
+        y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
+      *this = x_scaled * y_conj_scaled;
+      *this /= y_scaled_abs;
+    }
+    return *this;
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const std::complex<RealType>& y) {
+
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
@@ -334,57 +432,95 @@ public:
     return *this;
   }
 
+
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const RealType& src) {
+  complex<RealType>&
+  operator /= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
     re_ /= src;
     im_ /= src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const complex<RealType>& src) {
-    return (re_ == src.re_) && (im_ == src.im_);
+  bool
+  operator == (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType"); // src is compared after conversion to RealType
+
+    return (re_ == static_cast<RealType>(src.real())) && (im_ == static_cast<RealType>(src.imag()));
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const RealType src) {
-    return (re_ == src) && (im_ == RealType(0));
+  bool
+  operator == (const std::complex<RealType>& src) {
+    return (re_ == src.real()) && (im_ == src.imag());
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType"); // src is compared after conversion to RealType
+
+    return (re_ != static_cast<RealType>(src.real())) || (im_ != static_cast<RealType>(src.imag()));
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const complex<RealType>& src) {
-    return (re_ != src.re_) || (im_ != src.im_);
+  bool
+  operator != (const std::complex<RealType>& src) {
+    return (re_ != src.real()) || (im_ != src.imag());
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const RealType src) {
-    return (re_ != src) || (im_ != RealType(0));
-  }
+  bool
+  operator != (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
 
+    return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
+  }
+  
 };
 
 //! Binary + operator for complex complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
 }
 
 //! Binary + operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () + y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
 }
 
 //! Binary + operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x + y.real (), y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
 }
 
 //! Unary + operator for complex.
@@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
 }
 
 //! Binary - operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
 }
 
 //! Binary - operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () - y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
 }
 
 //! Binary - operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x - y.real (), - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
 }
 
 //! Unary - operator for complex.
@@ -428,12 +564,12 @@ operator - (const complex<RealType>& x) {
 }
 
 //! Binary * operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }
 
 /// \brief Binary * operator for std::complex and complex.
@@ -446,33 +582,34 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
 /// This function cannot be called in a CUDA device function, because
 /// std::complex's methods and nonmember functions are not marked as
 /// CUDA device functions.
-template<class RealType>
-complex<RealType>
-operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+template<class RealType1, class RealType2>
+inline
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }
 
 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }
 
 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& y, const RealType& x) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& y, const RealType2& x) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }
 
 //! Imaginary part of a complex number.
@@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const complex<RealType1>& x, const RealType2& y) {
-  return complex<RealType1> (real (x) / y, imag (x) / y);
+  return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
 }
 
 //! Binary operator / for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator / (const complex<RealType>& x, const complex<RealType>& y) {
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
   // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
   // If the real part is +/-Inf and the imaginary part is -/+Inf,
   // this won't change the result.
-  const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type; // every intermediate below uses this promoted type
+  const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));
 
   // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
   // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
   // because y/s is NaN.
   if (s == 0.0) {
-    return complex<RealType> (real (x) / s, imag (x) / s);
+    return complex<common_real_type> (real (x) / s, imag (x) / s);
   }
   else {
-    const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
-    const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
-    const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
+    const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
+    const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
+    const common_real_type y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
       imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
-    complex<RealType> result = x_scaled * y_conj_scaled;
+    complex<common_real_type> result = x_scaled * y_conj_scaled;
     result /= y_scaled_abs;
     return result;
   }
@@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const RealType1& x, const complex<RealType2>& y) {
-  return complex<RealType1> (x)/y;
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
 }
 
 //! Equality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) == real (y) && imag (x) == imag (y);
+bool
+operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
 }
 
 /// \brief Equality operator for std::complex and Kokkos::complex.
@@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
 /// Otherwise, CUDA builds will give compiler warnings ("warning:
 /// calling a constexpr __host__ function("real") from a __host__
 /// __device__ function("operator==") is not allowed").
-template<class RealType>
-bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) == real (y) && std::imag (x) == imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
 }
-
+  
 //! Equality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
+bool
+operator == (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
 }
 
 //! Equality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const RealType& x, const complex<RealType>& y) {
+bool
+operator == (const RealType1& x, const complex<RealType2>& y) {
   return y == x;
 }
 
 //! Inequality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) != real (y) || imag (x) != imag (y);
+bool
+operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
 }
 
 //! Inequality operator for std::complex and Kokkos::complex.
-template<class RealType>
-KOKKOS_INLINE_FUNCTION
-bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) != real (y) || std::imag (x) != imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
 }
 
 //! Inequality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
+bool
+operator != (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
 }
 
 //! Inequality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const RealType& x, const complex<RealType>& y) {
+bool
+operator != (const RealType1& x, const complex<RealType2>& y) {
   return y != x;
 }
 
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
index f089c16ad2..b9c131cd7a 100644
--- a/lib/kokkos/core/src/Kokkos_Crs.hpp
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -353,7 +353,14 @@ struct CountAndFill {
   struct Fill {};
   KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
     auto j = m_crs.row_map(i);
-    data_type* fill = &(m_crs.entries(j));
+    /* We don't want to access entries(entries.size()), even if it's just to get its
+       address and never use it.
+       This can happen when row (i) is empty and all rows after it are also empty.
+       We could compare to row_map(i + 1), but that is a read from global memory,
+       whereas dimension_0() should be part of the View in registers (or constant memory). */
+    data_type* fill =
+      (j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
+      nullptr : (&(m_crs.entries(j)));
     m_functor(i, fill);
   }
   using self_type = CountAndFill<CrsType, Functor>;
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index 9c9af0dd8b..b811751a2c 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -147,12 +147,11 @@ public:
                  , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return "HBW"; }
 
 private:
 
   AllocationMechanism  m_alloc_mech;
-  static constexpr const char* m_name = "HBW";
   friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
index 339571941d..a825fd54d3 100644
--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -192,7 +192,7 @@ template<>
 struct reduction_identity<float> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  {return static_cast<float>(0.0f);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return FLT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return -FLT_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  {return FLT_MAX;}
 };
 
@@ -200,7 +200,7 @@ template<>
 struct reduction_identity<double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  {return static_cast<double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return DBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return -DBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  {return DBL_MAX;}
 };
 
@@ -208,7 +208,7 @@ template<>
 struct reduction_identity<long double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  {return static_cast<long double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return LDBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return -LDBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  {return LDBL_MAX;}
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp
index b13b0b01de..0118d4667e 100644
--- a/lib/kokkos/core/src/Kokkos_ROCm.hpp
+++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp
@@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
 } // namespace Kokkos
 
 
+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x  (hc_get_group_id(0))
+#define blockIdx_y  (hc_get_group_id(1))
+#define blockIdx_z  (hc_get_group_id(2))
+
+#define blockDim_x  (hc_get_group_size(0))
+#define blockDim_y  (hc_get_group_size(1))
+#define blockDim_z  (hc_get_group_size(2))
+
+#define gridDim_x   (hc_get_num_groups(0))
+#define gridDim_y   (hc_get_num_groups(1))
+#define gridDim_z   (hc_get_num_groups(2))
+
+
 #include <ROCm/Kokkos_ROCm_Parallel.hpp>
 #include <ROCm/Kokkos_ROCm_Task.hpp>
 
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
index 8fb13b8954..a917cf1656 100644
--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@@ -88,6 +88,7 @@ build-makefile-kokkos:
 	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
 	echo "#Variables used in application Makefiles" >> Makefile.kokkos
+	echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
 	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
 	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
 	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index 37d2ac8318..de84f6e59f 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
                                                  , thread_local_bytes
                                                  );
 
+      omp_set_num_threads(partition_size);
       f( omp_get_thread_num(), omp_get_num_threads() );
 
       Impl::t_openmp_instance->~Exec();
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
index 0b7a1e2583..f2674e5929 100644
--- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
@@ -113,7 +113,6 @@ void reduce_enqueue(
 
   if (output_length < 1) return;
 
-  assert(output_result != nullptr);
   const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
 
   // allocate host and device memory for the results from each team
@@ -176,14 +175,17 @@ void reduce_enqueue(
       }
       
   });
-  ValueInit::init(ReducerConditional::select(f, reducer), output_result);
+  if (output_result != nullptr)
+     ValueInit::init(ReducerConditional::select(f, reducer), output_result);
   fut.wait();
 
   copy(result,result_cpu.data());
-  for(std::size_t i=0;i<td.num_tiles;i++)
-    ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
+  if (output_result != nullptr) {
+    for(std::size_t i=0;i<td.num_tiles;i++)
+       ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
 
-  ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+    ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+  }
 
 }
 
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
index acf75f6f13..c2e85ad112 100644
--- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
@@ -67,7 +67,7 @@ void scan_enqueue(
     hc::array<value_type> result(td.num_tiles);
     hc::array<value_type> scratch(len);
 
-    tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
+    tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
     {
         const auto local = t_idx.local[0];
         const auto global = t_idx.global[0];
@@ -135,7 +135,7 @@ void scan_enqueue(
       ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
 
     copy(result_cpu.data(),result);
-    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] 
+    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] 
     {
 //        const auto local = t_idx.local[0];
         const auto global = t_idx.global[0];
diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
index 3d3029535e..c5e73c8b26 100644
--- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
   return full != i ? _bit_scan_forward( ~i ) : -1 ;
 #elif defined( KOKKOS_COMPILER_IBM )
   return full != i ? __cnttz4( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
 #else
@@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
   return _bit_scan_forward(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __cnttz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? _popcnt(~i & (i-1)) : -1;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_ffs(i) - 1;
 #else
-  unsigned t = 1u;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t << 1;
-    ++r;
+  int offset = -1; // stays -1 when i has no bit set
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1u << offset ) ) == 0 ; ++offset ); // unsigned mask: never shifts a signed 1 into the sign bit
   }
-  return r;
+  return offset;
 #endif
 }
 
@@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
   return _bit_scan_reverse(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return shift - __cntlz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? shift - _leadz32(i) : 0 ;
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
 #else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t >> 1;
-    ++r;
+  int offset = 0; // result when i has no bit set
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1u << offset ) ) == 0 ; --offset ); // unsigned mask: never shifts a signed 1 into the sign bit
   }
-  return r;
+  return offset;
 #endif
 }
 
@@ -142,6 +142,8 @@ int bit_count( unsigned i )
   return _popcnt32(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __popcnt4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return _popcnt(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_popcount(i);
 #else
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index e11f8b6d34..cd0553218d 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
   }
 }
 
-constexpr const char* HBWSpace::name() {
-  return m_name;
-}
-
 } // namespace Experimental
 } // namespace Kokkos
 
diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp
index ce5537fed3..c7f681699e 100644
--- a/lib/kokkos/core/unit_test/TestComplex.hpp
+++ b/lib/kokkos/core/unit_test/TestComplex.hpp
@@ -114,7 +114,7 @@ struct TestComplexBasicMath {
   typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
 
   void testit () {
-    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
     h_results = Kokkos::create_mirror_view(d_results);
 
     Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@@ -125,6 +125,7 @@ struct TestComplexBasicMath {
     std::complex<double> b(3.25,5.75);
     std::complex<double> d(1.0,2.0);
     double c = 9.3;
+    int e = 2;
 
     std::complex<double> r;
     r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(),  r.imag());
@@ -147,6 +148,12 @@ struct TestComplexBasicMath {
     r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
     r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
     r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a; 
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(),  r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(),  r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(),  r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(),  r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(),  r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(),  r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(),  r.real()/e); ASSERT_FLOAT_EQ(h_results(23).imag(),  r.imag()/e);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -190,6 +197,12 @@ struct TestComplexBasicMath {
     d_results(17) = c-a;
     d_results(18) = c*a;
     d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = a-e;
+    d_results(22) = a*e;
+    d_results(23) = a/e;
   }
 };
 
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
index f579ddf02c..fbc3a65c2f 100644
--- a/lib/kokkos/core/unit_test/TestMDRange.hpp
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -286,7 +286,9 @@ struct TestMDRange_2D {
     // Test with reducers - scalar
     {
       typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
-      range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
+      int s0 = 1;
+      int s1 = 1;
+      range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );
 
       TestMDRange_2D functor( N0, N1 );
 
@@ -297,7 +299,7 @@ struct TestMDRange_2D {
 
       parallel_reduce( range, functor, reducer_scalar );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
     }
     // Test with reducers - scalar view
     {
@@ -445,7 +447,9 @@ struct TestMDRange_2D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      const int s0 = 1;
+      const int s1 = 1;
+      range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
       TestMDRange_2D functor( N0, N1 );
 
       parallel_for( range, functor );
@@ -454,8 +458,8 @@ struct TestMDRange_2D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
       {
         if ( h_view( i, j ) != 3 ) {
           ++counter;
@@ -463,7 +467,7 @@ struct TestMDRange_2D {
       }
 
       if ( counter != 0 ) {
-        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
@@ -699,6 +703,7 @@ struct TestMDRange_2D {
 
       ASSERT_EQ( counter, 0 );
     }
+
   } // end test_for2
 }; // MDRange_2D
 
@@ -749,7 +754,10 @@ struct TestMDRange_3D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
 
       TestMDRange_3D functor( N0, N1, N2 );
 
@@ -757,7 +765,7 @@ struct TestMDRange_3D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
     }
 
     // Test with reducers - scalar
@@ -952,7 +960,10 @@ struct TestMDRange_3D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
       TestMDRange_3D functor( N0, N1, N2 );
 
       parallel_for( range, functor );
@@ -961,9 +972,9 @@ struct TestMDRange_3D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
       {
         if ( h_view( i, j, k ) != 3 ) {
           ++counter;
@@ -971,7 +982,7 @@ struct TestMDRange_3D {
       }
 
       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
@@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
 
       TestMDRange_4D functor( N0, N1, N2, N3 );
 
@@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
     }
 
     // Test with reducers - scalar
@@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
       TestMDRange_4D functor( N0, N1, N2, N3 );
 
       parallel_for( range, functor );
@@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
       {
         if ( h_view( i, j, k, l ) != 3 ) {
           ++counter;
@@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
       }
 
       if ( counter != 0 ) {
-        printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+        printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
       }
 
       ASSERT_EQ( counter, 0 );
@@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
 
       TestMDRange_5D functor( N0, N1, N2, N3, N4 );
 
@@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
     }
 
     // Test with reducers - scalar
@@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
       TestMDRange_5D functor( N0, N1, N2, N3, N4 );
 
       parallel_for( range, functor );
@@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
       {
         if ( h_view( i, j, k, l, m ) != 3 ) {
           ++counter;
@@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
       }
 
       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
@@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
 
       TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
 
@@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
     }
 
     // Test with reducers - scalar
@@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
       TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
 
       parallel_for( range, functor );
@@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
-      for ( int n = 0; n < N5; ++n )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      for ( int n = s5; n < N5; ++n )
       {
         if ( h_view( i, j, k, l, m, n ) != 3 ) {
           ++counter;
@@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
       }
 
       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
diff --git a/lib/latte/Install.py b/lib/latte/Install.py
index b3e771e4cc..37cb5d6b17 100644
--- a/lib/latte/Install.py
+++ b/lib/latte/Install.py
@@ -159,13 +159,13 @@ if buildflag or pathflag:
     os.remove("includelink")
   if os.path.isfile("liblink") or os.path.islink("liblink"):
     os.remove("liblink")
-  if os.path.isfile("filelink") or os.path.islink("filelink"):
-    os.remove("filelink")
+  if os.path.isfile("filelink.o") or os.path.islink("filelink.o"):
+    os.remove("filelink.o")
   cmd = 'ln -s "%s/src" includelink' % lattedir
   subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
   cmd = 'ln -s "%s" liblink' % lattedir
   subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
-  cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir
+  cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir
   subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
 
 # copy Makefile.lammps.suffix to Makefile.lammps
diff --git a/lib/latte/Makefile.lammps.gfortran b/lib/latte/Makefile.lammps.gfortran
index 921721552b..6aa7782f8a 100644
--- a/lib/latte/Makefile.lammps.gfortran
+++ b/lib/latte/Makefile.lammps.gfortran
@@ -3,5 +3,5 @@
 # GNU Fortran settings
 
 latte_SYSINC = 
-latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas
+latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas
 latte_SYSPATH = -fopenmp
diff --git a/lib/latte/Makefile.lammps.ifort b/lib/latte/Makefile.lammps.ifort
index 23d2b32fcc..0491bdd8a5 100644
--- a/lib/latte/Makefile.lammps.ifort
+++ b/lib/latte/Makefile.lammps.ifort
@@ -3,7 +3,7 @@
 # Intel ifort settings
 
 latte_SYSINC = 
-latte_SYSLIB = ../../lib/latte/filelink \
+latte_SYSLIB = ../../lib/latte/filelink.o \
                -llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \
                -lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \
                -openmp -O0
diff --git a/src/.gitignore b/src/.gitignore
index 8fe732ece6..3036343d2a 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -414,6 +414,8 @@
 /fix_lambdah_calc.h
 /fix_langevin_eff.cpp
 /fix_langevin_eff.h
+/fix_latte.cpp
+/fix_latte.h
 /fix_lb_fluid.cpp
 /fix_lb_fluid.h
 /fix_lb_momentum.cpp
diff --git a/src/Depend.sh b/src/Depend.sh
index 9463607960..e1c812ebc2 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then
   depend KOKKOS
 fi
 
+if (test $1 = "USER-DRUDE") then
+  depend USER-OMP
+fi
+
 if (test $1 = "USER-FEP") then
   depend USER-OMP
 fi
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index b63dc5fb8c..6c610c8c11 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecAtomicKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecAtomicKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecAtomicKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecAtomicKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecAtomicKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecAtomicKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0) {
-    sync(Host,F_MASK);
-    modified(Host,F_MASK);
-  }
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecAtomicKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h
index 5e9a72c2e3..e4d2654e2c 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.h
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.h
@@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   virtual ~AtomVecAtomicKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   void unpack_border(int, int, double *);
@@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::tdual_int_1d k_count;
 };
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index e0f29a27bb..076144420c 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecBondKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecBondKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
-                                        const DAT::tdual_int_2d &list,
-                                        const int & iswap,
-                                        const DAT::tdual_xfloat_2d &buf,
-                                        const int &pbc_flag,
-                                        const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecBondKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecBondKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecBondKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecBondKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                     int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecBondKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h
index 3dcc99fa78..7ec15450ef 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.h
+++ b/src/KOKKOS/atom_vec_bond_kokkos.h
@@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   virtual ~AtomVecBondKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
@@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::t_tagint_1d d_molecule;
   DAT::t_int_2d d_nspecial;
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index 89f7e91c2b..7b8b74b405 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm {
 
 /* ---------------------------------------------------------------------- */
 
-int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecChargeKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecChargeKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-                                        const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecChargeKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecChargeKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecChargeKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h
index f9b385e7ed..e9ff70bbe1 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.h
+++ b/src/KOKKOS/atom_vec_charge_kokkos.h
@@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   virtual ~AtomVecChargeKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
@@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::t_float_1d d_q;
 
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h
index 372404cc7d..cec1b82357 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.h
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::tdual_int_1d k_count;
 };
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index fd7eaf7c81..8e9abe4067 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecFullKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecFullKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()
-				*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
-                                             const DAT::tdual_int_2d &list,
-                                             const int & iswap,
-                                             const DAT::tdual_xfloat_2d &buf,
-                                             const int &pbc_flag,
-                                             const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,1,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,1,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,0,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,0,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecFullKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecFullKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-    _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),
-    _list(list.view<DeviceType>()),_iswap(iswap),
-    _xprd(xprd),_yprd(yprd),_zprd(zprd),
-    _xy(xy),_xz(xz),_yz(yz) {
-    _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-    _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                                           const int & iswap,
-                                           const int nfirst, const int &pbc_flag,
-                                           const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecFullKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecFullKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf,
-                                      int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                          int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecFullKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h
index 760df087e1..33760a8b5f 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.h
+++ b/src/KOKKOS/atom_vec_full_kokkos.h
@@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   virtual ~AtomVecFullKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
@@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::t_float_1d d_q;
   HAT::t_float_1d h_q;
diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp
index 5542991395..03fb2a4ead 100644
--- a/src/KOKKOS/atom_vec_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_kokkos.cpp
@@ -12,6 +12,10 @@
 ------------------------------------------------------------------------- */
 
 #include "atom_vec_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "atom_masks.h"
 
 using namespace LAMMPS_NS;
 
@@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp)
   buffer_size = 0;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecKokkos_PackComm {
+  typedef DeviceType device_type;
+  // functor: operator()(i) writes the coordinates of atom _list(_iswap,i) to row i of _buf
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;  // coordinates, only read here
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;  // destination, filled as rows of 3 values
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;  // send lists, indexed (swap,entry)
+  const int _iswap;  // row of _list used by this functor
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;  // box lengths and tilt factors used for the shift
+  X_FLOAT _pbc[6];  // the six pbc multipliers, copied from the int array passed in
+
+  AtomVecKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;  // total entries of buf divided by 3
+        const size_t elements = 3;  // values written per atom (x,y,z)
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);  // NOTE(review): presumably sets _buf to a maxsend x 3 view of buf - confirm against buffer_view
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];  // all six entries are read even when PBC_FLAG == 0
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);  // atom index taken from the send list
+      if (PBC_FLAG == 0) {  // no shift: plain copy
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {  // shift by multiples of the box lengths only
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {  // tilt factors xy,xz,yz add to the x and y shifts
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_kokkos(const int &n,
+                                    const DAT::tdual_int_2d &list,
+                                    const int & iswap,
+                                    const DAT::tdual_xfloat_2d &buf,
+                                    const int &pbc_flag,
+                                    const int* const pbc)
+{
+  // pack x of the n atoms in row iswap of list into buf, shifted via pbc if pbc_flag is set
+  // commKK->forward_comm_on_host picks host or device; pbc_flag and domain->triclinic
+  // pick the <PBC_FLAG,TRICLINIC> kernel; returns n*size_forward
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK);  // x is read on the host below
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {
+    sync(Device,X_MASK);  // x is read on the device below
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+
+  return n*size_forward;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecKokkos_PackCommSelf {
+  typedef DeviceType device_type;
+  // functor: operator()(i) copies x of atom _list(_iswap,i) into row i+_nfirst of the same array
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;  // source view of x, only read
+  typename ArrayTypes<DeviceType>::t_x_array _xw;  // second view of the same x, used for the writes
+  int _nfirst;  // offset added to i to form the destination row
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;  // send lists, indexed (swap,entry)
+  const int _iswap;  // row of _list used by this functor
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;  // box lengths and tilt factors used for the shift
+  X_FLOAT _pbc[6];  // the six pbc multipliers, copied from the int array passed in
+
+  AtomVecKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];  // all six entries are read even when PBC_FLAG == 0
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);  // source atom index taken from the send list
+      if (PBC_FLAG == 0) {  // no shift: plain copy
+          _xw(i+_nfirst,0) = _x(j,0);
+          _xw(i+_nfirst,1) = _x(j,1);
+          _xw(i+_nfirst,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {  // shift by multiples of the box lengths only
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {  // tilt factors xy,xz,yz add to the x and y shifts
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                  const int nfirst, const int &pbc_flag, const int* const pbc) {
+  if(commKK->forward_comm_on_host) {  // run the self-copy kernel on the host
+    sync(Host,X_MASK);      // x of the n atoms in row iswap of list is read ...
+    modified(Host,X_MASK);  // ... and x rows nfirst..nfirst+n-1 are written in place
+    if(pbc_flag) {  // pbc_flag / domain->triclinic select the <PBC_FLAG,TRICLINIC> kernel
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {  // same dispatch, executed on the device
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+  return n*3;  // three coordinates copied per atom
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnpackComm {
+  typedef DeviceType device_type;
+  // functor: operator()(i) copies row i of _buf into row i+_first of _x
+  typename ArrayTypes<DeviceType>::t_x_array _x;  // coordinates being overwritten
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;  // received data, only read
+  int _first;  // offset added to i to form the destination row
+  // NOTE(review): buf is used with its own row width, not reshaped via buffer_view - confirm layout
+  AtomVecKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first,
+                                       const DAT::tdual_xfloat_2d &buf)
+{
+  // copy n rows of buf into x starting at row first, on the host or the device
+  if (!commKK->forward_comm_on_host) {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    Kokkos::parallel_for(n,AtomVecKokkos_UnpackComm<LMPDeviceType>(atomKK->k_x,buf,first));
+  } else {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    Kokkos::parallel_for(n,AtomVecKokkos_UnpackComm<LMPHostType>(atomKK->k_x,buf,first));
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  // host-side pack: three coordinates per listed atom, in list order;
+  // returns the number of values written to buf
+  int nbuf = 0;
+  if (!pbc_flag) {
+    for (int k = 0; k < n; k++) {
+      const int iatom = list[k];
+      buf[nbuf++] = h_x(iatom,0);
+      buf[nbuf++] = h_x(iatom,1);
+      buf[nbuf++] = h_x(iatom,2);
+    }
+    return nbuf;
+  }
+  // shift from the pbc multipliers: box lengths, plus tilt terms when triclinic
+  double shiftx,shifty,shiftz;
+  if (domain->triclinic) {
+    shiftx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+    shifty = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+    shiftz = pbc[2]*domain->zprd;
+  } else {
+    shiftx = pbc[0]*domain->xprd;
+    shifty = pbc[1]*domain->yprd;
+    shiftz = pbc[2]*domain->zprd;
+  }
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+    buf[nbuf++] = h_x(iatom,0) + shiftx;
+    buf[nbuf++] = h_x(iatom,1) + shifty;
+    buf[nbuf++] = h_x(iatom,2) + shiftz;
+  }
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  // host-side pack of x and v (six values per listed atom); returns the count written
+  m = 0;
+  if (pbc_flag == 0) {  // no shift: copy positions and velocities as they are
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) {  // position shift from box lengths only
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {  // tilt factors add to the x and y shifts
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {  // positions shifted, velocities copied unchanged
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+      }
+    } else {  // velocity shift built from h_rate, applied to atoms in the deform group
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (atom->mask[j] & deform_groupbit) {  // test atom j, the one being packed; i only indexes list
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+      }
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm(int n, int first, double *buf)
+{
+  // buf holds n consecutive (x,y,z) triples for atoms first..first+n-1
+  const double *src = buf;
+
+  for (int k = 0; k < n; k++, src += 3) {
+    const int iatom = first + k;
+    h_x(iatom,0) = src[0];
+    h_x(iatom,1) = src[1];
+    h_x(iatom,2) = src[2];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  // each received atom contributes six values: x,y,z followed by vx,vy,vz
+  const double *src = buf;
+  const int last = first + n;
+
+  for (int iatom = first; iatom < last; iatom++, src += 6) {
+    h_x(iatom,0) = src[0];
+    h_x(iatom,1) = src[1];
+    h_x(iatom,2) = src[2];
+    h_v(iatom,0) = src[3];
+    h_v(iatom,1) = src[4];
+    h_v(iatom,2) = src[5];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_PackReverse {
+  typedef DeviceType device_type;
+  // functor: operator()(i) copies row i+_first of _f into row i of _buf
+  typename ArrayTypes<DeviceType>::t_f_array_randomread _f;  // forces, only read here
+  typename ArrayTypes<DeviceType>::t_ffloat_2d _buf;  // destination buffer
+  int _first;  // offset added to i to form the source row
+  // NOTE(review): buf is used with its own row width, not reshaped via buffer_view - confirm layout
+  AtomVecKokkos_PackReverse(
+      const typename DAT::tdual_f_array &f,
+      const typename DAT::tdual_ffloat_2d &buf,
+      const int& first):_f(f.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    _buf(i,0) = _f(i+_first,0);
+    _buf(i,1) = _f(i+_first,1);
+    _buf(i,2) = _f(i+_first,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first,
+                                       const DAT::tdual_ffloat_2d &buf)
+{
+  // copy f rows first..first+n-1 into buf on the host or the device
+  if (!commKK->reverse_comm_on_host) {
+    sync(Device,F_MASK);
+    Kokkos::parallel_for(n,AtomVecKokkos_PackReverse<LMPDeviceType>(atomKK->k_f,buf,first));
+  } else {
+    sync(Host,F_MASK);
+    Kokkos::parallel_for(n,AtomVecKokkos_PackReverse<LMPHostType>(atomKK->k_f,buf,first));
+  }
+
+  return n*size_reverse;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnPackReverseSelf {
+  typedef DeviceType device_type;
+  // functor: operator()(i) adds row i+_nfirst of f onto row _list(_iswap,i) of the same array
+  typename ArrayTypes<DeviceType>::t_f_array_randomread _f;  // source view of f, only read
+  typename ArrayTypes<DeviceType>::t_f_array _fw;  // second view of the same f, accumulated into
+  int _nfirst;  // offset added to i to form the source row
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;  // send lists, indexed (swap,entry)
+  const int _iswap;  // row of _list used by this functor
+  // NOTE(review): += below is not atomic - presumably each j occurs once per swap; confirm
+  AtomVecKokkos_UnPackReverseSelf(
+      const typename DAT::tdual_f_array &f,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap):
+      _f(f.view<DeviceType>()),_fw(f.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap) {
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);  // destination atom index taken from the send list
+    _fw(j,0) += _f(i+_nfirst,0);
+    _fw(j,1) += _f(i+_nfirst,1);
+    _fw(j,2) += _f(i+_nfirst,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list,
+                                       const int & iswap, const int nfirst)
+{
+  // add f rows nfirst..nfirst+n-1 onto the atoms in row iswap of list, all within f
+  if (!commKK->reverse_comm_on_host) {
+    sync(Device,F_MASK);
+    Kokkos::parallel_for(n,AtomVecKokkos_UnPackReverseSelf<LMPDeviceType>(atomKK->k_f,nfirst,list,iswap));
+    modified(Device,F_MASK);
+  } else {
+    sync(Host,F_MASK);
+    Kokkos::parallel_for(n,AtomVecKokkos_UnPackReverseSelf<LMPHostType>(atomKK->k_f,nfirst,list,iswap));
+    modified(Host,F_MASK);
+  }
+  return n*3;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnPackReverse {
+  typedef DeviceType device_type;
+  // functor: operator()(i) adds row i of _buf onto row _list(_iswap,i) of _f
+  typename ArrayTypes<DeviceType>::t_f_array _f;  // forces being accumulated into
+  typename ArrayTypes<DeviceType>::t_ffloat_2d_const _buf;  // received data, only read
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;  // send lists, indexed (swap,entry)
+  const int _iswap;  // row of _list used by this functor
+
+  AtomVecKokkos_UnPackReverse(
+      const typename DAT::tdual_f_array &f,
+      const typename DAT::tdual_ffloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap):
+      _f(f.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;  // total entries of buf divided by 3
+        const size_t elements = 3;  // values read per atom (fx,fy,fz)
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);  // NOTE(review): presumably sets _buf to a maxsend x 3 view of buf - confirm against buffer_view
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);  // destination atom index taken from the send list
+    _f(j,0) += _buf(i,0);
+    _f(j,1) += _buf(i,1);
+    _f(j,2) += _buf(i,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_reverse_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_ffloat_2d &buf)
+{
+  // add the n force triples in buf onto the atoms in row iswap of list;
+  // commKK->reverse_comm_on_host selects the host or the device kernel
+
+  if (!commKK->reverse_comm_on_host) {
+    typedef AtomVecKokkos_UnPackReverse<LMPDeviceType> Kernel;
+    Kokkos::parallel_for(n,Kernel(atomKK->k_f,buf,list,iswap));
+    modified(Device,F_MASK);
+  } else {
+    typedef AtomVecKokkos_UnPackReverse<LMPHostType> Kernel;
+    Kokkos::parallel_for(n,Kernel(atomKK->k_f,buf,list,iswap));
+    modified(Host,F_MASK);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_reverse(int n, int first, double *buf)
+{
+  // sync(Host,F_MASK) precedes the reads of h_f; skipped when there is nothing to pack
+  if (n > 0) sync(Host,F_MASK);
+
+  double *dst = buf;
+  for (int k = 0; k < n; k++) {
+    const int iatom = first + k;
+    *dst++ = h_f(iatom,0);
+    *dst++ = h_f(iatom,1);
+    *dst++ = h_f(iatom,2);
+  }
+
+  return static_cast<int>(dst - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  // accumulate three received force components onto each listed atom
+  const double *src = buf;
+  for (int k = 0; k < n; k++, src += 3) {
+    const int iatom = list[k];
+    h_f(iatom,0) += src[0];
+    h_f(iatom,1) += src[1];
+    h_f(iatom,2) += src[2];
+  }
+
+  if (n > 0) modified(Host,F_MASK);
+}
diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h
index 7f593f235f..20a07ec443 100644
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec {
  public:
   AtomVecKokkos(class LAMMPS *);
   virtual ~AtomVecKokkos() {}
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
+  virtual int pack_reverse(int, int, double *);
+  virtual void unpack_reverse(int, int *, double *);
 
   virtual void sync(ExecutionSpace space, unsigned int mask) = 0;
   virtual void modified(ExecutionSpace space, unsigned int mask) = 0;
-  virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {};
+  virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0;
 
   virtual int
     pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
                    const int & iswap, const int nfirst,
-                   const int &pbc_flag, const int pbc[]) = 0;
-  //{return 0;}
+                   const int &pbc_flag, const int pbc[]);
+
   virtual int
     pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
                      const int & iswap, const DAT::tdual_xfloat_2d &buf,
-                     const int &pbc_flag, const int pbc[]) = 0;
-  //{return 0;}
+                     const int &pbc_flag, const int pbc[]);
+
   virtual void
     unpack_comm_kokkos(const int &n, const int &nfirst,
-                       const DAT::tdual_xfloat_2d &buf) = 0;
+                       const DAT::tdual_xfloat_2d &buf);
+
+  virtual int
+    unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list,
+                      const int & iswap, const int nfirst);
+
+  virtual int
+    pack_reverse_kokkos(const int &n, const int &nfirst,
+                        const DAT::tdual_ffloat_2d &buf);
+
+  virtual void
+    unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list,
+                          const int & iswap, const DAT::tdual_ffloat_2d &buf);
+
   virtual int
     pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                        DAT::tdual_xfloat_2d buf,int iswap,
                        int pbc_flag, int *pbc, ExecutionSpace space) = 0;
-  //{return 0;};
+
   virtual void
     unpack_border_kokkos(const int &n, const int &nfirst,
                          const DAT::tdual_xfloat_2d &buf,
@@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec {
                          DAT::tdual_int_1d k_sendlist,
                          DAT::tdual_int_1d k_copylist,
                          ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0;
-  //{return 0;};
+
   virtual int
     unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
                            int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
                            ExecutionSpace space) = 0;
-  //{return 0;};
+
 
  protected:
 
+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
   class CommKokkos *commKK;
   size_t buffer_size;
   void* buffer;
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index f5ed0f525f..5534341342 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
   if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
   memory->sfree(sendlist);
   sendlist = NULL;
-  k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d();
+  k_sendlist = DAT::tdual_int_2d();
+  k_total_send = DAT::tdual_int_scalar("comm::k_total_send");
 
   // error check for disallow of OpenMP threads?
 
@@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
   memory->destroy(buf_recv);
   buf_recv = NULL;
 
-  k_exchange_sendlist = ArrayTypes<LMPDeviceType>::
+  k_exchange_sendlist = DAT::
     tdual_int_1d("comm:k_exchange_sendlist",100);
-  k_exchange_copylist = ArrayTypes<LMPDeviceType>::
+  k_exchange_copylist = DAT::
     tdual_int_1d("comm:k_exchange_copylist",100);
-  k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1);
-  k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100);
+  k_count = DAT::tdual_int_scalar("comm:k_count");
+  k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);
 
   memory->destroy(maxsendlist);
   maxsendlist = NULL;
@@ -102,8 +103,10 @@ void CommKokkos::init()
   atomKK = (AtomKokkos *) atom;
   exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
   forward_comm_classic = lmp->kokkos->forward_comm_classic;
+  reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
   exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
   forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
+  reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host;
 
   CommBrick::init();
 
@@ -132,8 +135,11 @@ void CommKokkos::init()
   if (force->newton == 0) check_reverse = 0;
   if (force->pair) check_reverse += force->pair->comm_reverse_off;
 
-  if(check_reverse || check_forward)
+  if (ghost_velocity)
     forward_comm_classic = true;
+
+  if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
+    reverse_comm_classic = true;
 }
 
 /* ----------------------------------------------------------------------
@@ -173,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy)
   int n;
   MPI_Request request;
   AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
-  double **x = atom->x;
   double *buf;
 
   // exchange data with another proc
@@ -181,32 +186,29 @@ void CommKokkos::forward_comm_device(int dummy)
   // if comm_x_only set, exchange or copy directly to x, don't unpack
 
   k_sendlist.sync<DeviceType>();
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
 
   for (int iswap = 0; iswap < nswap; iswap++) {
-
     if (sendproc[iswap] != me) {
       if (comm_x_only) {
-        atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
-        if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
-        else buf = NULL;
-
         if (size_forward_recv[iswap]) {
             buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
               firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
             MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
-                    recvproc[iswap],0,world,&request);
+                      recvproc[iswap],0,world,&request);
         }
         n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
                                    iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
-
         if (n) {
           MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
                    n,MPI_DOUBLE,sendproc[iswap],0,world);
         }
 
-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
-                         space,X_MASK);
+        if (size_forward_recv[iswap]) {
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                           space,X_MASK);
+        }
       } else if (ghost_velocity) {
         error->all(FLERR,"Ghost velocity forward comm not yet "
                    "implemented with Kokkos");
@@ -248,21 +250,93 @@ void CommKokkos::forward_comm_device(int dummy)
     }
   }
 }
+
+/* ----------------------------------------------------------------------
+   reverse communication of forces on atoms every timestep
+   other per-atom attributes may also be sent via pack/unpack routines
+------------------------------------------------------------------------- */
+
 void CommKokkos::reverse_comm()
 {
+  if (!reverse_comm_classic) {  // Kokkos path: templated on the space chosen by reverse_comm_on_host
+    if (reverse_comm_on_host) reverse_comm_device<LMPHostType>();
+    else reverse_comm_device<LMPDeviceType>();
+    return;
+  }
+  // classic path: sync to host, run CommBrick::reverse_comm(), then mark host data modified
   k_sendlist.sync<LMPHostType>();
+
   if (comm_f_only)
     atomKK->sync(Host,F_MASK);
   else
     atomKK->sync(Host,ALL_MASK);
+
   CommBrick::reverse_comm();
+
   if (comm_f_only)
     atomKK->modified(Host,F_MASK);
   else
     atomKK->modified(Host,ALL_MASK);
-  atomKK->sync(Device,ALL_MASK);
+
+  //atomKK->sync(Device,ALL_MASK); // is this needed?
 }
 
+template<class DeviceType>
+void CommKokkos::reverse_comm_device()
+{
+  int n;
+  MPI_Request request;
+  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
+  double *buf;
+  // reverse communication in DeviceType's space; swaps are walked from last to first
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_f_only set, exchange or copy directly from f, don't pack
+
+  k_sendlist.sync<DeviceType>();
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
+
+  for (int iswap = nswap-1; iswap >= 0; iswap--) {
+    if (sendproc[iswap] != me) {
+      if (comm_f_only) {
+        if (size_reverse_recv[iswap])
+          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE,
+                    sendproc[iswap],0,world,&request);
+        if (size_reverse_send[iswap]) {
+          buf = atomKK->k_f.view<DeviceType>().ptr_on_device() +
+            firstrecv[iswap]*atomKK->k_f.view<DeviceType>().dimension_1();
+          // forces are sent straight out of f starting at row firstrecv[iswap], no packing
+          MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
+                   recvproc[iswap],0,world);
+        }
+        if (size_reverse_recv[iswap]) {
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                           space,F_MASK);
+        }
+      } else {  // pack forces into k_buf_send before sending
+        if (size_reverse_recv[iswap])
+          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
+                    size_reverse_recv[iswap],MPI_DOUBLE,
+                    sendproc[iswap],0,world,&request);
+        n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
+        if (n)
+          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
+                   MPI_DOUBLE,recvproc[iswap],0,world);
+        if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+      }
+      avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                  k_buf_recv);  // add what arrived in k_buf_recv onto the listed atoms
+    } else {
+      if (sendnum[iswap])  // partner is this proc: accumulate within f, no MPI
+        avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap,
+                                  firstrecv[iswap]);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 void CommKokkos::forward_comm_fix(Fix *fix, int size)
 {
   k_sendlist.sync<LMPHostType>();
@@ -408,7 +482,7 @@ struct BuildExchangeListFunctor {
   typename AT::t_x_array _x;
 
   int _nlocal,_dim;
-  typename AT::t_int_1d _nsend;
+  typename AT::t_int_scalar _nsend;
   typename AT::t_int_1d _sendlist;
   typename AT::t_int_1d _sendflag;
 
@@ -416,7 +490,7 @@ struct BuildExchangeListFunctor {
   BuildExchangeListFunctor(
       const typename AT::tdual_x_array x,
       const typename AT::tdual_int_1d sendlist,
-      typename AT::tdual_int_1d nsend,
+      typename AT::tdual_int_scalar nsend,
       typename AT::tdual_int_1d sendflag,int nlocal, int dim,
                 X_FLOAT lo, X_FLOAT hi):
                 _x(x.template view<DeviceType>()),
@@ -430,7 +504,7 @@ struct BuildExchangeListFunctor {
   KOKKOS_INLINE_FUNCTION
   void operator() (int i) const {
     if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) {
-      const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1);
+      const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1);
       if(mysend<_sendlist.dimension_0()) {
         _sendlist(mysend) = i;
         _sendflag(i) = 1;
@@ -489,9 +563,9 @@ void CommKokkos::exchange_device()
     if (true) {
       if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal);
       k_sendflag.sync<DeviceType>();
-      k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
-      while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
-        k_count.h_view(0) = 0;
+      k_count.h_view() = k_exchange_sendlist.h_view.dimension_0();
+      while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
+        k_count.h_view() = 0;
         k_count.modify<LMPHostType>();
         k_count.sync<DeviceType>();
 
@@ -504,10 +578,10 @@ void CommKokkos::exchange_device()
         k_count.modify<DeviceType>();
 
         k_count.sync<LMPHostType>();
-        if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
-          k_exchange_sendlist.resize(k_count.h_view(0)*1.1);
-          k_exchange_copylist.resize(k_count.h_view(0)*1.1);
-          k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0();
+        if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
+          k_exchange_sendlist.resize(k_count.h_view()*1.1);
+          k_exchange_copylist.resize(k_count.h_view()*1.1);
+          k_count.h_view()=k_exchange_sendlist.h_view.dimension_0();
         }
       }
       k_exchange_copylist.sync<LMPHostType>();
@@ -515,22 +589,22 @@ void CommKokkos::exchange_device()
       k_sendflag.sync<LMPHostType>();
 
       int sendpos = nlocal-1;
-      nlocal -= k_count.h_view(0);
-      for(int i = 0; i < k_count.h_view(0); i++) {
+      nlocal -= k_count.h_view();
+      for(int i = 0; i < k_count.h_view(); i++) {
         if (k_exchange_sendlist.h_view(i)<nlocal) {
           while (k_sendflag.h_view(sendpos)) sendpos--;
           k_exchange_copylist.h_view(i) = sendpos;
           sendpos--;
         } else
-        k_exchange_copylist.h_view(i) = -1;
+          k_exchange_copylist.h_view(i) = -1;
       }
 
       k_exchange_copylist.modify<LMPHostType>();
       k_exchange_copylist.sync<DeviceType>();
-      nsend = k_count.h_view(0);
+      nsend = k_count.h_view();
       if (nsend > maxsend) grow_send_kokkos(nsend,1);
       nsend =
-        avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send,
+        avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
                                    k_exchange_sendlist,k_exchange_copylist,
                                    ExecutionSpaceFromDevice<DeviceType>::
                                    space,dim,lo,hi);
@@ -640,9 +714,7 @@ void CommKokkos::borders()
   }
 
   atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
   k_sendlist.sync<LMPHostType>();
-  k_sendlist.modify<LMPHostType>();
   CommBrick::borders();
   k_sendlist.modify<LMPHostType>();
   atomKK->modified(Host,ALL_MASK);
@@ -659,11 +731,11 @@ struct BuildBorderListFunctor {
   int iswap,maxsendlist;
   int nfirst,nlast,dim;
   typename AT::t_int_2d sendlist;
-  typename AT::t_int_1d nsend;
+  typename AT::t_int_scalar nsend;
 
   BuildBorderListFunctor(typename AT::tdual_x_array _x,
                          typename AT::tdual_int_2d _sendlist,
-                         typename AT::tdual_int_1d _nsend,int _nfirst,
+                         typename AT::tdual_int_scalar _nsend,int _nfirst,
                          int _nlast, int _dim,
                          X_FLOAT _lo, X_FLOAT _hi, int _iswap,
                          int _maxsendlist):
@@ -684,7 +756,7 @@ struct BuildBorderListFunctor {
     for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) {
       if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++;
     }
-    const int my_store_pos = dev.team_scan(mysend,&nsend(0));
+    const int my_store_pos = dev.team_scan(mysend,&nsend());
 
     if (my_store_pos+mysend < maxsendlist) {
     mysend = my_store_pos;
@@ -713,7 +785,7 @@ void CommKokkos::borders_device() {
   AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
 
   ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  k_sendlist.modify<DeviceType>();
+  k_sendlist.sync<DeviceType>();
   atomKK->sync(exec_space,ALL_MASK);
 
   // do swaps over all 3 dimensions
@@ -763,37 +835,38 @@ void CommKokkos::borders_device() {
       if (sendflag) {
         if (!bordergroup || ineed >= 2) {
           if (style == SINGLE) {
-            typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1);
-            total_send.h_view(0) = 0;
-            if(exec_space == Device) {
-              total_send.template modify<DeviceType>();
-              total_send.template sync<LMPDeviceType>();
-            }
+            k_total_send.h_view() = 0;
+            k_total_send.template modify<LMPHostType>();
+            k_total_send.template sync<LMPDeviceType>();
 
             BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
-                total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
+                k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
             Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
             Kokkos::parallel_for(config,f);
 
-            total_send.template modify<DeviceType>();
-            total_send.template sync<LMPHostType>();
+            k_total_send.template modify<DeviceType>();
+            k_total_send.template sync<LMPHostType>();
+
+            k_sendlist.modify<DeviceType>();
+
+            if(k_total_send.h_view() >= maxsendlist[iswap]) {
+              grow_list(iswap,k_total_send.h_view());
+
+              k_total_send.h_view() = 0;
+              k_total_send.template modify<LMPHostType>();
+              k_total_send.template sync<LMPDeviceType>();
 
-            if(total_send.h_view(0) >= maxsendlist[iswap]) {
-              grow_list(iswap,total_send.h_view(0));
-              k_sendlist.modify<DeviceType>();
-              total_send.h_view(0) = 0;
-              if(exec_space == Device) {
-                total_send.template modify<LMPHostType>();
-                total_send.template sync<LMPDeviceType>();
-              }
               BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
-                  total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
+                  k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
               Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
               Kokkos::parallel_for(config,f);
-              total_send.template modify<DeviceType>();
-              total_send.template sync<LMPHostType>();
+
+              k_total_send.template modify<DeviceType>();
+              k_total_send.template sync<LMPHostType>();
+
+              k_sendlist.modify<DeviceType>();
             }
-            nsend = total_send.h_view(0);
+            nsend = k_total_send.h_view();
           } else {
             error->all(FLERR,"Required border comm not yet "
                        "implemented with Kokkos");
@@ -916,10 +989,11 @@ void CommKokkos::borders_device() {
 
   // reset global->local map
 
-  if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
   atomKK->modified(exec_space,ALL_MASK);
-  atomKK->sync(Host,TAG_MASK);
-  if (map_style) atom->map_set();
+  if (map_style) {
+    atomKK->sync(Host,TAG_MASK);
+    atom->map_set();
+  }
 }
 /* ----------------------------------------------------------------------
    realloc the size of the send buffer as needed with BUFFACTOR and bufextra
@@ -961,7 +1035,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
     buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
   }
   else {
-    k_buf_send = ArrayTypes<LMPDeviceType>::
+    k_buf_send = DAT::
       tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border);
     buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
   }
@@ -975,7 +1049,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space)
 {
   maxrecv = static_cast<int> (BUFFACTOR * n);
   int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2;
-  k_buf_recv = ArrayTypes<LMPDeviceType>::
+  k_buf_recv = DAT::
     tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border);
   buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device();
 }
@@ -988,6 +1062,11 @@ void CommKokkos::grow_list(int iswap, int n)
 {
   int size = static_cast<int> (BUFFACTOR * n);
 
+  if (exchange_comm_classic) { // force realloc on Host
+    k_sendlist.sync<LMPHostType>();
+    k_sendlist.modify<LMPHostType>();
+  }
+
   memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
 
   for(int i=0;i<maxswap;i++) {
@@ -1011,6 +1090,11 @@ void CommKokkos::grow_swap(int n)
   maxswap = n;
   int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN);
 
+  if (exchange_comm_classic) { // force realloc on Host
+    k_sendlist.sync<LMPHostType>();
+    k_sendlist.modify<LMPHostType>();
+  }
+
   memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
 
   memory->grow(maxsendlist,n,"comm:maxsendlist");
diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h
index a8ae973124..f137655cb8 100644
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@@ -25,15 +25,17 @@ class CommKokkos : public CommBrick {
 
   bool exchange_comm_classic;
   bool forward_comm_classic;
+  bool reverse_comm_classic;
   bool exchange_comm_on_host;
   bool forward_comm_on_host;
+  bool reverse_comm_on_host;
 
   CommKokkos(class LAMMPS *);
   ~CommKokkos();
   void init();
 
   void forward_comm(int dummy = 0);    // forward comm of atom coords
-  void reverse_comm();              // reverse comm of atom coords
+  void reverse_comm();                 // reverse comm of forces
   void exchange();                     // move atoms to new procs
   void borders();                      // setup list of atoms to comm
 
@@ -47,15 +49,17 @@ class CommKokkos : public CommBrick {
   void reverse_comm_dump(class Dump *);    // reverse comm from a Dump
 
   template<class DeviceType> void forward_comm_device(int dummy);
+  template<class DeviceType> void reverse_comm_device();
   template<class DeviceType> void forward_comm_pair_device(Pair *pair);
   template<class DeviceType> void exchange_device();
   template<class DeviceType> void borders_device();
 
  protected:
   DAT::tdual_int_2d k_sendlist;
+  DAT::tdual_int_scalar k_total_send;
   DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
   DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
-  DAT::tdual_int_1d k_count;
+  DAT::tdual_int_scalar k_count;
   //double *buf_send;                 // send buffer for all comm
   //double *buf_recv;                 // recv buffer for all comm
 
diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index e54b53ae89..5d2f6a0438 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
 
   nmax = nmax = m_cap = 0;
   allocated_flag = 0;
+  nprev = 4;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -158,15 +159,15 @@ void FixQEqReaxKokkos<DeviceType>::init_hist()
 {
   int i,j;
 
-  k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5);
+  k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev);
   d_s_hist = k_s_hist.template view<DeviceType>();
   h_s_hist = k_s_hist.h_view;
-  k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5);
+  k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev);
   d_t_hist = k_t_hist.template view<DeviceType>();
   h_t_hist = k_t_hist.h_view;
 
   for( i = 0; i < atom->nmax; i++ )
-    for( j = 0; j < 5; j++ )
+    for( j = 0; j < nprev; j++ )
       k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0;
 
   k_s_hist.template modify<LMPHostType>();
@@ -334,11 +335,11 @@ void FixQEqReaxKokkos<DeviceType>::allocate_array()
     d_d = k_d.template view<DeviceType>();
     h_d = k_d.h_view;
 
-    k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5);
+    k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev);
     d_s_hist = k_s_hist.template view<DeviceType>();
     h_s_hist = k_s_hist.h_view;
 
-    k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5);
+    k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev);
     d_t_hist = k_t_hist.template view<DeviceType>();
     h_t_hist = k_t_hist.h_view;
   }
@@ -368,7 +369,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
     d_o[i] = 0.0;
     d_r[i] = 0.0;
     d_d[i] = 0.0;
-    //for( int j = 0; j < 5; j++ )
+    //for( int j = 0; j < nprev; j++ )
       //d_s_hist(i,j) = d_t_hist(i,j) = 0.0;
   }
 
@@ -1087,7 +1088,7 @@ void FixQEqReaxKokkos<DeviceType>::calculate_q_item(int ii) const
   if (mask[i] & groupbit) {
     q(i) = d_s[i] - delta * d_t[i];
 
-    for (int k = 4; k > 0; --k) {
+    for (int k = nprev-1; k > 0; --k) {
       d_s_hist(i,k) = d_s_hist(i,k-1);
       d_t_hist(i,k) = d_t_hist(i,k-1);
     }
@@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos<DeviceType>::memory_usage()
 {
   double bytes;
 
-  bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist
+  bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist
   bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage
   bytes += n_cap*2 * sizeof(int); // matrix...
   bytes += m_cap * sizeof(int);
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 072a802b54..2b02624dce 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   neighflag_qeq_set = 0;
   exchange_comm_classic = 0;
   forward_comm_classic = 0;
+  reverse_comm_classic = 0;
   exchange_comm_on_host = 0;
   forward_comm_on_host = 0;
+  reverse_comm_on_host = 0;
 
 #ifdef KILL_KOKKOS_ON_SIGSEGV
   signal(SIGSEGV, my_signal_handler);
@@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
   neighflag_qeq_set = 0;
   int newtonflag = 0;
   double binsize = 0.0;
-  exchange_comm_classic = forward_comm_classic = 0;
-  exchange_comm_on_host = forward_comm_on_host = 0;
+  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg)
     } else if (strcmp(arg[iarg],"comm") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
-        exchange_comm_classic = forward_comm_classic = 1;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
-        exchange_comm_classic = forward_comm_classic = 0;
-        exchange_comm_on_host = forward_comm_on_host = 1;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
       } else if (strcmp(arg[iarg+1],"device") == 0) {
-        exchange_comm_classic = forward_comm_classic = 0;
-        exchange_comm_on_host = forward_comm_on_host = 0;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"comm/exchange") == 0) {
@@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
         forward_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"comm/reverse") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"host") == 0) {
+        reverse_comm_classic = 0;
+        reverse_comm_on_host = 1;
+      } else if (strcmp(arg[iarg+1],"device") == 0) {
+        reverse_comm_classic = 0;
+        reverse_comm_on_host = 0;
+      } else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
     } else error->all(FLERR,"Illegal package kokkos command");
   }
 
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 8e28b38cbf..7b7848f1f0 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers {
   int neighflag_qeq_set;
   int exchange_comm_classic;
   int forward_comm_classic;
+  int reverse_comm_classic;
   int exchange_comm_on_host;
   int forward_comm_on_host;
+  int reverse_comm_on_host;
   int num_threads,ngpu;
   int numa;
   int auto_sync;
diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp
index c7e815928a..95ea105ad9 100644
--- a/src/KOKKOS/nbin_kokkos.cpp
+++ b/src/KOKKOS/nbin_kokkos.cpp
@@ -75,6 +75,10 @@ void NBinKokkos<DeviceType>::bin_atoms_setup(int nall)
     k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins);
     bincount = k_bincount.view<DeviceType>();
   }
+  if (nall > k_atom2bin.d_view.dimension_0()) {
+    k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall);
+    atom2bin = k_atom2bin.view<DeviceType>();
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -86,6 +90,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
 {
   last_bin = update->ntimestep;
 
+  k_bins.template sync<DeviceType>();
+  k_bincount.template sync<DeviceType>();
+  k_atom2bin.template sync<DeviceType>();
+
   h_resize() = 1;
 
   while(h_resize() > 0) {
@@ -115,6 +123,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
       c_bins = bins;
     }
   }
+
+  k_bins.template modify<DeviceType>();
+  k_bincount.template modify<DeviceType>();
+  k_atom2bin.template modify<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */
@@ -125,6 +137,7 @@ void NBinKokkos<DeviceType>::binatomsItem(const int &i) const
 {
   const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2));
 
+  atom2bin(i) = ibin;
   const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1);
   if(ac < bins.dimension_1()) {
     bins(ibin, ac) = i;
diff --git a/src/KOKKOS/nbin_kokkos.h b/src/KOKKOS/nbin_kokkos.h
index de3cf41d19..bf2ccc5908 100644
--- a/src/KOKKOS/nbin_kokkos.h
+++ b/src/KOKKOS/nbin_kokkos.h
@@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard {
   int atoms_per_bin;
   DAT::tdual_int_1d k_bincount;
   DAT::tdual_int_2d k_bins;
+  DAT::tdual_int_1d k_atom2bin;
 
   typename AT::t_int_1d bincount;
   const typename AT::t_int_1d_const c_bincount;
   typename AT::t_int_2d bins;
   typename AT::t_int_2d_const c_bins;
+  typename AT::t_int_1d atom2bin;
   typename AT::t_int_scalar d_resize;
   typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
   typename AT::t_x_array_randomread x;
diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp
index 9a40808052..f34b149864 100644
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag)
   // build pairwise lists for all perpetual NPair/NeighList
   // grow() with nlocal/nall args so that only realloc if have to
 
-  atomKK->sync(Host,ALL_MASK);
   for (i = 0; i < npair_perpetual; i++) {
     m = plist[i];
+    if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK);
     if (!lists[m]->copy) lists[m]->grow(nlocal,nall);
     neigh_pair[m]->build_setup();
     neigh_pair[m]->build(lists[m]);
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index b568bd5c93..d3cdcb0680 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -73,6 +73,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
   atoms_per_bin = nbKK->atoms_per_bin;
   k_bincount = nbKK->k_bincount;
   k_bins = nbKK->k_bins;
+  k_atom2bin = nbKK->k_atom2bin;
 }
 
 /* ----------------------------------------------------------------------
@@ -88,13 +89,15 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()
 
   int maxstencil = ns->get_maxstencil();
 
-  k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
+  if (maxstencil > k_stencil.dimension_0())
+    k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
   for (int k = 0; k < maxstencil; k++)
     k_stencil.h_view(k) = ns->stencil[k];
     k_stencil.modify<LMPHostType>();
     k_stencil.sync<DeviceType>();
   if (GHOST) {
-    k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
+    if (maxstencil > k_stencilxyz.dimension_0())
+      k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
     for (int k = 0; k < maxstencil; k++) {
       k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
       k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];
@@ -122,6 +125,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
          k_cutneighsq.view<DeviceType>(),
          k_bincount.view<DeviceType>(),
          k_bins.view<DeviceType>(),
+         k_atom2bin.view<DeviceType>(),
          nstencil,
          k_stencil.view<DeviceType>(),
          k_stencilxyz.view<DeviceType>(),
@@ -164,8 +168,9 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   k_ex_mol_group.sync<DeviceType>();
   k_ex_mol_bit.sync<DeviceType>();
   k_ex_mol_intra.sync<DeviceType>();
-  k_bincount.sync<DeviceType>(),
-  k_bins.sync<DeviceType>(),
+  k_bincount.sync<DeviceType>();
+  k_bins.sync<DeviceType>();
+  k_atom2bin.sync<DeviceType>();
   atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
 
   data.special_flag[0] = special_flag[0];
@@ -317,7 +322,7 @@ void NeighborKokkosExecute<DeviceType>::
   const X_FLOAT ztmp = x(i, 2);
   const int itype = type(i);
 
-  const int ibin = coord2bin(xtmp, ytmp, ztmp);
+  const int ibin = c_atom2bin(i);
 
   const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
     = d_stencil;
@@ -431,7 +436,7 @@ void NeighborKokkosExecute<DeviceType>::
   if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
   }
 
   neigh_list.d_ilist(i) = i;
@@ -641,7 +646,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
   if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
   }
   }
 }
@@ -678,7 +683,7 @@ void NeighborKokkosExecute<DeviceType>::
   // no molecular test when i = ghost atom
 
   if (i < nlocal) {
-    const int ibin = coord2bin(xtmp, ytmp, ztmp);
+    const int ibin = c_atom2bin(i);
     for (int k = 0; k < nstencil; k++) {
       const int jbin = ibin + stencil[k];
       for(int m = 0; m < c_bincount(jbin); m++) {
@@ -764,7 +769,7 @@ void NeighborKokkosExecute<DeviceType>::
   if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
   }
   neigh_list.d_ilist(i) = i;
 }
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index 517ea546fa..6c1c0e958b 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -105,6 +105,7 @@ class NPairKokkos : public NPair {
   int atoms_per_bin;
   DAT::tdual_int_1d k_bincount;
   DAT::tdual_int_2d k_bins;
+  DAT::tdual_int_1d k_atom2bin;
 
   // data from NStencil class
 
@@ -148,6 +149,8 @@ class NeighborKokkosExecute
   const typename AT::t_int_1d_const c_bincount;
   typename AT::t_int_2d bins;
   typename AT::t_int_2d_const c_bins;
+  const typename AT::t_int_1d atom2bin;
+  const typename AT::t_int_1d_const c_atom2bin;
 
 
   // data from NStencil class
@@ -190,6 +193,7 @@ class NeighborKokkosExecute
                         const typename AT::t_xfloat_2d_randomread &_cutneighsq,
                         const typename AT::t_int_1d &_bincount,
                         const typename AT::t_int_2d &_bins,
+                        const typename AT::t_int_1d &_atom2bin,
                         const int _nstencil,
                         const typename AT::t_int_1d &_d_stencil,
                         const typename AT::t_int_1d_3 &_d_stencilxyz,
@@ -224,6 +228,7 @@ class NeighborKokkosExecute
                         const int & _xprd_half, const int & _yprd_half, const int & _zprd_half):
     neigh_list(_neigh_list), cutneighsq(_cutneighsq),
     bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
+    atom2bin(_atom2bin),c_atom2bin(_atom2bin),
     nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
     nlocal(_nlocal),
     x(_x),type(_type),mask(_mask),molecule(_molecule),
@@ -281,38 +286,6 @@ class NeighborKokkosExecute
   void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
 #endif
 
-  KOKKOS_INLINE_FUNCTION
-  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
-  {
-    int ix,iy,iz;
-
-    if (x >= bboxhi[0])
-      ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
-    else if (x >= bboxlo[0]) {
-      ix = static_cast<int> ((x-bboxlo[0])*bininvx);
-      ix = MIN(ix,nbinx-1);
-    } else
-      ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
-
-    if (y >= bboxhi[1])
-      iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
-    else if (y >= bboxlo[1]) {
-      iy = static_cast<int> ((y-bboxlo[1])*bininvy);
-      iy = MIN(iy,nbiny-1);
-    } else
-      iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
-
-    if (z >= bboxhi[2])
-      iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
-    else if (z >= bboxlo[2]) {
-      iz = static_cast<int> ((z-bboxlo[2])*bininvz);
-      iz = MIN(iz,nbinz-1);
-    } else
-      iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
-
-    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
-  }
-
   KOKKOS_INLINE_FUNCTION
   int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
   {
diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp
index d95cd8f8ae..d5f83f4537 100644
--- a/src/KOKKOS/pair_reaxc_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxc_kokkos.cpp
@@ -131,6 +131,8 @@ template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::init_style()
 {
   PairReaxC::init_style();
+  if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version
+  fix_reax = NULL;
 
   // irequest = neigh request made by parent class
 
@@ -555,8 +557,8 @@ void PairReaxCKokkos<DeviceType>::Deallocate_Lookup_Tables()
 
   ntypes = atom->ntypes;
 
-  for( i = 0; i < ntypes; ++i ) {
-    for( j = i; j < ntypes; ++j )
+  for( i = 0; i <= ntypes; ++i ) {
+    for( j = i; j <= ntypes; ++j )
       if( LR[i][j].n ) {
         sfree( LR[i][j].y, "LR[i,j].y" );
         sfree( LR[i][j].H, "LR[i,j].H" );
diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index e4a3f857d3..adec5ff1bd 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -294,6 +294,7 @@ void VerletKokkos::run(int n)
   int n_pre_exchange = modify->n_pre_exchange;
   int n_pre_neighbor = modify->n_pre_neighbor;
   int n_pre_force = modify->n_pre_force;
+  int n_pre_reverse = modify->n_pre_reverse;
   int n_post_force = modify->n_post_force;
   int n_end_of_step = modify->n_end_of_step;
 
@@ -304,9 +305,9 @@ void VerletKokkos::run(int n)
 
   f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0());
 
-  static double time = 0.0;
   atomKK->sync(Device,ALL_MASK);
-  Kokkos::Impl::Timer ktimer;
+  //static double time = 0.0;
+  //Kokkos::Impl::Timer ktimer;
 
   timer->init_timeout();
   for (int i = 0; i < n; i++) {
@@ -320,10 +321,10 @@ void VerletKokkos::run(int n)
 
     // initial time integration
 
-    ktimer.reset();
+    //ktimer.reset();
     timer->stamp();
     modify->initial_integrate(vflag);
-    time += ktimer.seconds();
+    //time += ktimer.seconds();
     if (n_post_integrate) modify->post_integrate();
     timer->stamp(Timer::MODIFY);
 
@@ -523,11 +524,18 @@ void VerletKokkos::run(int n)
       atomKK->k_f.modify<LMPDeviceType>();
     }
 
+    if (n_pre_reverse) {
+      modify->pre_reverse(eflag,vflag);
+      timer->stamp(Timer::MODIFY);
+    }
 
     // reverse communication of forces
 
-    if (force->newton) comm->reverse_comm();
-    timer->stamp(Timer::COMM);
+    if (force->newton) {
+      Kokkos::fence();
+      comm->reverse_comm();
+      timer->stamp(Timer::COMM);
+    }
 
     // force modifications, final time integration, diagnostics
 
diff --git a/src/MAKE/MACHINES/Makefile.cori2 b/src/MAKE/MACHINES/Makefile.cori2
index a367d54080..45e1ab1f8a 100755
--- a/src/MAKE/MACHINES/Makefile.cori2
+++ b/src/MAKE/MACHINES/Makefile.cori2
@@ -15,13 +15,14 @@ SHELL = /bin/sh
 
 CC =		CC
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \
+                $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		CC
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_coprocessor
index a717be93ff..75e4d89170 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor
+++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor
@@ -10,7 +10,7 @@ CC =		mpiicpc
 MIC_OPT =       -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
 CCFLAGS =	-g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
                 -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \
-                -qoverride-limits $(MIC_OPT)
+                -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu
old mode 100755
new mode 100644
index b7db064574..41d0f959fe
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu
@@ -8,14 +8,14 @@ SHELL = /bin/sh
 
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           -ltbbmalloc -ltbbmalloc_proxy
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
index 8a45b781f8..ef514f43c6 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@@ -8,8 +8,8 @@ SHELL = /bin/sh
 
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
index 40d517bce4..e4dc74d79b 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@@ -8,14 +8,14 @@ SHELL = /bin/sh
 
 CC =		mpicxx -cxx=icc
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx -cxx=icc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
index fe1be99e58..457a64b223 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@@ -9,14 +9,14 @@ SHELL = /bin/sh
 export OMPI_CXX = icc
 CC =		mpicxx
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           -ltbbmalloc -ltbbmalloc_proxy
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar
diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
deleted file mode 100644
index 406e98b36d..0000000000
--- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
+++ /dev/null
@@ -1,123 +0,0 @@
-# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT
-
-SHELL = /bin/sh
-
-# ---------------------------------------------------------------------
-# compiler/linker settings
-# specify flags and libraries needed for your compiler
-
-CC =		mpiicpc 
-MIC_OPT =       -qoffload-arch=mic-avx512 -fp-model fast=2
-CCFLAGS =	-O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-                -xHost -fno-alias -ansi-alias -restrict \
-                -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT
-SHFLAGS =	-fPIC
-DEPFLAGS =	-M
-
-LINK =		mpiicpc
-LINKFLAGS =	-g -O3 -xHost -qopenmp -qoffload $(MIC_OPT)
-LIB =           -ltbbmalloc
-SIZE =		size
-
-ARCHIVE =	ar
-ARFLAGS =	-rc
-SHLIBFLAGS =	-shared
-
-# ---------------------------------------------------------------------
-# LAMMPS-specific settings, all OPTIONAL
-# specify settings for LAMMPS features you will use
-# if you change any -D setting, do full re-compile after "make clean"
-
-# LAMMPS ifdef settings
-# see possible settings in Section 2.2 (step 4) of manual
-
-LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
-
-# MPI library
-# see discussion in Section 2.2 (step 5) of manual
-# MPI wrapper compiler/linker can provide this info
-# can point to dummy MPI library in src/STUBS as in Makefile.serial
-# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
-# INC = path for mpi.h, MPI compiler settings
-# PATH = path for MPI library
-# LIB = name of MPI library
-
-MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
-MPI_PATH = 
-MPI_LIB =
-
-# FFT library
-# see discussion in Section 2.2 (step 6) of manaul
-# can be left blank to use provided KISS FFT library
-# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
-# PATH = path for FFT library
-# LIB = name of FFT library
-
-FFT_INC =      -DFFT_MKL -DFFT_SINGLE
-FFT_PATH = 
-FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
-
-# JPEG and/or PNG library
-# see discussion in Section 2.2 (step 7) of manual
-# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
-# INC = path(s) for jpeglib.h and/or png.h
-# PATH = path(s) for JPEG library and/or PNG library
-# LIB = name(s) of JPEG library and/or PNG library
-
-JPG_INC =       
-JPG_PATH = 	
-JPG_LIB =	-ljpeg
-
-# ---------------------------------------------------------------------
-# build rules and dependencies
-# do not edit this section
-
-include	Makefile.package.settings
-include	Makefile.package
-
-EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
-EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
-EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
-EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
-EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
-
-# Path to src files
-
-vpath %.cpp ..
-vpath %.h ..
-
-# Link target
-
-$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
-	$(SIZE) $(EXE)
-
-# Library targets
-
-lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
-
-shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
-        $(OBJ) $(EXTRA_LIB) $(LIB)
-
-# Compilation rules
-
-%.o:%.cpp $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-%.d:%.cpp $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
-
-%.o:%.cu $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-# Individual dependencies
-
-depend : fastdep.exe $(SRC)
-	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
-
-fastdep.exe: ../DEPEND/fastdep.c
-	cc -O -o $@ $<
-
-sinclude .depend
diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl
index 881c51f0e4..8e266a4fce 100644
--- a/src/MAKE/OPTIONS/Makefile.knl
+++ b/src/MAKE/OPTIONS/Makefile.knl
@@ -8,13 +8,13 @@ SHELL = /bin/sh
 
 CC =		mpiicpc
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size
 
diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp
index 30ebc779c5..14eeac8d66 100644
--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg)
   time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0;
   bigint clock = 0;
 
+  timer->init();
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 
diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp
index 5a4d885224..347cd3ba67 100644
--- a/src/REPLICA/tad.cpp
+++ b/src/REPLICA/tad.cpp
@@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg)
   nbuild = ndanger = 0;
   time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0;
 
+  timer->init();
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 
diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
index f7163e6791..da553d158a 100644
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@@ -46,7 +46,7 @@ action nbin_intel.h
 action nbin_intel.cpp
 action npair_intel.h
 action npair_intel.cpp
-action intel_simd.h pair_sw_intel.cpp
+action intel_simd.h
 action intel_intrinsics.h pair_tersoff_intel.cpp
 action intel_intrinsics_airebo.h pair_airebo_intel.cpp
 
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index 3b84446057..871d881f39 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -30,28 +30,37 @@ be added or changed in the Makefile depending on the version:
 
 2017 update 2         - No changes needed
 2017 updates 3 or 4   - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
-2018 or newer         - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
+2018 initial release  - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
+2018u1 or newer       - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
 
 -----------------------------------------------------------------------------
 
 When using the suffix command with "intel", intel styles will be used if they
 exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP 
-USER-OMP styles will be used whenever USER-INTEL styles are not available. This
-allow for running most styles in LAMMPS with threading.
+package is installed, USER-OMP styles will be used whenever USER-INTEL styles
+are not available. This allows running most styles in LAMMPS with threading.
 
 -----------------------------------------------------------------------------
 
-The Long-Range Thread mode (LRT) in the Intel package currently uses
-pthreads by default. If pthreads are not supported in the build environment,
-the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for 
-builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
-build with compilers that support threads using the C++11 standard. When using
+The Long-Range Thread mode (LRT) in the Intel package is enabled through the
+-DLMP_INTEL_USELRT define at compile time. All intel optimized makefiles
+include this define. This feature will use pthreads by default.
+Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that
+support threads intrinsically using the C++11 standard. When using
 LRT mode, you might need to disable OpenMP affinity settings (e.g.
 export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
 need to be changed.
 
 -----------------------------------------------------------------------------
 
+Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG
+should be added to the compile flags. This will enable using the MKL Mersenne
+Twister random number generator (RNG) for Dissipative Particle Dynamics 
+(DPD). This RNG can allow significantly faster performance and it also has a 
+significantly longer period than the standard RNG for DPD.
+
+-----------------------------------------------------------------------------
+
 In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag 
 -DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of 
 Intel compilers.
diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README
index 434189dd26..62602d5920 100644
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@@ -9,6 +9,7 @@
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
 # in.intel.airebo -     Polyethelene benchmark with AIREBO
+# in.intel.dpd -        Dissipative Particle Dynamics
 #
 #############################################################################
 
@@ -16,16 +17,17 @@
 # Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
 #  - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
 #
-#                     Xeon E5-2697v4     Xeon Phi 7250
+#                     Xeon E5-2697v4     Xeon Phi 7250    Xeon Gold 6148
 #                    
-# in.intel.lj -            199.5               282.3
-# in.intel.rhodo -          12.4                17.5
-# in.intel.lc -	            19.0                25.7
-# in.intel.eam -            59.4                92.8
-# in.intel.sw -	           132.4               161.9
-# in.intel.tersoff -        83.3               101.1
-# in.intel.water -          53.4                90.3
-# in.intel.airebo -          7.3                11.8
+# in.intel.lj -            199.5               282.3            317.3
+# in.intel.rhodo -          12.4                17.5             24.4
+# in.intel.lc -	            19.0                25.7             26.8
+# in.intel.eam -            59.4                92.8            105.6 
+# in.intel.sw -	           132.4               161.9            213.8
+# in.intel.tersoff -        83.3               101.1            109.6
+# in.intel.water -          53.4                90.3            105.5
+# in.intel.airebo -          7.3                11.8             17.6
+# in.intel.dpd -            74.5               100.4            148.1
 #
 #############################################################################
 
diff --git a/src/USER-INTEL/TEST/in.intel.dpd b/src/USER-INTEL/TEST/in.intel.dpd
new file mode 100644
index 0000000000..e257d91f84
--- /dev/null
+++ b/src/USER-INTEL/TEST/in.intel.dpd
@@ -0,0 +1,48 @@
+# DPD benchmark
+
+variable        N index on      # Newton Setting
+variable	w index 10	# Warmup Timesteps
+variable	t index 4000	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+variable	p index 0	# Use Power Measurement
+
+variable	x index 4
+variable	y index 2
+variable	z index 2
+
+variable	xx equal 20*$x
+variable	yy equal 20*$y
+variable	zz equal 20*$z
+variable	rr equal floor($t*$m)
+
+newton          $N
+if "$n > 0"	then "processors * * * grid numa"
+
+units		lj
+atom_style	atomic
+comm_modify     mode single vel yes
+
+lattice		fcc 3.0
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+create_box	1 box
+create_atoms	1 box
+mass		1 1.0
+
+velocity	all create 1.0 87287 loop geom
+
+pair_style	dpd 1.0 1.0 928948
+pair_coeff	1 1 25.0 4.5
+
+neighbor	0.5 bin
+neigh_modify    delay 0 every 1
+
+fix		1 all nve
+timestep	0.04
+
+thermo			1000
+
+if "$p > 0"		then "run_style verlet/power"
+
+if "$w > 0"		then "run $w"
+run    	 ${rr}
diff --git a/src/USER-INTEL/dihedral_fourier_intel.cpp b/src/USER-INTEL/dihedral_fourier_intel.cpp
new file mode 100644
index 0000000000..805ffc0e25
--- /dev/null
+++ b/src/USER-INTEL/dihedral_fourier_intel.cpp
@@ -0,0 +1,441 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include "dihedral_fourier_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define PTOLERANCE (flt_t)1.05
+#define MTOLERANCE (flt_t)-1.05
+typedef struct { int a,b,c,d,t;  } int5_t;
+
+/* ---------------------------------------------------------------------- */
+
+DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
+  : DihedralFourier(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralFourierIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {
+    DihedralFourier::compute(eflag, vflag);
+    return;
+  }
+  #endif
+
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void DihedralFourierIntel::compute(int eflag, int vflag,
+				   IntelBuffers<flt_t,acc_t> *buffers,
+				   const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+
+  if (evflag) {
+    if (vflag && !eflag) {
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {
+      if (force->newton_bond)
+        eval<1,1,1>(vflag, buffers, fc);
+      else
+        eval<1,1,0>(vflag, buffers, fc);
+    }
+  } else {
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void DihedralFourierIntel::eval(const int vflag,
+				IntelBuffers<flt_t,acc_t> *buffers,
+				const ForceConst<flt_t> &fc)
+
+{
+  const int inum = neighbor->ndihedrallist;
+  if (inum == 0) return;
+
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)           \
+    reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
+
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+
+    const int5_t * _noalias const dihedrallist =
+      (int5_t *) neighbor->dihedrallist[0];
+
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
+      const int i1 = dihedrallist[n].a;
+      const int i2 = dihedrallist[n].b;
+      const int i3 = dihedrallist[n].c;
+      const int i4 = dihedrallist[n].d;
+      const int type = dihedrallist[n].t;
+
+      // 1st bond
+
+      const flt_t vb1x = x[i1].x - x[i2].x;
+      const flt_t vb1y = x[i1].y - x[i2].y;
+      const flt_t vb1z = x[i1].z - x[i2].z;
+
+      // 2nd bond
+
+      const flt_t vb2xm = x[i2].x - x[i3].x;
+      const flt_t vb2ym = x[i2].y - x[i3].y;
+      const flt_t vb2zm = x[i2].z - x[i3].z;
+
+      // 3rd bond
+
+      const flt_t vb3x = x[i4].x - x[i3].x;
+      const flt_t vb3y = x[i4].y - x[i3].y;
+      const flt_t vb3z = x[i4].z - x[i3].z;
+
+      // c,s calculation
+
+      const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
+      const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
+      const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
+      const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
+      const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
+      const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
+
+      const flt_t rasq = ax*ax + ay*ay + az*az;
+      const flt_t rbsq = bx*bx + by*by + bz*bz;
+      const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+      const flt_t rg = sqrt(rgsq);
+
+      flt_t rginv, ra2inv, rb2inv;
+      rginv = ra2inv = rb2inv = (flt_t)0.0;
+      if (rg > 0) rginv = (flt_t)1.0/rg;
+      if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
+      if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
+      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+
+      flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
+      const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+      // error check
+      #ifndef LMP_INTEL_USE_SIMDOFF
+      if (c > PTOLERANCE || c < MTOLERANCE) {
+        int me = comm->me;
+
+        if (screen) {
+          char str[128];
+          sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,tid,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
+      }
+      #endif
+
+      if (c > (flt_t)1.0) c = (flt_t)1.0;
+      if (c < (flt_t)-1.0) c = (flt_t)-1.0;
+
+      flt_t deng;
+      flt_t df = (flt_t)0.0;
+      if (EFLAG) deng = (flt_t)0.0;
+      
+      for (int j = 0; j < nterms[type]; j++) {
+	const flt_t tcos_shift = fc.bp[j][type].cos_shift;
+	const flt_t tsin_shift = fc.bp[j][type].sin_shift;
+	const flt_t tk = fc.bp[j][type].k;
+	const int m = fc.bp[j][type].multiplicity;
+
+	flt_t p = (flt_t)1.0;
+	flt_t ddf1, df1;
+	ddf1 = df1 = (flt_t)0.0;
+
+	for (int i = 0; i < m; i++) {
+	  ddf1 = p*c - df1*s;
+	  df1 = p*s + df1*c;
+	  p = ddf1;
+	}
+
+	p = p*tcos_shift + df1*tsin_shift;
+	df1 = df1*tcos_shift - ddf1*tsin_shift;
+	df1 *= -m;
+	p += (flt_t)1.0;
+	
+	if (m == 0) {
+	  p = (flt_t)1.0 + tcos_shift;
+	  df1 = (flt_t)0.0;
+	}
+
+        if (EFLAG) deng += tk * p;
+        df -= tk * df1;
+      }
+
+      const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+      const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+      const flt_t fga = fg*ra2inv*rginv;
+      const flt_t hgb = hg*rb2inv*rginv;
+      const flt_t gaa = -ra2inv*rg;
+      const flt_t gbb = rb2inv*rg;
+
+      const flt_t dtfx = gaa*ax;
+      const flt_t dtfy = gaa*ay;
+      const flt_t dtfz = gaa*az;
+      const flt_t dtgx = fga*ax - hgb*bx;
+      const flt_t dtgy = fga*ay - hgb*by;
+      const flt_t dtgz = fga*az - hgb*bz;
+      const flt_t dthx = gbb*bx;
+      const flt_t dthy = gbb*by;
+      const flt_t dthz = gbb*bz;
+
+      const flt_t sx2 = df*dtgx;
+      const flt_t sy2 = df*dtgy;
+      const flt_t sz2 = df*dtgz;
+
+      flt_t f1x = df*dtfx;
+      flt_t f1y = df*dtfy;
+      flt_t f1z = df*dtfz;
+
+      const flt_t f2x = sx2 - f1x;
+      const flt_t f2y = sy2 - f1y;
+      const flt_t f2z = sz2 - f1z;
+
+      flt_t f4x = df*dthx;
+      flt_t f4y = df*dthy;
+      flt_t f4z = df*dthz;
+
+      const flt_t f3x = -sx2 - f4x;
+      const flt_t f3y = -sy2 - f4y;
+      const flt_t f3z = -sz2 - f4z;
+
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
+        #else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
+        #endif
+      }
+
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
+        }
+
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
+        }
+      }
+    } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oedihedral += sedihedral;
+    if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
+  } // omp parallel
+
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+
+void DihedralFourierIntel::init_style()
+{
+  DihedralFourier::init_style();
+  // locate the fix with id "package_intel"; every /intel style requires it
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  // offload builds: non-zero offload balance makes compute() use the base class
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+  // pack coefficients for the precision selected on the fix (mixed -> float)
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
+					    IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->ndihedraltypes + 1;
+  fc.set_ntypes(bp1, setflag, nterms, memory);
+
+  for (int i = 1; i < bp1; i++) {
+    if (setflag[i]) {
+      for (int j = 0; j < nterms[i]; j++) {
+        fc.bp[j][i].cos_shift = cos_shift[i][j];
+	fc.bp[j][i].sin_shift = sin_shift[i][j];
+	fc.bp[j][i].k = k[i][j];
+	fc.bp[j][i].multiplicity = multiplicity[i][j];
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Size bp as a (max nterms) x nbondtypes table; nbondtypes == 0 releases it.
+template <class flt_t>
+void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
+    int *setflag, int *nterms, Memory *memory) {
+  // Latch the allocator before any use: _memory is not set before the first
+  // call, and the destructor passes NULL, in which case the old one is kept.
+  if (memory != NULL) _memory = memory;
+
+  if (nbondtypes != _nbondtypes) {
+    if (_nbondtypes > 0) _memory->destroy(bp);
+    if (nbondtypes > 0) {
+      _maxnterms = 1;
+      // Types run 1..nbondtypes-1, the same range pack_force_const() fills.
+      for (int i = 1; i < nbondtypes; i++)
+        if (setflag[i]) _maxnterms = MAX(_maxnterms, nterms[i]);
+      _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
+    }
+  }
+  _nbondtypes = nbondtypes;
+}
diff --git a/src/USER-INTEL/dihedral_fourier_intel.h b/src/USER-INTEL/dihedral_fourier_intel.h
new file mode 100644
index 0000000000..a775e129f4
--- /dev/null
+++ b/src/USER-INTEL/dihedral_fourier_intel.h
@@ -0,0 +1,82 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(fourier/intel,DihedralFourierIntel)
+
+#else
+
+#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
+#define LMP_DIHEDRAL_FOURIER_INTEL_H
+
+#include "dihedral_fourier.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class DihedralFourierIntel : public DihedralFourier {
+  // DihedralFourier variant that computes through the FixIntel buffers
+ public:
+  DihedralFourierIntel(class LAMMPS *lmp);
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  FixIntel *fix;
+  // eval<> flags are <EFLAG, VFLAG, NEWTON_BOND>, as in the .cpp definition
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;
+  #endif
+  // coefficient table in precision flt_t, filled by pack_force_const()
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t cos_shift, sin_shift, k;
+      int multiplicity; } fc_packed1;
+    // indexed as bp[term][type]
+    fc_packed1 **bp;
+    // initialize every member: set_ntypes() reads _nbondtypes and _memory
+    ForceConst() : bp(NULL), _nbondtypes(0), _maxnterms(0), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
+
+    void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
+                    Memory *memory);
+
+   private:
+    int _nbondtypes, _maxnterms;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index 637fc0d06e..eac48b8510 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -285,6 +285,7 @@ int FixIntel::setmask()
 {
   int mask = 0;
   mask |= PRE_REVERSE;
+  mask |= MIN_PRE_REVERSE;
   #ifdef _LMP_INTEL_OFFLOAD
   mask |= POST_FORCE;
   mask |= MIN_POST_FORCE;
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index 068e5ed890..d7093e79bb 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -43,6 +43,7 @@ class FixIntel : public Fix {
   virtual int setmask();
   virtual void init();
   virtual void setup(int);
+  inline void min_setup(int in) { setup(in); } // forwards to setup()
   void setup_pre_reverse(int eflag = 0, int vflag = 0);
 
   void pair_init_check(const bool cdmessage=false);
@@ -50,6 +51,8 @@ class FixIntel : public Fix {
   void kspace_init_check();
 
   void pre_reverse(int eflag = 0, int vflag = 0);
+  inline void min_pre_reverse(int eflag = 0, int vflag = 0)
+    { pre_reverse(eflag, vflag); } // forwards both flags to pre_reverse()
 
   // Get all forces, calculation results from coprocesser
   void sync_coprocessor();
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index b4b664cb94..ac208f5a0c 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
   IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
   lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
   #endif
+  memset(_ccachei, 0, vsize * sizeof(int));
   memset(_ccachej, 0, vsize * sizeof(int));
 
   #ifdef _LMP_INTEL_OFFLOAD
@@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
-        nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
+        in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
         in(ccachej:length(vsize) alloc_if(1) free_if(0))
     }
     #ifdef LMP_USE_AVXCD
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index a7663d54a6..d49d0d8b00 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
     ito = inum;                                                 \
   }
 
+#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
+                                 nthr, vecsize)                 \
+  { /* tid is fixed at 0; nthr and vecsize are not used */      \
+    tid = 0;                                                    \
+    ifrom = 0;                                                  \
+    ip = 1;                                                     \
+    ito = inum;                                                 \
+  }
+
 #endif
 
 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
index 12101712f1..e6d45d7b2c 100644
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
 	      const int bstart = binhead[ibin + binstart[k]];
 	      const int bend = binhead[ibin + binend[k]];
               #if defined(LMP_SIMD_COMPILER)
-              #pragma vector aligned
               #pragma simd
               #endif
               for (int jj = bstart; jj < bend; jj++)
@@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
 	      const int bstart = binhead[ibin + stencil[k]];
 	      const int bend = binhead[ibin + stencil[k] + 1];
               #if defined(LMP_SIMD_COMPILER)
-              #pragma vector aligned
               #pragma simd
               #endif
               for (int jj = bstart; jj < bend; jj++)
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index 79dc75366e..0068e02635 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             const int bstart = binhead[ibin + binstart[k]];
             const int bend = binhead[ibin + binend[k]];
             #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = bstart; jj < bend; jj++)
@@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             const int bstart = binhead[ibin];
             const int bend = binhead[ibin + 1];
             #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = bstart; jj < bend; jj++) {
diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp
new file mode 100644
index 0000000000..c7cddfccc1
--- /dev/null
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@@ -0,0 +1,617 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+                        Shun Xu (Computer Network Information Center, CAS)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_dpd_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LMP_MKL_RNG VSL_BRNG_MT19937
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1
+#define IEPSILON 1.0e10
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
+  PairDPD(lmp), fix(NULL), _cop(-1), _onetype(0), _nrandom_thread(0),
+  random_thread(NULL)
+{
+  // _cop is read by pack_force_const() and never assigned elsewhere here.
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairDPDIntel::~PairDPDIntel()
+{
+  // Both loops are empty while _nrandom_thread is zero, so no guard needed.
+  #ifdef _OPENMP
+  #ifdef LMP_USE_MKL_RNG
+  for (int t = 0; t < _nrandom_thread; ++t)
+    vslDeleteStream(&random_thread[t]);
+  #else
+  // Slot 0 aliases the generator stored in `random`; leave it alone.
+  for (int t = 1; t < _nrandom_thread; ++t)
+    delete random_thread[t];
+  #endif
+  #endif
+  delete [] random_thread;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDIntel::compute(int eflag, int vflag)
+{
+  // Pick the kernel instantiation that matches the fix's precision mode.
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+  fix->balance_stamp();
+  vflag_fdotr = 0;
+}
+
+template <class flt_t, class acc_t>
+void PairDPDIntel::compute(int eflag, int vflag,
+                           IntelBuffers<flt_t,acc_t> *buffers,
+                           const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) ev_setup(eflag, vflag);
+  else evflag = vflag_fdotr = 0;
+
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+
+  // Pack atom data on calls where ago is nonzero and the fix reports that
+  // it does not keep separate buffers.
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    // A parallel region is only opened above INTEL_HTHREADS threads.
+    int packthreads = (nthreads > INTEL_HTHREADS) ? nthreads : 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+
+  // ovflag: 2 if vflag_fdotr is set, else 1 if vflag is set, else 0.
+  const int ovflag = vflag_fdotr ? 2 : (vflag ? 1 : 0);
+
+  // Template arguments are <ONETYPE, EFLAG, NEWTON_PAIR>; each case calls
+  // eval for [0, offload_end) with offload=1, then [host_start, inum) with 0.
+  const int variant = (_onetype ? 4 : 0) | (eflag ? 2 : 0) |
+                      (force->newton_pair ? 1 : 0);
+  switch (variant) {
+  case 7:
+    eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 6:
+    eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 5:
+    eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 4:
+    eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 3:
+    eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 2:
+    eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 1:
+    eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  case 0:
+    eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+    eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
+    break;
+  }
+}
+
+template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairDPDIntel::eval(const int offload, const int vflag,
+                        IntelBuffers<flt_t,acc_t> *buffers,
+                        const ForceConst<flt_t> &fc,
+                        const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+  // Nothing to do for an empty [astart, aend) range (early return above).
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  typedef struct { double x, y, z; } lmp_vt;
+  lmp_vt *v = (lmp_vt *)atom->v[0];
+  const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
+  // v views atom->v[0] as x/y/z double triples; dtinvsqrt = 1/sqrt(dt).
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const FC_PACKED1_T * _noalias const param = fc.param[0];
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  int * _noalias const rngi_thread = fc.rngi;
+  const int rng_size = buffers->get_max_nbors();
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+  int *overflow = fix->get_off_overflow_flag();
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+      // This thread's generator, random-number buffer and buffer position.
+      #ifdef LMP_USE_MKL_RNG
+      VSLStreamStatePtr *my_random = &(random_thread[tid]);
+      #else
+      RanMars *my_random = random_thread[tid];
+      #endif
+      flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
+      int rngi = rngi_thread[tid];
+
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+      // ONETYPE: flat entry 3, presumably [1][1] of a 2x2 table - confirm.
+      flt_t icut, a0, gamma, sigma;
+      if (ONETYPE) {
+        icut = param[3].icut;
+        a0 = param[3].a0;
+        gamma = param[3].gamma;
+        sigma = param[3].sigma;
+      }
+      for (int i = iifrom; i < iito; i += iip) {
+        int itype, ptr_off;
+        const FC_PACKED1_T * _noalias parami;
+        if (!ONETYPE) {
+          itype = x[i].w;
+          ptr_off = itype * ntypes;
+          parami = param + ptr_off;
+        }
+
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+	const flt_t vxtmp = v[i].x;
+	const flt_t vytmp = v[i].y;
+	const flt_t vztmp = v[i].z;
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+        // Refill when this atom's jnum values would run past rng_size.
+	if (rngi + jnum > rng_size) {
+          #ifdef LMP_USE_MKL_RNG
+	  if (sizeof(flt_t) == sizeof(float))
+	    vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
+			  (float*)my_rand_buffer, (float)0.0, (float)1.0 );
+	  else
+	    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
+	  		  (double*)my_rand_buffer, 0.0, 1.0 );
+          #else
+          for (int jj = 0; jj < rngi; jj++)
+            my_rand_buffer[jj] = my_random->gaussian();
+          #endif
+	  rngi = 0;
+	}
+
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+	                         sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcelj, evdwl;
+          forcelj = evdwl = (flt_t)0.0;
+
+          int j, jtype, sbindex;
+          if (!ONETYPE) {
+            sbindex = jlist[jj] >> SBBITS & 3;
+            j = jlist[jj] & NEIGHMASK;
+          } else
+            j = jlist[jj];
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          if (!ONETYPE) {
+            jtype = x[j].w;
+            icut = parami[jtype].icut;
+          }
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+	  const flt_t rinv = (flt_t)1.0/sqrt(rsq);
+
+          if (rinv > icut) {
+            flt_t factor_dpd;
+            if (!ONETYPE) factor_dpd = special_lj[sbindex];
+
+	    flt_t delvx = vxtmp - v[j].x;
+	    flt_t delvy = vytmp - v[j].y;
+	    flt_t delvz = vztmp - v[j].z;
+	    flt_t dot = delx*delvx + dely*delvy + delz*delvz;
+	    flt_t randnum = my_rand_buffer[jj];
+            // NOTE(review): indexed by jj, not rngi+jj - reused until refill?
+	    flt_t iwd = rinv - icut;
+	    if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
+
+	    if (!ONETYPE) {
+	      a0 = parami[jtype].a0;
+	      gamma = parami[jtype].gamma;
+	      sigma = parami[jtype].sigma;
+	    }
+	    flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
+	    if (!ONETYPE) fpair *= factor_dpd;
+	    fpair *= iwd;
+
+            const flt_t fpx = fpair * delx;
+            fxtmp += fpx;
+            if (NEWTON_PAIR) f[j].x -= fpx;
+            const flt_t fpy = fpair * dely;
+            fytmp += fpy;
+            if (NEWTON_PAIR) f[j].y -= fpy;
+            const flt_t fpz = fpair * delz;
+            fztmp += fpz;
+            if (NEWTON_PAIR) f[j].z -= fpz;
+
+            if (EFLAG) {
+	      flt_t cut = (flt_t)1.0/icut;
+	      flt_t r = (flt_t)1.0/rinv;
+	      evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
+	      if (!ONETYPE) evdwl *= factor_dpd;
+              sevdwl += evdwl;
+              if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl;
+                if (NEWTON_PAIR)
+                  f[j].w += (flt_t)0.5 * evdwl;
+              }
+            }
+
+            if (NEWTON_PAIR == 0)
+              IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
+          } // if rsq
+        } // for jj
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+
+        IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+	rngi += jnum;
+      } // for ii
+
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+      rngi_thread[tid] = rngi;
+    } // end omp
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+  // NOTE(review): no start_watch() call is visible in this function.
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+   ------------------------------------------------------------------------- */
+
+void PairDPDIntel::settings(int narg, char **arg) {
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_USE_MKL_RNG
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #endif
+  }
+  #endif
+  // The table is allocated below in every build, so free it in every build,
+  // and reset the bookkeeping in case PairDPD::settings() throws.
+  delete [] random_thread;
+  random_thread = NULL;
+  _nrandom_thread = 0;
+  PairDPD::settings(narg,arg);
+  _nrandom_thread = comm->nthreads;
+
+  #ifdef LMP_USE_MKL_RNG
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG,
+                 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+  #else
+  random_thread =new RanMars*[comm->nthreads];
+  random_thread[0] = random;
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    if (tid > 0)
+      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
+  }
+  #endif
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDIntel::init_style()
+{
+  PairDPD::init_style();
+  // Adjust the most recent entry in the neighbor request list.
+  NeighRequest *request = neighbor->requests[neighbor->nrequest-1];
+  if (force->newton_pair == 0) {
+    request->half = 0;
+    request->full = 1;
+  }
+  request->intel = 1;
+
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  fix->pair_init_check();
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_balance() != 0.0)
+    error->all(FLERR,
+          "Offload for dpd/intel is not yet available. Set balance to 0.");
+  #endif
+  if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                    IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int ntypes = atom->ntypes;
+  const int tp1 = ntypes + 1;
+  // Single-type kernels need one atom type and atom->molecular unset.
+  _onetype = (ntypes == 1 && !atom->molecular) ? 1 : 0;
+
+  fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Cutoffs are recomputed here because the regular cutsq setup runs only
+  // after init_style() has been called.
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) {
+      const bool coeff_known =
+        setflag[i][j] != 0 || (setflag[i][i] != 0 && setflag[j][j] != 0);
+      const double cut = init_one(i,j);
+      if (coeff_known) {
+        const double cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+      // The inverse cutoff is stored for both orderings of the pair.
+      fc.param[i][j].icut = fc.param[j][i].icut = 1.0 / cut;
+    }
+  }
+
+  // Slot 0 is pinned to 1.0; slots 1-3 copy force->special_lj.
+  fc.special_lj[0] = 1.0;
+  for (int k = 1; k < 4; k++)
+    fc.special_lj[k] = force->special_lj[k];
+
+  for (int i = 0; i < tp1; i++) {
+    FC_PACKED1_T *row = fc.param[i];
+    for (int j = 0; j < tp1; j++) {
+      row[j].a0 = a0[i][j];
+      row[j].gamma = gamma[i][j];
+      row[j].sigma = sigma[i][j];
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t>
+void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+                                                 const int nthreads,
+                                                 const int max_nbors,
+                                                 Memory *memory,
+                                                 const int cop) {
+  // Always rebuild: the per-thread buffers are sized by nthreads and
+  // max_nbors, which can change between calls while ntypes stays the same.
+  if (_ntypes > 0) {
+    _memory->destroy(param);
+    _memory->destroy(rand_buffer_thread);
+    _memory->destroy(rngi);
+  }
+  if (ntypes > 0) {
+    _cop = cop;
+    memory->create(param,ntypes,ntypes,"fc.param");
+    memory->create(rand_buffer_thread, nthreads, max_nbors,
+                   "fc.rand_buffer_thread");
+    memory->create(rngi,nthreads,"fc.rngi");
+    for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors; // forces refill
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+   ------------------------------------------------------------------------- */
+
+void PairDPDIntel::read_restart_settings(FILE *fp)
+{
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_USE_MKL_RNG
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #endif
+  }
+  #endif
+  // The table is allocated below in every build, so free it in every build,
+  // and reset the bookkeeping in case the base-class call throws.
+  delete [] random_thread;
+  random_thread = NULL;
+  _nrandom_thread = 0;
+  PairDPD::read_restart_settings(fp);
+  _nrandom_thread = comm->nthreads;
+
+  #ifdef LMP_USE_MKL_RNG
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG,
+                 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+  #else
+  random_thread =new RanMars*[comm->nthreads];
+  random_thread[0] = random;
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    if (tid > 0)
+      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
+  }
+  #endif
+  #endif
+}
diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h
new file mode 100644
index 0000000000..416d873c00
--- /dev/null
+++ b/src/USER-INTEL/pair_dpd_intel.h
@@ -0,0 +1,110 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+                        Shun Xu (Computer Network Information Center, CAS)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/intel,PairDPDIntel)
+
+#else
+
+#ifndef LMP_PAIR_DPD_INTEL_H
+#define LMP_PAIR_DPD_INTEL_H
+
+#include "pair_dpd.h"
+#include "fix_intel.h"
+
+#ifdef LMP_USE_MKL_RNG
+#include "mkl_vsl.h"
+#else
+#include "random_mars.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class PairDPDIntel : public PairDPD {
+  // PairDPD variant whose templated compute/eval work on IntelBuffers.
+ public:
+  PairDPDIntel(class LAMMPS *);
+  ~PairDPDIntel();
+  virtual void compute(int, int);
+  void settings(int, char **);
+  void init_style();
+  void read_restart_settings(FILE *);
+
+ private:
+  FixIntel *fix;
+  int _cop, _onetype, _nrandom_thread;
+  // Table with one generator handle per thread (_nrandom_thread entries).
+  #ifdef LMP_USE_MKL_RNG
+  VSLStreamStatePtr *random_thread;
+  #else
+  RanMars **random_thread;
+  #endif
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  // ----------------------------------------------------------------------
+  // Packed coefficients plus per-thread random-number buffers (flt_t).
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;
+    _alignvar(flt_t special_lj[4],64);
+    fc_packed1 **param;
+    flt_t **rand_buffer_thread;
+    int *rngi;
+
+    // Every member gets a value: ~ForceConst() reads _cop unconditionally.
+    ForceConst() : param(NULL), rand_buffer_thread(NULL), rngi(NULL),
+                   _ntypes(0), _cop(-1), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, const int nthreads, const int max_nbors,
+                    Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/
diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp
index 81f4586143..9ff5f85176 100644
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -68,7 +68,7 @@ void VerletLRTIntel::init()
 
   _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
 
-  #ifdef LMP_INTEL_NOLRT
+  #ifndef LMP_INTEL_USELRT
   error->all(FLERR,
              "LRT otion for Intel package disabled at compile time");
   #endif
diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h
index 813cd53605..0d7154ff64 100644
--- a/src/USER-INTEL/verlet_lrt_intel.h
+++ b/src/USER-INTEL/verlet_lrt_intel.h
@@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
 #include "verlet.h"
 #include "pppm_intel.h"
 
-#ifndef LMP_INTEL_USELRT
-#define LMP_INTEL_NOLRT
-#else
-
+#ifdef LMP_INTEL_USELRT
 #ifdef LMP_INTEL_LRT11
 #define _LMP_INTEL_LRT_11
 #include <thread>
diff --git a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
index db8c589afb..a9ee35bbfc 100644
--- a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
+++ b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
@@ -134,7 +134,7 @@ public:
 // Manifold itself:
 manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp,
                                                int narg, char **arg)
-	: manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
+        : manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
 
 
 manifold_gaussian_bump::~manifold_gaussian_bump()
@@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut()
     n( x, nn );
     double taper_z;
     if( xx <= rc1 ){
-	    taper_z = gaussian_bump(xx);
+            taper_z = gaussian_bump(xx);
     }else if( xx < rc2 ){
-	    taper_z = lut_get_z( xx );
+            taper_z = lut_get_z( xx );
     }else{
-	    taper_z = 0.0;
+            taper_z = 0.0;
     }
-    fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
+    fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
              gg, nn[0], nn[1], nn[2] );
   }
   fclose(fp);
diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp
index f3dec42a83..e1e5f579b8 100644
--- a/src/USER-MISC/fix_srp.cpp
+++ b/src/USER-MISC/fix_srp.cpp
@@ -98,7 +98,7 @@ int FixSRP::setmask()
 
 void FixSRP::init()
 {
-  if (force->pair_match("hybrid",1) == NULL)
+  if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL)
     error->all(FLERR,"Cannot use pair srp without pair_style hybrid");
 
   int has_rigid = 0;
diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index 971f69f7cc..a9532d1077 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
 
   if (multiproc)
     error->all(FLERR,"Multi-processor writes are not supported.");
-  if (multifile)
-    error->all(FLERR,"Multiple files are not supported.");
+  if (append_flag && multifile)
+    error->all(FLERR,"Cannot append when writing to multiple files.");
 
   perat = new nc_perat_t[nfield];
 
@@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF()
 
 void DumpNetCDF::openfile()
 {
+  char *filecurrent = filename;
+  if (multifile && !singlefile_opened) {
+    char *filestar = filecurrent;
+    filecurrent = new char[strlen(filestar) + 16];
+    char *ptr = strchr(filestar,'*');
+    *ptr = '\0';
+    if (padflag == 0)
+      sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
+              filestar,update->ntimestep,ptr+1);
+    else {
+      char bif[8],pad[16];
+      strcpy(bif,BIGINT_FORMAT);
+      sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]);
+      sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1);
+    }
+    *ptr = '*';
+  }
+
   if (thermo && !singlefile_opened) {
     if (thermovar)  delete [] thermovar;
     thermovar = new int[output->thermo->nfield];
@@ -268,14 +286,14 @@ void DumpNetCDF::openfile()
   ntotalgr = group->count(igroup);
 
   if (filewriter) {
-    if (append_flag && access(filename, F_OK) != -1) {
+    if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
       // Fixme! Perform checks if dimensions and variables conform with
       // data structure standard.
 
       if (singlefile_opened) return;
       singlefile_opened = 1;
 
-      NCERRX( nc_open(filename, NC_WRITE, &ncid), filename );
+      NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent );
 
       // dimensions
       NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
@@ -348,8 +366,8 @@ void DumpNetCDF::openfile()
       if (singlefile_opened) return;
       singlefile_opened = 1;
 
-      NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid),
-          filename );
+      NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid),
+              filecurrent );
 
       // dimensions
       NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
@@ -598,15 +616,39 @@ void DumpNetCDF::closefile()
   if (filewriter && singlefile_opened) {
     NCERR( nc_close(ncid) );
     singlefile_opened = 0;
-    // append next time DumpNetCDF::openfile is called
-    append_flag = 1;
     // write to next frame upon next open
-    framei++;
+    if (multifile)
+      framei = 1;
+    else {
+      // append next time DumpNetCDF::openfile is called
+      append_flag = 1;
+      framei++;
+    }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
+template <typename T>  // generic case: forwards *tp to the netCDF int writer
+int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp)
+{
+  return nc_put_var1_int(ncid, varid, index, tp);
+}
+
+template <>  // T = long: forward to the netCDF long writer
+int nc_put_var1_bigint<long>(int ncid, int varid, const size_t index[],
+                        const long* tp)
+{
+  return nc_put_var1_long(ncid, varid, index, tp);
+}
+
+template <>  // T = long long: forward to the netCDF longlong writer
+int nc_put_var1_bigint<long long>(int ncid, int varid, const size_t index[],
+                             const long long* tp)
+{
+  return nc_put_var1_longlong(ncid, varid, index, tp);
+}
+
 void DumpNetCDF::write()
 {
   // open file
@@ -638,13 +680,8 @@ void DumpNetCDF::write()
                   th->keyword[i] );
         }
         else if (th->vtype[i] == BIGINT) {
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-          NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
                   th->keyword[i] );
-#else
-          NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
-                  th->keyword[i] );
-#endif
         }
       }
     }
@@ -887,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg)
     return 2;
   }
   else if (strcmp(arg[iarg],"at") == 0) {
+    if (!append_flag)
+      error->all(FLERR,"expected 'append yes' before 'at' keyword");
     iarg++;
     framei = force->inumeric(FLERR,arg[iarg]);
     if (framei < 0)  framei--;
@@ -911,68 +950,6 @@ int DumpNetCDF::modify_param(int narg, char **arg)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNetCDF::write_prmtop()
-{
-  char fn[1024];
-  char tmp[81];
-  FILE *f;
-
-  strcpy(fn, filename);
-  strcat(fn, ".prmtop");
-
-  f = fopen(fn, "w");
-  fprintf(f, "%%VERSION  LAMMPS\n");
-  fprintf(f, "%%FLAG TITLE\n");
-  fprintf(f, "%%FORMAT(20a4)\n");
-  memset(tmp, ' ', 76);
-  tmp[76] = '\0';
-  fprintf(f, "NASN%s\n", tmp);
-
-  fprintf(f, "%%FLAG POINTERS\n");
-  fprintf(f, "%%FORMAT(10I8)\n");
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-  fprintf(f, "%8li", ntotalgr);
-#else
-  fprintf(f, "%8i", ntotalgr);
-#endif
-  for (int i = 0; i < 11; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-  for (int i = 0; i < 12; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-  for (int i = 0; i < 6; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-
-  fprintf(f, "%%FLAG ATOM_NAME\n");
-  fprintf(f, "%%FORMAT(20a4)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%4s", "He");
-    if ((i+1) % 20 == 0)
-      fprintf(f, "\n");
-  }
-
-  fprintf(f, "%%FLAG CHARGE\n");
-  fprintf(f, "%%FORMAT(5E16.5)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%16.5e", 0.0);
-    if ((i+1) % 5 == 0)
-      fprintf(f, "\n");
-  }
-
-  fprintf(f, "%%FLAG MASS\n");
-  fprintf(f, "%%FORMAT(5E16.5)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%16.5e", 1.0);
-    if ((i+1) % 5 == 0)
-        fprintf(f, "\n");
-  }
-  fclose(f);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void DumpNetCDF::ncerr(int err, const char *descr, int line)
 {
   if (err != NC_NOERR) {
diff --git a/src/USER-NETCDF/dump_netcdf.h b/src/USER-NETCDF/dump_netcdf.h
index b86f294d30..25d64efade 100644
--- a/src/USER-NETCDF/dump_netcdf.h
+++ b/src/USER-NETCDF/dump_netcdf.h
@@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom {
   void closefile();
   virtual void write_header(bigint);
   virtual void write_data(int, double *);
-  void write_prmtop();
 
   virtual int modify_param(int, char **);
 
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
index 3b753b1b04..746b904655 100644
--- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -88,8 +88,8 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) :
 
   if (multiproc)
     error->all(FLERR,"Multi-processor writes are not supported.");
-  if (multifile)
-    error->all(FLERR,"Multiple files are not supported.");
+  if (append_flag && multifile)
+    error->all(FLERR,"Cannot append when writing to multiple files.");
 
   perat = new nc_perat_t[nfield];
 
@@ -217,6 +217,24 @@ DumpNetCDFMPIIO::~DumpNetCDFMPIIO()
 
 void DumpNetCDFMPIIO::openfile()
 {
+  char *filecurrent = filename;
+  if (multifile && !singlefile_opened) {
+    // expand '*' to the timestep; sized for 64-bit steps or the pad width
+    const size_t nmax = strlen(filename) + 32 + padflag;
+    filecurrent = new char[nmax];  // NOTE(review): confirm it is freed later
+    char *ptr = strchr(filename,'*');
+    *ptr = '\0';
+    if (padflag == 0)
+      snprintf(filecurrent,nmax,"%s" BIGINT_FORMAT "%s",
+               filename,update->ntimestep,ptr+1);
+    else {
+      char pad[24];
+      snprintf(pad,sizeof(pad),"%%s%%0%d%s%%s",padflag,&BIGINT_FORMAT[1]);
+      snprintf(filecurrent,nmax,pad,filename,update->ntimestep,ptr+1);
+    }
+    *ptr = '*';
+  }
+
   if (thermo && !singlefile_opened) {
     if (thermovar)  delete [] thermovar;
     thermovar = new int[output->thermo->nfield];
@@ -260,7 +278,7 @@ void DumpNetCDFMPIIO::openfile()
   // get total number of atoms
   ntotalgr = group->count(igroup);
 
-  if (append_flag && access(filename, F_OK) != -1) {
+  if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
     // Fixme! Perform checks if dimensions and variables conform with
     // data structure standard.
 
@@ -270,8 +288,8 @@ void DumpNetCDFMPIIO::openfile()
     if (singlefile_opened) return;
     singlefile_opened = 1;
 
-    NCERRX( ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL,
-                       &ncid), filename );
+    NCERRX( ncmpi_open(MPI_COMM_WORLD, filecurrent, NC_WRITE, MPI_INFO_NULL,
+                       &ncid), filecurrent );
 
     // dimensions
     NCERRX( ncmpi_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
@@ -344,8 +362,8 @@ void DumpNetCDFMPIIO::openfile()
     if (singlefile_opened) return;
     singlefile_opened = 1;
 
-    NCERRX( ncmpi_create(MPI_COMM_WORLD, filename, NC_64BIT_DATA,
-                         MPI_INFO_NULL, &ncid), filename );
+    NCERRX( ncmpi_create(MPI_COMM_WORLD, filecurrent, NC_64BIT_DATA,
+                         MPI_INFO_NULL, &ncid), filecurrent );
 
     // dimensions
     NCERRX( ncmpi_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
@@ -574,15 +592,40 @@ void DumpNetCDFMPIIO::closefile()
   if (singlefile_opened) {
     NCERR( ncmpi_close(ncid) );
     singlefile_opened = 0;
-    // append next time DumpNetCDFMPIIO::openfile is called
-    append_flag = 1;
     // write to next frame upon next open
-    framei++;
+    if (multifile)
+      framei = 1;
+    else {
+      // append next time DumpNetCDFMPIIO::openfile is called
+      append_flag = 1;
+      framei++;
+    }
   }
 }
 
 /* ---------------------------------------------------------------------- */
 
+template <typename T>  // generic case: forwards *tp to the PnetCDF int writer
+int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[],
+                     const T* tp)
+{
+  return ncmpi_put_var1_int(ncid, varid, index, tp);
+}
+
+template <>  // T = long: forward to the PnetCDF long writer
+int ncmpi_put_var1_bigint<long>(int ncid, int varid, const MPI_Offset index[],
+                           const long* tp)
+{
+  return ncmpi_put_var1_long(ncid, varid, index, tp);
+}
+
+template <>  // T = long long: forward to the PnetCDF longlong writer
+int ncmpi_put_var1_bigint<long long>(int ncid, int varid, const MPI_Offset index[],
+                                const long long* tp)
+{
+  return ncmpi_put_var1_longlong(ncid, varid, index, tp);
+}
+
 void DumpNetCDFMPIIO::write()
 {
   // open file
@@ -616,13 +659,8 @@ void DumpNetCDFMPIIO::write()
                   th->keyword[i] );
         }
         else if (th->vtype[i] == BIGINT) {
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-          NCERRX( ncmpi_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( ncmpi_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
                   th->keyword[i] );
-#else
-          NCERRX( ncmpi_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
-                  th->keyword[i] );
-#endif
         }
       }
     }
@@ -882,6 +920,8 @@ int DumpNetCDFMPIIO::modify_param(int narg, char **arg)
     return 2;
   }
   else if (strcmp(arg[iarg],"at") == 0) {
+    if (!append_flag)
+      error->all(FLERR,"expected 'append yes' before 'at' keyword");
     iarg++;
     framei = force->inumeric(FLERR,arg[iarg]);
     if (framei < 0)  framei--;
diff --git a/src/USER-OMP/fix_qeq_reax_omp.cpp b/src/USER-OMP/fix_qeq_reax_omp.cpp
index 4457ab6592..d89c9627fe 100644
--- a/src/USER-OMP/fix_qeq_reax_omp.cpp
+++ b/src/USER-OMP/fix_qeq_reax_omp.cpp
@@ -703,7 +703,7 @@ void FixQEqReaxOMP::calculate_Q()
       q[i] = s[i] - u * t[i];
 
       // backup s & t
-      for (int k = 4; k > 0; --k) {
+      for (int k = nprev-1; k > 0; --k) {
         s_hist[i][k] = s_hist[i][k-1];
         t_hist[i][k] = t_hist[i][k-1];
       }
diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp
index 9d165f3fd3..d1c4f90771 100644
--- a/src/USER-REAXC/fix_qeq_reax.cpp
+++ b/src/USER-REAXC/fix_qeq_reax.cpp
@@ -95,7 +95,7 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) :
   pack_flag = 0;
   s = NULL;
   t = NULL;
-  nprev = 5;
+  nprev = 4;
 
   Hdia_inv = NULL;
   b_s = NULL;
@@ -817,7 +817,7 @@ void FixQEqReax::calculate_Q()
       q[i] = s[i] - u * t[i];
 
       /* backup s & t */
-      for (k = 4; k > 0; --k) {
+      for (k = nprev-1; k > 0; --k) {
         s_hist[i][k] = s_hist[i][k-1];
         t_hist[i][k] = t_hist[i][k-1];
       }
diff --git a/src/atom.cpp b/src/atom.cpp
index 1191f0f2b5..7d343a0807 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -453,12 +453,12 @@ void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix)
   // if molecular system:
   // atom IDs must be defined
   // force atom map to be created
-  // map style may be reset by map_init() and its call to map_style_set()
+  // map style will be reset to array vs hash by map_init()
 
   molecular = avec->molecular;
   if (molecular && tag_enable == 0)
     error->all(FLERR,"Atom IDs must be used for molecular systems");
-  if (molecular) map_style = 1;
+  if (molecular) map_style = 3;
 }
 
 /* ----------------------------------------------------------------------
@@ -593,6 +593,7 @@ void Atom::modify_params(int narg, char **arg)
                    "Atom_modify map command after simulation box is defined");
       if (strcmp(arg[iarg+1],"array") == 0) map_user = 1;
       else if (strcmp(arg[iarg+1],"hash") == 0) map_user = 2;
+      else if (strcmp(arg[iarg+1],"yes") == 0) map_user = 3;
       else error->all(FLERR,"Illegal atom_modify command");
       map_style = map_user;
       iarg += 2;
diff --git a/src/atom_map.cpp b/src/atom_map.cpp
index bbfe014dec..9d257d99de 100644
--- a/src/atom_map.cpp
+++ b/src/atom_map.cpp
@@ -298,12 +298,12 @@ int Atom::map_style_set()
   MPI_Allreduce(&max,&map_tag_max,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
   // set map_style for new map
-  // if user-selected, use that setting
+  // if user-selected to array/hash, use that setting
   // else if map_tag_max > 1M, use hash
   // else use array
 
   int map_style_old = map_style;
-  if (map_user) map_style = map_user;
+  if (map_user == 1 || map_user == 2) map_style = map_user;
   else if (map_tag_max > 1000000) map_style = 2;
   else map_style = 1;
 
diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp
index 3c972b8244..06227b7a84 100644
--- a/src/comm_brick.cpp
+++ b/src/comm_brick.cpp
@@ -476,8 +476,7 @@ void CommBrick::forward_comm(int dummy)
     if (sendproc[iswap] != me) {
       if (comm_x_only) {
         if (size_forward_recv[iswap]) {
-          if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
-          else buf = NULL;
+          buf = x[firstrecv[iswap]];
           MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
                     recvproc[iswap],0,world,&request);
         }
@@ -547,8 +546,7 @@ void CommBrick::reverse_comm()
           MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
                     sendproc[iswap],0,world,&request);
         if (size_reverse_send[iswap]) {
-          if (size_reverse_send[iswap]) buf = f[firstrecv[iswap]];
-          else buf = NULL;
+          buf = f[firstrecv[iswap]];
           MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world);
         }
diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp
index 04a2df91f8..992049a81f 100644
--- a/src/create_atoms.cpp
+++ b/src/create_atoms.cpp
@@ -343,6 +343,11 @@ void CreateAtoms::command(int narg, char **arg)
     }
   }
 
+  // Record wall time for atom creation
+
+  MPI_Barrier(world);
+  double time1 = MPI_Wtime();
+
   // clear ghost count and any ghost bonus data internal to AtomVec
   // same logic as beginning of Comm::exchange()
   // do it now b/c creating atoms will overwrite ghost atoms
@@ -509,6 +514,9 @@ void CreateAtoms::command(int narg, char **arg)
     if (domain->triclinic) domain->lamda2x(atom->nlocal);
   }
 
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
   // clean up
 
   delete ranmol;
@@ -521,12 +529,16 @@ void CreateAtoms::command(int narg, char **arg)
   // print status
 
   if (comm->me == 0) {
-    if (screen)
+    if (screen) {
       fprintf(screen,"Created " BIGINT_FORMAT " atoms\n",
               atom->natoms-natoms_previous);
-    if (logfile)
+      fprintf(screen,"  Time spent = %g secs\n",time2-time1);
+    }
+    if (logfile) {
       fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n",
               atom->natoms-natoms_previous);
+      fprintf(logfile,"  Time spent = %g secs\n",time2-time1);
+    }
   }
 
   // for MOLECULE mode:
diff --git a/src/dump.cpp b/src/dump.cpp
index 44098298ba..ddd958c25c 100644
--- a/src/dump.cpp
+++ b/src/dump.cpp
@@ -238,7 +238,7 @@ void Dump::init()
     int gcmcflag = 0;
     for (int i = 0; i < modify->nfix; i++)
       if ((strcmp(modify->fix[i]->style,"gcmc") == 0))
-	gcmcflag = 1;
+        gcmcflag = 1;
 
     if (sortcol == 0 && atom->tag_consecutive() && !gcmcflag) {
       tagint *tag = atom->tag;
@@ -898,7 +898,7 @@ void Dump::modify_params(int narg, char **arg)
     } else if (strcmp(arg[iarg],"fileper") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       if (!multiproc)
-	error->all(FLERR,"Cannot use dump_modify fileper "
+        error->all(FLERR,"Cannot use dump_modify fileper "
                    "without % in dump file name");
       int nper = force->inumeric(FLERR,arg[iarg+1]);
       if (nper <= 0) error->all(FLERR,"Illegal dump_modify command");
@@ -973,7 +973,7 @@ void Dump::modify_params(int narg, char **arg)
     } else if (strcmp(arg[iarg],"nfile") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       if (!multiproc)
-	error->all(FLERR,"Cannot use dump_modify nfile "
+        error->all(FLERR,"Cannot use dump_modify nfile "
                    "without % in dump file name");
       int nfile = force->inumeric(FLERR,arg[iarg+1]);
       if (nfile <= 0) error->all(FLERR,"Illegal dump_modify command");
diff --git a/src/finish.cpp b/src/finish.cpp
index 45e9226388..c22ecaae60 100644
--- a/src/finish.cpp
+++ b/src/finish.cpp
@@ -130,7 +130,7 @@ void Finish::end(int flag)
                           atom->natoms);
       if (logfile) fprintf(logfile,fmt1,time_loop,ntasks,update->nsteps,
                            atom->natoms);
-      
+
       // Gromacs/NAMD-style performance metric for suitable unit settings
 
       if ( timeflag && !minflag && !prdflag && !tadflag &&
@@ -144,7 +144,7 @@ void Finish::end(int flag)
         double one_fs = force->femtosecond;
         double t_step = ((double) time_loop) / ((double) update->nsteps);
         double step_t = 1.0/t_step;
-        
+
         if (strcmp(update->unit_style,"lj") == 0) {
           double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs;
           const char perf[] = "Performance: %.3f tau/day, %.3f timesteps/s\n";
@@ -161,7 +161,7 @@ void Finish::end(int flag)
       }
 
       // CPU use on MPI tasks and OpenMP threads
-      
+
       if (timeflag) {
         if (lmp->kokkos) {
           const char fmt2[] =
diff --git a/src/input.cpp b/src/input.cpp
index 7d11b8741b..23b89d3040 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -18,7 +18,7 @@
 #include <errno.h>
 #include <ctype.h>
 #include <unistd.h>
-#include "sys/stat.h"
+#include <sys/stat.h>
 #include "input.h"
 #include "style_command.h"
 #include "universe.h"
diff --git a/src/main.cpp b/src/main.cpp
index 7401183fea..82dac5af6d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,6 +22,10 @@
 #include <fenv.h>
 #endif
 
+#ifdef FFT_FFTW3
+#include <fftw3.h>
+#endif
+
 using namespace LAMMPS_NS;
 
 /* ----------------------------------------------------------------------
@@ -62,4 +66,10 @@ int main(int argc, char **argv)
 #endif
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
+
+#ifdef FFT_FFTW3
+  // tell fftw3 to delete its global memory pool
+  // and thus avoid bogus valgrind memory leak reports
+  fftw_cleanup();
+#endif
 }
diff --git a/src/modify.cpp b/src/modify.cpp
index 4516788aa9..361079bc16 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -110,7 +110,7 @@ Modify::~Modify()
   // delete all fixes
   // do it via delete_fix() so callbacks in Atom are also updated correctly
 
-  while (nfix) delete_fix(fix[0]->id);
+  while (nfix) delete_fix(0);
   memory->sfree(fix);
   memory->destroy(fmask);
 
@@ -863,9 +863,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix)
       fix[ifix]->restart(state_restart_global[i]);
       used_restart_global[i] = 1;
       if (comm->me == 0) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"Resetting global fix info from restart file:\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"Resetting global fix info from restart file:\n");
         if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
                             fix[ifix]->style,fix[ifix]->id);
@@ -885,9 +885,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix)
         fix[ifix]->unpack_restart(j,index_restart_peratom[i]);
       fix[ifix]->restart_reset = 1;
       if (comm->me == 0) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"Resetting peratom fix info from restart file:\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"Resetting peratom fix info from restart file:\n");
         if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
                             fix[ifix]->style,fix[ifix]->id);
@@ -944,7 +944,13 @@ void Modify::delete_fix(const char *id)
 {
   int ifix = find_fix(id);
   if (ifix < 0) error->all(FLERR,"Could not find fix ID to delete");
-  delete fix[ifix];
+  delete_fix(ifix);
+}
+
+void Modify::delete_fix(int ifix)
+{
+  if(fix[ifix])
+    delete fix[ifix];
   atom->update_callback(ifix);
 
   // move other Fixes and fmask down in list one slot
@@ -1409,24 +1415,24 @@ void Modify::restart_deallocate(int flag)
     if (flag && comm->me == 0) {
       int i;
       for (i = 0; i < nfix_restart_global; i++)
-	if (used_restart_global[i] == 0) break;
+        if (used_restart_global[i] == 0) break;
       if (i == nfix_restart_global) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"All restart file global fix info "
                   "was re-assigned\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"All restart file global fix info "
                   "was re-assigned\n");
       } else {
-	if (screen) fprintf(screen,"Unused restart file global fix info:\n");
-	if (logfile) fprintf(logfile,"Unused restart file global fix info:\n");
-	for (i = 0; i < nfix_restart_global; i++) {
-	  if (used_restart_global[i]) continue;
-	  if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
-			      style_restart_global[i],id_restart_global[i]);
-	  if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
-			       style_restart_global[i],id_restart_global[i]);
-	}
+        if (screen) fprintf(screen,"Unused restart file global fix info:\n");
+        if (logfile) fprintf(logfile,"Unused restart file global fix info:\n");
+        for (i = 0; i < nfix_restart_global; i++) {
+          if (used_restart_global[i]) continue;
+          if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
+                              style_restart_global[i],id_restart_global[i]);
+          if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
+                               style_restart_global[i],id_restart_global[i]);
+        }
       }
     }
 
@@ -1445,24 +1451,24 @@ void Modify::restart_deallocate(int flag)
     if (flag && comm->me == 0) {
       int i;
       for (i = 0; i < nfix_restart_peratom; i++)
-	if (used_restart_peratom[i] == 0) break;
+        if (used_restart_peratom[i] == 0) break;
       if (i == nfix_restart_peratom) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"All restart file peratom fix info "
                   "was re-assigned\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"All restart file peratom fix info "
                   "was re-assigned\n");
       } else {
-	if (screen) fprintf(screen,"Unused restart file peratom fix info:\n");
-	if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n");
-	for (i = 0; i < nfix_restart_peratom; i++) {
-	  if (used_restart_peratom[i]) continue;
-	  if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
-			      style_restart_peratom[i],id_restart_peratom[i]);
-	  if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
-			       style_restart_peratom[i],id_restart_peratom[i]);
-	}
+        if (screen) fprintf(screen,"Unused restart file peratom fix info:\n");
+        if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n");
+        for (i = 0; i < nfix_restart_peratom; i++) {
+          if (used_restart_peratom[i]) continue;
+          if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
+                              style_restart_peratom[i],id_restart_peratom[i]);
+          if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
+                               style_restart_peratom[i],id_restart_peratom[i]);
+        }
       }
     }
 
diff --git a/src/modify.h b/src/modify.h
index d825d5c4ef..4ec61f6d57 100644
--- a/src/modify.h
+++ b/src/modify.h
@@ -95,6 +95,7 @@ class Modify : protected Pointers {
   void add_fix(int, char **, int trysuffix=1);
   void modify_fix(int, char **);
   void delete_fix(const char *);
+  void delete_fix(int);
   int find_fix(const char *);
   int find_fix_by_style(const char *);
   int check_package(const char *);
diff --git a/src/output.cpp b/src/output.cpp
index ce7fcb7cca..ce593ec6ae 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -827,9 +827,9 @@ void Output::create_restart(int narg, char **arg)
    sum and print memory usage
    result is only memory on proc 0, not averaged across procs
 ------------------------------------------------------------------------- */
+
 void Output::memory_usage()
 {
-
   bigint bytes = 0;
   bytes += atom->memory_usage();
   bytes += neighbor->memory_usage();
diff --git a/src/replicate.cpp b/src/replicate.cpp
index e2ed718f65..f3d1964169 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -74,6 +74,11 @@ void Replicate::command(int narg, char **arg)
   if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store)
     error->all(FLERR,"Cannot replicate with fixes that store atom quantities");
 
+  // Record wall time for atom replication
+
+  MPI_Barrier(world);
+  double time1 = MPI_Wtime();
+
   // maxtag = largest atom tag across all existing atoms
 
   tagint maxtag = 0;
@@ -424,4 +429,16 @@ void Replicate::command(int narg, char **arg)
     Special special(lmp);
     special.build();
   }
+
+  // Wall time
+
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
+  if (me == 0) {
+    if (screen)
+      fprintf(screen,"  Time spent = %g secs\n",time2-time1);
+    if (logfile)
+      fprintf(logfile,"  Time spent = %g secs\n",time2-time1);
+  }
 }
diff --git a/tools/phonon/Makefile b/tools/phonon/Makefile
index 0aacb1e086..67f9b91fdf 100644
--- a/tools/phonon/Makefile
+++ b/tools/phonon/Makefile
@@ -1,7 +1,7 @@
 .SUFFIXES : .o .cpp
 # compiler and flags
-CC     = g++ -Wno-unused-result
-LINK   = $(CC) -static
+CC     = g++ -Wall
+LINK   = $(CC)
 CFLAGS = -O3 $(DEBUG) $(UFLAG)
 #
 OFLAGS = -O3 $(DEBUG)
@@ -9,18 +9,17 @@ INC    = $(LPKINC) $(TCINC) $(SPGINC)
 LIB    = $(LPKLIB) $(TCLIB) $(SPGLIB)
 #
 # cLapack library needed
-LPKINC = -I/opt/libs/clapack/3.2.1/include
-LPKLIB = -L/opt/libs/clapack/3.2.1/lib -lclapack -lblas -lf2c #-lm
+LPKINC = 
+LPKLIB =-llapack
 #
-# Tricubic library needed
-TCINC = -I/opt/libs/tricubic/1.0/include
-TCLIB = -L/opt/libs/tricubic/1.0/lib -ltricubic
 #
 # spglib 1.8.2, used to get the irreducible q-points
 # if UFLAG is not set, spglib won't be used.
-UFLAG  = -DUseSPG
-SPGINC = -I/opt/libs/spglib/1.8.2/include
-SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg
+
+# UFLAG  = -DUseSPG
+# SPGINC = -I/opt/libs/spglib/1.8.2/include
+# SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg
+
 # if spglib other than version 1.8.2 is used, please 
 # modify file phonon.cpp, instruction can be found by searching 1.8.2
 
@@ -36,7 +35,7 @@ SRC = $(wildcard *.cpp)
 OBJ = $(SRC:.cpp=.o)
 
 #====================================================================
-all:  ver ${EXE}
+all:  ${EXE}
 
 ${EXE}: $(OBJ)
 	$(LINK) $(OFLAGS) $(OBJ) $(LIB) -o $@
@@ -59,3 +58,16 @@ ver:
 	$(CC) $(CFLAGS) -c $<
 .cpp.o:
 	$(CC) $(CFLAGS) $(INC) -c $<
+
+#====================================================================
+# dependencies
+disp.o: disp.cpp phonon.h dynmat.h memory.h interpolate.h green.h timer.h \
+ global.h
+dynmat.o: dynmat.cpp dynmat.h memory.h interpolate.h version.h global.h
+green.o: green.cpp green.h memory.h global.h
+interpolate.o: interpolate.cpp interpolate.h memory.h global.h
+main.o: main.cpp dynmat.h memory.h interpolate.h phonon.h
+memory.o: memory.cpp memory.h
+phonon.o: phonon.cpp phonon.h dynmat.h memory.h interpolate.h green.h \
+ timer.h global.h
+timer.o: timer.cpp timer.h
diff --git a/tools/phonon/README b/tools/phonon/README
index ae6383b6bd..b54d96d8a3 100644
--- a/tools/phonon/README
+++ b/tools/phonon/README
@@ -5,15 +5,9 @@
    analyse the phonon related information.
 #-------------------------------------------------------------------------------
 1. Dependencies
-   The clapack library is needed to solve the eigen problems,
-   which could be downloaded from:
-   http://www.netlib.org/clapack/
-   
-   The tricubic library is also needed to do tricubic interpolations,
-   which could be obtained from:
-      http://orca.princeton.edu/francois/software/tricubic/
-   or
-      http://1drv.ms/1J2WFYk
+   A LAPACK library is needed to solve the eigenvalue problems; the
+   reference implementation is available from http://www.netlib.org/lapack/
+   Intel MKL can be used as well.
    
    The spglib is optionally needed, enabling one to evaluate the
    phonon density of states or vibrational thermal properties
diff --git a/tools/phonon/disp.cpp b/tools/phonon/disp.cpp
index 2fa603916c..218e01e7fc 100644
--- a/tools/phonon/disp.cpp
+++ b/tools/phonon/disp.cpp
@@ -18,7 +18,8 @@ void Phonon::pdisp()
 {
   // ask the output file name and write the header.
   char str[MAXLINE];
-  for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n");
+  for (int ii = 0; ii < 80; ++ii) printf("=");
+  printf("\n");
 #ifdef UseSPG
   // ask method to generate q-lines
   int method = 2;
@@ -53,7 +54,6 @@ void Phonon::pdisp()
     while (1){
       for (int i = 0; i < 3; ++i) qstr[i] = qend[i];
   
-      int quit = 0;
       printf("\nPlease input the start q-point in unit of B1->B3, q to exit [%g %g %g]: ", qstr[0], qstr[1], qstr[2]);
       int n = count_words(fgets(str, MAXLINE, stdin));
       ptr = strtok(str, " \t\n\r\f");
@@ -2844,7 +2844,8 @@ void Phonon::pdisp()
     printf("\nPhonon dispersion data are written to: %s, you can visualize the results\n", fname);
     printf("by invoking: `gnuplot pdisp.gnuplot; gv pdisp.eps`\n");
   }
-  for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n");
+  for (int ii = 0; ii < 80; ++ii) printf("=");
+  printf("\n");
 
   delete []fname;
   nodes.clear();
diff --git a/tools/phonon/dynmat.cpp b/tools/phonon/dynmat.cpp
index e82f473130..3b7bfe8268 100644
--- a/tools/phonon/dynmat.cpp
+++ b/tools/phonon/dynmat.cpp
@@ -3,6 +3,11 @@
 #include "version.h"
 #include "global.h"
 
+extern "C" void zheevd_(char *, char *, long int *, doublecomplex *,
+                       long int *, double *, doublecomplex *,
+                       long int *, double *, long int *, long int *,
+                       long int *, long int *);  // NOTE(review): assumes LAPACK INTEGER == long int; confirm for -llapack
+
 // to initialize the class
 DynMat::DynMat(int narg, char **arg)
 {
@@ -81,7 +86,8 @@ DynMat::DynMat(int narg, char **arg)
   printf("Number of atoms per unit cell     : %d\n", nucell);
   printf("System dimension                  : %d\n", sysdim);
   printf("Boltzmann constant in used units  : %g\n", boltz);
-  for (int i = 0; i < 80; ++i) printf("="); printf("\n");
+  for (int i = 0; i < 80; ++i) printf("=");
+  printf("\n");
   if (sysdim < 1||sysdim > 3||nx < 1||ny < 1||nz < 1||nucell < 1){
     printf("Wrong values read from header of file: %s, please check the binary file!\n", binfile);
     fclose(fp); exit(3);
@@ -117,11 +123,11 @@ DynMat::DynMat(int narg, char **arg)
   memory->create(attyp, nucell,         "DynMat:attyp");
   memory->create(M_inv_sqrt, nucell,    "DynMat:M_inv_sqrt");
   
-  if ( fread(&Tmeasure,      sizeof(double), 1,      fp) != 1     ){printf("\nError while reading temperature from file: %s\n",   binfile); fclose(fp); exit(3);}
-  if ( fread(&basevec[0],    sizeof(double), 9,      fp) != 9     ){printf("\nError while reading lattice info from file: %s\n",  binfile); fclose(fp); exit(3);}
-  if ( fread(basis[0],       sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n",    binfile); fclose(fp); exit(3);}
-  if ( fread(&attyp[0],      sizeof(int),    nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n",    binfile); fclose(fp); exit(3);}
-  if ( fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&Tmeasure,      sizeof(double), 1,      fp) != 1     ){printf("\nError while reading temperature from file: %s\n",   binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&basevec[0],    sizeof(double), 9,      fp) != 9     ){printf("\nError while reading lattice info from file: %s\n",  binfile); fclose(fp); exit(3);}
+  if ( (int) fread(basis[0],       sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n",    binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&attyp[0],      sizeof(int),    nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n",    binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);}
   fclose(fp);
 
   car2dir();
@@ -229,9 +235,9 @@ return;
 int DynMat::geteigen(double *egv, int flag)
 {
   char jobz, uplo;
-  integer n, lda, lwork, lrwork, *iwork, liwork, info;
+  long int n, lda, lwork, lrwork, *iwork, liwork, info;
   doublecomplex *work;
-  doublereal *w = &egv[0], *rwork;
+  double *w = &egv[0], *rwork;
 
   n     = fftdim;
   if (flag) jobz = 'V';
@@ -338,7 +344,8 @@ void DynMat::EnforceASR()
   char *ptr = strtok(str," \t\n\r\f");
   if (ptr) nasr = atoi(ptr);
   if (nasr < 1){
-    for (int i=0; i<80; i++) printf("="); printf("\n");
+    for (int i=0; i<80; i++) printf("=");
+    printf("\n");
     return;
   }
 
@@ -404,7 +411,8 @@ void DynMat::EnforceASR()
     if (i == 99){ printf("...... (%d more skiped)", fftdim-100); break;}
   }
   printf("\n");
-  for (int i = 0; i < 80; ++i) printf("="); printf("\n\n");
+  for (int i = 0; i < 80; ++i) printf("=");
+  printf("\n\n");
 
 return;
 }
@@ -456,7 +464,7 @@ return;
  * --------------------------------------------------------------------*/
 void DynMat::GaussJordan(int n, double *Mat)
 {
-  int i,icol,irow,j,k,l,ll,idr,idc;
+  int i,icol=0,irow=0,j,k,l,ll,idr,idc;
   int *indxc,*indxr,*ipiv;
   double big, nmjk;
   double dum, pivinv;
diff --git a/tools/phonon/dynmat.h b/tools/phonon/dynmat.h
index 1d6e716584..f5bd4010b8 100644
--- a/tools/phonon/dynmat.h
+++ b/tools/phonon/dynmat.h
@@ -7,11 +7,6 @@
 #include "memory.h"
 #include "interpolate.h"
 
-extern "C"{
-#include "f2c.h"
-#include "clapack.h"
-}
-
 using namespace std;
 
 class DynMat {
diff --git a/tools/phonon/green.cpp b/tools/phonon/green.cpp
index 8f8946dc4f..35514c03fb 100644
--- a/tools/phonon/green.cpp
+++ b/tools/phonon/green.cpp
@@ -224,7 +224,6 @@ void Green::recursion()
 {
   // local variables
   std::complex<double> Z, rec_x, rec_x_inv;
-  std::complex<double> cunit = std::complex<double>(0.,1.);
 
   double w = wmin;
 
diff --git a/tools/phonon/interpolate.cpp b/tools/phonon/interpolate.cpp
index 8c0cbde1ce..954062d415 100644
--- a/tools/phonon/interpolate.cpp
+++ b/tools/phonon/interpolate.cpp
@@ -1,7 +1,125 @@
 #include "interpolate.h"
-#include "math.h"
+#include <math.h>
 #include "global.h"
 
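+// tricubic library code, embedded here in place of the external <tricubic.h>
+// include; A is the 64x64 integer matrix applied by tricubic_get_coeff_stacked().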
+static int A[64][64] = {
+{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 9,-9,-9, 9, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 6,-6, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 6,-6, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 4,-4,-4, 4, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0},
+{-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 9,-9, 0, 0,-9, 9, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 0, 0, 6,-6, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9, 0, 0,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0},
+{ 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0},
+{-27,27,27,-27,27,-27,-27,27,-18,-9,18, 9,18, 9,-18,-9,-18,18,-9, 9,18,-18, 9,-9,-18,18,18,-18,-9, 9, 9,-9,-12,-6,-6,-3,12, 6, 6, 3,-12,-6,12, 6,-6,-3, 6, 3,-12,12,-6, 6,-6, 6,-3, 3,-8,-4,-4,-2,-4,-2,-2,-1},
+{18,-18,-18,18,-18,18,18,-18, 9, 9,-9,-9,-9,-9, 9, 9,12,-12, 6,-6,-12,12,-6, 6,12,-12,-12,12, 6,-6,-6, 6, 6, 6, 3, 3,-6,-6,-3,-3, 6, 6,-6,-6, 3, 3,-3,-3, 8,-8, 4,-4, 4,-4, 2,-2, 4, 4, 2, 2, 2, 2, 1, 1},
+{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0},
+{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6, 9,-9, 9,-9,-9, 9,-9, 9,12,-12,-12,12, 6,-6,-6, 6, 6, 3, 6, 3,-6,-3,-6,-3, 8, 4,-8,-4, 4, 2,-4,-2, 6,-6, 6,-6, 3,-3, 3,-3, 4, 2, 4, 2, 2, 1, 2, 1},
+{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-6, 6,-6, 6, 6,-6, 6,-6,-8, 8, 8,-8,-4, 4, 4,-4,-3,-3,-3,-3, 3, 3, 3, 3,-4,-4, 4, 4,-2,-2, 2, 2,-4, 4,-4, 4,-2, 2,-2, 2,-2,-2,-2,-2,-1,-1,-1,-1},
+{ 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 0, 0, 6,-6, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 4,-4, 0, 0,-4, 4, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4, 0, 0,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0},
+{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0},
+{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6,12,-12, 6,-6,-12,12,-6, 6, 9,-9,-9, 9, 9,-9,-9, 9, 8, 4, 4, 2,-8,-4,-4,-2, 6, 3,-6,-3, 6, 3,-6,-3, 6,-6, 3,-3, 6,-6, 3,-3, 4, 2, 2, 1, 4, 2, 2, 1},
+{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-8, 8,-4, 4, 8,-8, 4,-4,-6, 6, 6,-6,-6, 6, 6,-6,-4,-4,-2,-2, 4, 4, 2, 2,-3,-3, 3, 3,-3,-3, 3, 3,-4, 4,-2, 2,-4, 4,-2, 2,-2,-2,-1,-1,-2,-2,-1,-1},
+{ 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+{-12,12,12,-12,12,-12,-12,12,-8,-4, 8, 4, 8, 4,-8,-4,-6, 6,-6, 6, 6,-6, 6,-6,-6, 6, 6,-6,-6, 6, 6,-6,-4,-2,-4,-2, 4, 2, 4, 2,-4,-2, 4, 2,-4,-2, 4, 2,-3, 3,-3, 3,-3, 3,-3, 3,-2,-1,-2,-1,-2,-1,-2,-1},
+{ 8,-8,-8, 8,-8, 8, 8,-8, 4, 4,-4,-4,-4,-4, 4, 4, 4,-4, 4,-4,-4, 4,-4, 4, 4,-4,-4, 4, 4,-4,-4, 4, 2, 2, 2, 2,-2,-2,-2,-2, 2, 2,-2,-2, 2, 2,-2,-2, 2,-2, 2,-2, 2,-2, 2,-2, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+// Flatten the index triple (i,j,k) to the linear offset i + 4*j + 16*k.
+static int ijk2n(int i, int j, int k)
+{ return i + 4*(j + 4*k); }
+
+/* ----------------------------------------------------------------------------
+ * Compute a = A*x: multiply the 64x64 integer table A by the stacked vector x.
+ * ---------------------------------------------------------------------------- */
+static void tricubic_get_coeff_stacked(double a[64], double x[64]) {
+  for (int row=0;row<64;row++) {
+    const int *coef=A[row];
+    // accumulate directly in a[row], one term per column in ascending order
+    a[row]=0.0;
+    for (int col=0;col<64;col++) a[row]+=coef[col]*x[col];
+  }
+}
+
+/* ----------------------------------------------------------------------------
+ * Stack f and its seven derivative arrays (8 entries each), in argument order,
+ * into one 64-vector and hand it to tricubic_get_coeff_stacked() to fill a.
+ * ---------------------------------------------------------------------------- */
+static void tricubic_get_coeff(double a[64], double f[8], double dfdx[8], double dfdy[8], double dfdz[8], double d2fdxdy[8], double d2fdxdz[8], double d2fdydz[8], double d3fdxdydz[8]) {
+  double *parts[8]={f,dfdx,dfdy,dfdz,d2fdxdy,d2fdxdz,d2fdydz,d3fdxdydz};
+  double stacked[64];
+  for (int p=0;p<8;p++) {
+    double *dst=stacked+8*p;   // input p occupies stacked[8*p .. 8*p+7]
+    for (int c=0;c<8;c++) {
+      dst[c]=parts[p][c];
+    }
+  }
+  tricubic_get_coeff_stacked(a,stacked);
+}
+
+/* ----------------------------------------------------------------------------
+ * Short tricubic_eval: value at (x,y,z) of sum a[ijk2n(i,j,k)]*x^i*y^j*z^k over
+ * i,j,k = 0..3. Partial derivatives need the full version (extra arguments).
+ * ---------------------------------------------------------------------------- */
+static double tricubic_eval(double a[64], double x, double y, double z) {
+  double ret=0.0;
+  for (int i=0;i<4;i++) {
+    const double xi=pow(x,i);       // invariant over j and k: computed once per i
+    for (int j=0;j<4;j++) {
+      const double yj=pow(y,j);     // invariant over k: computed once per (i,j)
+      for (int k=0;k<4;k++) {
+        ret+=a[ijk2n(i,j,k)]*xi*yj*pow(z,k);
+      }
+    }
+  }
+  return(ret);
+}
+
 /* ----------------------------------------------------------------------------
  * Constructor used to get info from caller, and prepare other necessary data
  * ---------------------------------------------------------------------------- */
@@ -274,7 +392,8 @@ void Interpolate::set_method()
 
   which =2-im%2;
   printf("Your  selection: %d\n", which);
-  for(int i=0; i<80; i++) printf("="); printf("\n\n");
+  for(int i=0; i<80; i++) printf("=");
+  printf("\n\n");
 
   if (which == 1) tricubic_init();
 
@@ -306,4 +425,3 @@ void Interpolate::reset_gamma()
 
 return;
 }
-/* ---------------------------------------------------------------------------- */
diff --git a/tools/phonon/interpolate.h b/tools/phonon/interpolate.h
index e192fcac87..04a358ae71 100644
--- a/tools/phonon/interpolate.h
+++ b/tools/phonon/interpolate.h
@@ -5,11 +5,8 @@
 #include "stdlib.h"
 #include "string.h"
 #include "memory.h"
-#include <tricubic.h>
-extern "C"{
-#include "f2c.h"
-#include "clapack.h"
-}
+
+extern "C" typedef struct { double r, i; } doublecomplex;  // local definition now that f2c.h/clapack.h are no longer included; r/i are presumably the real and imaginary parts (TODO confirm)
 
 using namespace std;
 
diff --git a/tools/phonon/phonon.cpp b/tools/phonon/phonon.cpp
index 43bea111b4..065885cf3f 100644
--- a/tools/phonon/phonon.cpp
+++ b/tools/phonon/phonon.cpp
@@ -42,7 +42,8 @@ Phonon::Phonon(DynMat *dm)
     printf("\n");
     for (int i = 0; i < 37; ++i) printf("=");
     printf(" Menu ");
-    for (int i = 0; i < 37; ++i) printf("="); printf("\n");
+    for (int i = 0; i < 37; ++i) printf("=");
+    printf("\n");
     printf("  1. Phonon DOS evaluation;\n");
     printf("  2. Phonon dispersion curves;\n");
     printf("  3. Dynamical matrix at arbitrary q;\n");
@@ -60,7 +61,8 @@ Phonon::Phonon(DynMat *dm)
     printf("Your choice [0]: ");
     if (count_words(fgets(str,MAXLINE,stdin)) > 0) job = atoi(strtok(str," \t\n\r\f"));
     printf("\nYour  selection: %d\n", job);
-    for (int i = 0; i < 80; ++i) printf("=");printf("\n\n");
+    for (int i = 0; i < 80; ++i) printf("=");
+    printf("\n\n");
 
     // now to do the job according to user's choice
     if      (job == 1) pdos();
@@ -414,7 +416,8 @@ void Phonon::vfanyq()
     dynmat->geteigen(egvs, 0);
     printf("q-point: [%lg %lg %lg], ", q[0], q[1], q[2]);
     printf("vibrational frequencies at this q-point:\n");
-    for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); printf("\n\n");
+    for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]);
+    printf("\n\n");
   }
 
 return;
@@ -1001,7 +1004,8 @@ void Phonon::ShowCell()
   printf("\n");
   for (int i = 0; i < 30; ++i) printf("=");
   printf("   Unit Cell Info   ");
-  for (int i = 0; i < 30; ++i) printf("="); printf("\n");
+  for (int i = 0; i < 30; ++i) printf("=");
+  printf("\n");
   printf("Number of atoms in the unit cell: %d\n", dynmat->nucell);
   printf("Basis  vectors  of the unit cell:\n");
   printf("  %15.8f  %15.8f  %15.8f\n", dynmat->basevec[0],  dynmat->basevec[1],  dynmat->basevec[2]);
@@ -1091,7 +1095,7 @@ int Phonon::count_words(const char *line)
   strcpy(copy,line);
 
   char *ptr;
-  if (ptr = strchr(copy,'#')) *ptr = '\0';
+  if ((ptr = strchr(copy,'#'))) *ptr = '\0';
 
   if (strtok(copy," \t\n\r\f") == NULL) {
     memory->destroy(copy);
diff --git a/tools/phonon/version.h b/tools/phonon/version.h
index 8ed0e80aa7..decab631b0 100644
--- a/tools/phonon/version.h
+++ b/tools/phonon/version.h
@@ -1 +1 @@
-#define VERSION 7
+#define VERSION 8