From d6316c40d94efe8b0259cea2ce71bc101d5a6bcc Mon Sep 17 00:00:00 2001
From: Christoph Junghans <junghans@lanl.gov>
Date: Fri, 22 Sep 2017 15:17:44 -0600
Subject: [PATCH 01/53] cmake: fix build with system latte

---
 cmake/CMakeLists.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bc33da60de..666b77ae3d 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -665,7 +665,9 @@ include_directories(${LAMMPS_STYLE_HEADERS_DIR})
 ############################################
 add_library(lammps ${LIB_SOURCES})
 target_link_libraries(lammps ${LAMMPS_LINK_LIBS})
-add_dependencies(lammps ${LAMMPS_DEPS})
+if(LAMMPS_DEPS)
+  add_dependencies(lammps ${LAMMPS_DEPS})
+endif()
 set_target_properties(lammps PROPERTIES OUTPUT_NAME lammps${LAMMPS_MACHINE})
 if(BUILD_SHARED_LIBS)
   set_target_properties(lammps PROPERTIES SOVERSION ${SOVERSION})

From 78a486c0fdfe0b8a8fa02c1e8bfac9c4bda751a7 Mon Sep 17 00:00:00 2001
From: Julien Devemy <julien.devemy@uca.fr>
Date: Mon, 25 Sep 2017 16:18:08 +0200
Subject: [PATCH 02/53] Authorize hybrid/overlay for fix srp

---
 src/USER-MISC/fix_srp.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-MISC/fix_srp.cpp b/src/USER-MISC/fix_srp.cpp
index f3dec42a83..e1e5f579b8 100644
--- a/src/USER-MISC/fix_srp.cpp
+++ b/src/USER-MISC/fix_srp.cpp
@@ -98,7 +98,7 @@ int FixSRP::setmask()
 
 void FixSRP::init()
 {
-  if (force->pair_match("hybrid",1) == NULL)
+  if (force->pair_match("hybrid",1) == NULL && force->pair_match("hybrid/overlay",1) == NULL)
     error->all(FLERR,"Cannot use pair srp without pair_style hybrid");
 
   int has_rigid = 0;

From 789812ec3dcef78579ee42958d3c24a3b7792b3b Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 22 Sep 2017 14:46:53 -0500
Subject: [PATCH 03/53] KOKKOS: minor typo fix

---
 src/KOKKOS/npair_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index b568bd5c93..8d6648bf2b 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -164,8 +164,8 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   k_ex_mol_group.sync<DeviceType>();
   k_ex_mol_bit.sync<DeviceType>();
   k_ex_mol_intra.sync<DeviceType>();
-  k_bincount.sync<DeviceType>(),
-  k_bins.sync<DeviceType>(),
+  k_bincount.sync<DeviceType>();
+  k_bins.sync<DeviceType>();
   atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
 
   data.special_flag[0] = special_flag[0];

From 32e0de7a67a1c17d3b4f948847ca3e4b6e35e5cb Mon Sep 17 00:00:00 2001
From: Tim Mattox <timothy.mattox@engilitycorp.com>
Date: Fri, 22 Sep 2017 15:10:53 -0500
Subject: [PATCH 04/53] first pass at implementing atom2bin for KOKKOS neighbor
 lists

---
 src/KOKKOS/nbin_kokkos.cpp  |  5 +++++
 src/KOKKOS/nbin_kokkos.h    |  2 ++
 src/KOKKOS/npair_kokkos.cpp |  7 +++++--
 src/KOKKOS/npair_kokkos.h   | 37 +++++--------------------------------
 4 files changed, 17 insertions(+), 34 deletions(-)

diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp
index c7e815928a..b06d46d520 100644
--- a/src/KOKKOS/nbin_kokkos.cpp
+++ b/src/KOKKOS/nbin_kokkos.cpp
@@ -75,6 +75,10 @@ void NBinKokkos<DeviceType>::bin_atoms_setup(int nall)
     k_bincount = DAT::tdual_int_1d("Neighbor::d_bincount",mbins);
     bincount = k_bincount.view<DeviceType>();
   }
+  if (nall > k_atom2bin.d_view.dimension_0()) {
+    k_atom2bin = DAT::tdual_int_1d("Neighbor::d_atom2bin",nall);
+    atom2bin = k_atom2bin.view<DeviceType>();
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -125,6 +129,7 @@ void NBinKokkos<DeviceType>::binatomsItem(const int &i) const
 {
   const int ibin = coord2bin(x(i, 0), x(i, 1), x(i, 2));
 
+  atom2bin(i) = ibin;
   const int ac = Kokkos::atomic_fetch_add(&bincount[ibin], (int)1);
   if(ac < bins.dimension_1()) {
     bins(ibin, ac) = i;
diff --git a/src/KOKKOS/nbin_kokkos.h b/src/KOKKOS/nbin_kokkos.h
index de3cf41d19..bf2ccc5908 100644
--- a/src/KOKKOS/nbin_kokkos.h
+++ b/src/KOKKOS/nbin_kokkos.h
@@ -44,11 +44,13 @@ class NBinKokkos : public NBinStandard {
   int atoms_per_bin;
   DAT::tdual_int_1d k_bincount;
   DAT::tdual_int_2d k_bins;
+  DAT::tdual_int_1d k_atom2bin;
 
   typename AT::t_int_1d bincount;
   const typename AT::t_int_1d_const c_bincount;
   typename AT::t_int_2d bins;
   typename AT::t_int_2d_const c_bins;
+  typename AT::t_int_1d atom2bin;
   typename AT::t_int_scalar d_resize;
   typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
   typename AT::t_x_array_randomread x;
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index 8d6648bf2b..2f9e6e0b43 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -73,6 +73,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_bin_info()
   atoms_per_bin = nbKK->atoms_per_bin;
   k_bincount = nbKK->k_bincount;
   k_bins = nbKK->k_bins;
+  k_atom2bin = nbKK->k_atom2bin;
 }
 
 /* ----------------------------------------------------------------------
@@ -122,6 +123,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
          k_cutneighsq.view<DeviceType>(),
          k_bincount.view<DeviceType>(),
          k_bins.view<DeviceType>(),
+         k_atom2bin.view<DeviceType>(),
          nstencil,
          k_stencil.view<DeviceType>(),
          k_stencilxyz.view<DeviceType>(),
@@ -166,6 +168,7 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   k_ex_mol_intra.sync<DeviceType>();
   k_bincount.sync<DeviceType>();
   k_bins.sync<DeviceType>();
+  k_atom2bin.sync<DeviceType>();
   atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
 
   data.special_flag[0] = special_flag[0];
@@ -317,7 +320,7 @@ void NeighborKokkosExecute<DeviceType>::
   const X_FLOAT ztmp = x(i, 2);
   const int itype = type(i);
 
-  const int ibin = coord2bin(xtmp, ytmp, ztmp);
+  const int ibin = c_atom2bin(i);
 
   const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil
     = d_stencil;
@@ -678,7 +681,7 @@ void NeighborKokkosExecute<DeviceType>::
   // no molecular test when i = ghost atom
 
   if (i < nlocal) {
-    const int ibin = coord2bin(xtmp, ytmp, ztmp);
+    const int ibin = c_atom2bin(i);
     for (int k = 0; k < nstencil; k++) {
       const int jbin = ibin + stencil[k];
       for(int m = 0; m < c_bincount(jbin); m++) {
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index 517ea546fa..6c1c0e958b 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -105,6 +105,7 @@ class NPairKokkos : public NPair {
   int atoms_per_bin;
   DAT::tdual_int_1d k_bincount;
   DAT::tdual_int_2d k_bins;
+  DAT::tdual_int_1d k_atom2bin;
 
   // data from NStencil class
 
@@ -148,6 +149,8 @@ class NeighborKokkosExecute
   const typename AT::t_int_1d_const c_bincount;
   typename AT::t_int_2d bins;
   typename AT::t_int_2d_const c_bins;
+  const typename AT::t_int_1d atom2bin;
+  const typename AT::t_int_1d_const c_atom2bin;
 
 
   // data from NStencil class
@@ -190,6 +193,7 @@ class NeighborKokkosExecute
                         const typename AT::t_xfloat_2d_randomread &_cutneighsq,
                         const typename AT::t_int_1d &_bincount,
                         const typename AT::t_int_2d &_bins,
+                        const typename AT::t_int_1d &_atom2bin,
                         const int _nstencil,
                         const typename AT::t_int_1d &_d_stencil,
                         const typename AT::t_int_1d_3 &_d_stencilxyz,
@@ -224,6 +228,7 @@ class NeighborKokkosExecute
                         const int & _xprd_half, const int & _yprd_half, const int & _zprd_half):
     neigh_list(_neigh_list), cutneighsq(_cutneighsq),
     bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
+    atom2bin(_atom2bin),c_atom2bin(_atom2bin),
     nstencil(_nstencil),d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),
     nlocal(_nlocal),
     x(_x),type(_type),mask(_mask),molecule(_molecule),
@@ -281,38 +286,6 @@ class NeighborKokkosExecute
   void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
 #endif
 
-  KOKKOS_INLINE_FUNCTION
-  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
-  {
-    int ix,iy,iz;
-
-    if (x >= bboxhi[0])
-      ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
-    else if (x >= bboxlo[0]) {
-      ix = static_cast<int> ((x-bboxlo[0])*bininvx);
-      ix = MIN(ix,nbinx-1);
-    } else
-      ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
-
-    if (y >= bboxhi[1])
-      iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
-    else if (y >= bboxlo[1]) {
-      iy = static_cast<int> ((y-bboxlo[1])*bininvy);
-      iy = MIN(iy,nbiny-1);
-    } else
-      iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
-
-    if (z >= bboxhi[2])
-      iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
-    else if (z >= bboxlo[2]) {
-      iz = static_cast<int> ((z-bboxlo[2])*bininvz);
-      iz = MIN(iz,nbinz-1);
-    } else
-      iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
-
-    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
-  }
-
   KOKKOS_INLINE_FUNCTION
   int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
   {

From 836a6d292c10a4c1d8a77b3586d2ebeb2858cf27 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 25 Sep 2017 16:31:39 -0400
Subject: [PATCH 05/53] whitespace fixes, silence compiler warning about too
 few format specifiers

---
 src/USER-MANIFOLD/manifold_gaussian_bump.cpp | 10 +++++-----
 src/finish.cpp                               |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
index db8c589afb..a9ee35bbfc 100644
--- a/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
+++ b/src/USER-MANIFOLD/manifold_gaussian_bump.cpp
@@ -134,7 +134,7 @@ public:
 // Manifold itself:
 manifold_gaussian_bump::manifold_gaussian_bump(class LAMMPS* lmp,
                                                int narg, char **arg)
-	: manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
+        : manifold(lmp), lut_z(NULL), lut_zp(NULL) {}
 
 
 manifold_gaussian_bump::~manifold_gaussian_bump()
@@ -361,13 +361,13 @@ void manifold_gaussian_bump::test_lut()
     n( x, nn );
     double taper_z;
     if( xx <= rc1 ){
-	    taper_z = gaussian_bump(xx);
+            taper_z = gaussian_bump(xx);
     }else if( xx < rc2 ){
-	    taper_z = lut_get_z( xx );
+            taper_z = lut_get_z( xx );
     }else{
-	    taper_z = 0.0;
+            taper_z = 0.0;
     }
-    fprintf( fp, "%g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
+    fprintf( fp, "%g %g %g %g %g %g %g\n", xx, gaussian_bump(xx), taper_z,
              gg, nn[0], nn[1], nn[2] );
   }
   fclose(fp);
diff --git a/src/finish.cpp b/src/finish.cpp
index 45e9226388..c22ecaae60 100644
--- a/src/finish.cpp
+++ b/src/finish.cpp
@@ -130,7 +130,7 @@ void Finish::end(int flag)
                           atom->natoms);
       if (logfile) fprintf(logfile,fmt1,time_loop,ntasks,update->nsteps,
                            atom->natoms);
-      
+
       // Gromacs/NAMD-style performance metric for suitable unit settings
 
       if ( timeflag && !minflag && !prdflag && !tadflag &&
@@ -144,7 +144,7 @@ void Finish::end(int flag)
         double one_fs = force->femtosecond;
         double t_step = ((double) time_loop) / ((double) update->nsteps);
         double step_t = 1.0/t_step;
-        
+
         if (strcmp(update->unit_style,"lj") == 0) {
           double tau_day = 24.0*3600.0 / t_step * update->dt / one_fs;
           const char perf[] = "Performance: %.3f tau/day, %.3f timesteps/s\n";
@@ -161,7 +161,7 @@ void Finish::end(int flag)
       }
 
       // CPU use on MPI tasks and OpenMP threads
-      
+
       if (timeflag) {
         if (lmp->kokkos) {
           const char fmt2[] =

From e6969002ce55f10db0a7bebd073b2f93f947f14b Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 25 Sep 2017 19:37:37 -0400
Subject: [PATCH 06/53] having plain filelink instead of filelink.o confuses
 KOKKOS linking with nvcc

---
 lib/latte/Install.py               | 6 +++---
 lib/latte/Makefile.lammps.gfortran | 2 +-
 lib/latte/Makefile.lammps.ifort    | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/latte/Install.py b/lib/latte/Install.py
index b3e771e4cc..37cb5d6b17 100644
--- a/lib/latte/Install.py
+++ b/lib/latte/Install.py
@@ -159,13 +159,13 @@ if buildflag or pathflag:
     os.remove("includelink")
   if os.path.isfile("liblink") or os.path.islink("liblink"):
     os.remove("liblink")
-  if os.path.isfile("filelink") or os.path.islink("filelink"):
-    os.remove("filelink")
+  if os.path.isfile("filelink.o") or os.path.islink("filelink.o"):
+    os.remove("filelink.o")
   cmd = 'ln -s "%s/src" includelink' % lattedir
   subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
   cmd = 'ln -s "%s" liblink' % lattedir
   subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
-  cmd = 'ln -s "%s/src/latte_c_bind.o" filelink' % lattedir
+  cmd = 'ln -s "%s/src/latte_c_bind.o" filelink.o' % lattedir
   subprocess.check_output(cmd,stderr=subprocess.STDOUT,shell=True)
 
 # copy Makefile.lammps.suffix to Makefile.lammps
diff --git a/lib/latte/Makefile.lammps.gfortran b/lib/latte/Makefile.lammps.gfortran
index 921721552b..6aa7782f8a 100644
--- a/lib/latte/Makefile.lammps.gfortran
+++ b/lib/latte/Makefile.lammps.gfortran
@@ -3,5 +3,5 @@
 # GNU Fortran settings
 
 latte_SYSINC = 
-latte_SYSLIB = ../../lib/latte/filelink -llatte -lgfortran -llapack -lblas
+latte_SYSLIB = ../../lib/latte/filelink.o -llatte -lgfortran -llapack -lblas
 latte_SYSPATH = -fopenmp
diff --git a/lib/latte/Makefile.lammps.ifort b/lib/latte/Makefile.lammps.ifort
index 23d2b32fcc..0491bdd8a5 100644
--- a/lib/latte/Makefile.lammps.ifort
+++ b/lib/latte/Makefile.lammps.ifort
@@ -3,7 +3,7 @@
 # Intel ifort settings
 
 latte_SYSINC = 
-latte_SYSLIB = ../../lib/latte/filelink \
+latte_SYSLIB = ../../lib/latte/filelink.o \
                -llatte -lifcore -lsvml -lompstub -limf -lmkl_intel_lp64 \
                -lmkl_intel_thread -lmkl_core -lmkl_intel_thread -lpthread \
                -openmp -O0

From 0573aaa6da3a7a439b347b9974d6f596078d8479 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 25 Sep 2017 19:37:55 -0400
Subject: [PATCH 07/53] update src/.gitignore for LATTE package

---
 src/.gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/.gitignore b/src/.gitignore
index 1571065b72..13518abbe8 100644
--- a/src/.gitignore
+++ b/src/.gitignore
@@ -405,6 +405,8 @@
 /fix_lambdah_calc.h
 /fix_langevin_eff.cpp
 /fix_langevin_eff.h
+/fix_latte.cpp
+/fix_latte.h
 /fix_lb_fluid.cpp
 /fix_lb_fluid.h
 /fix_lb_momentum.cpp

From 38530415c8f0cd81d5cb57215a5b09eec877c917 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 25 Sep 2017 20:03:53 -0400
Subject: [PATCH 08/53] -ltbbmalloc is required

---
 src/MAKE/OPTIONS/Makefile.intel_cpu_mpich | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
index 40d517bce4..7ca59e7b1c 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@@ -15,7 +15,7 @@ DEPFLAGS =	-M
 
 LINK =		mpicxx -cxx=icc
 LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           
+LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar

From b60cff7e7773573b0a6a2619e7e8e4c8ee9148b1 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 25 Sep 2017 23:15:49 -0400
Subject: [PATCH 09/53] USER-OMP package depends on USER-DRUDE

---
 src/Depend.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/Depend.sh b/src/Depend.sh
index 9463607960..e1c812ebc2 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -119,6 +119,10 @@ if (test $1 = "USER-DPD") then
   depend KOKKOS
 fi
 
+if (test $1 = "USER-DRUDE") then
+  depend USER-OMP
+fi
+
 if (test $1 = "USER-FEP") then
   depend USER-OMP
 fi

From 53e4ee4f2dc9b11f8fd5e54d78ce19acb6361e0e Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 25 Sep 2017 23:20:42 -0400
Subject: [PATCH 10/53] need to re-init timers after initial setup

---
 src/REPLICA/prd.cpp | 1 +
 src/REPLICA/tad.cpp | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/REPLICA/prd.cpp b/src/REPLICA/prd.cpp
index 30ebc779c5..14eeac8d66 100644
--- a/src/REPLICA/prd.cpp
+++ b/src/REPLICA/prd.cpp
@@ -310,6 +310,7 @@ void PRD::command(int narg, char **arg)
   time_dephase = time_dynamics = time_quench = time_comm = time_output = 0.0;
   bigint clock = 0;
 
+  timer->init();
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 
diff --git a/src/REPLICA/tad.cpp b/src/REPLICA/tad.cpp
index 5a4d885224..347cd3ba67 100644
--- a/src/REPLICA/tad.cpp
+++ b/src/REPLICA/tad.cpp
@@ -274,6 +274,7 @@ void TAD::command(int narg, char **arg)
   nbuild = ndanger = 0;
   time_neb = time_dynamics = time_quench = time_comm = time_output = 0.0;
 
+  timer->init();
   timer->barrier_start();
   time_start = timer->get_wall(Timer::TOTAL);
 

From 8bba6d3e8c63cf66078e3671be15581c1bb94203 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 26 Sep 2017 16:52:10 -0400
Subject: [PATCH 11/53] correct formatting and broken/colliding link issues
 with LATTE package related documentation

---
 doc/src/Section_packages.txt | 1 +
 doc/src/fix_latte.txt        | 6 +++---
 doc/src/fixes.txt            | 1 +
 doc/src/pair_eam.txt         | 4 ++--
 4 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index d9a9fb4163..7539d99cd0 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -728,6 +728,7 @@ make lib-latte args="-b"                # download and build in lib/latte/LATTE-
 make lib-latte args="-p $HOME/latte"    # use existing LATTE installation in $HOME/latte
 make lib-latte args="-b -m gfortran"    # download and build in lib/latte and 
                                         #   copy Makefile.lammps.gfortran to Makefile.lammps
+:pre
 
 Note that 3 symbolic (soft) links, "includelink" and "liblink" and
 "filelink", are created in lib/latte to point into the LATTE home dir.
diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt
index f78e13b866..17b3335ad7 100644
--- a/doc/src/fix_latte.txt
+++ b/doc/src/fix_latte.txt
@@ -66,7 +66,7 @@ reference charge of overlapping atom-centered densities and bond
 integrals are parameterized using a Slater-Koster tight-binding
 approach. This procedure, which usually is referred to as the DFTB
 method has been described in detail by ("Elstner"_#Elstner) and
-("Finnis"_#Finnis) and coworkers. 
+("Finnis"_#Finnis2) and coworkers. 
 
 The work of the LATTE developers follows that of Elstner closely with
 respect to the physical model.  However, the development of LATTE is
@@ -173,7 +173,7 @@ M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
 M. Haugk, T. Frauenheim, S. Suhai, and G. Seifert, Phys. Rev. B, 58,
 7260 (1998).
 
-:link(Finnis)
+:link(Finnis2)
 [(Finnis)] M. W. Finnis, A. T. Paxton, M. Methfessel, and M. van
 Schilfgarde, Phys. Rev. Lett., 81, 5149 (1998).
 
@@ -197,7 +197,7 @@ J. Sci. Comput. 36 (2), 147-170, (2014).
 [(Niklasson2014)] A. M. N. Niklasson and M. Cawkwell, J. Chem. Phys.,
 141, 164123, (2014).
 
-:link(Niklasson2014)
+:link(Niklasson2017)
 [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017).
 
 :link(Niklasson2012)
diff --git a/doc/src/fixes.txt b/doc/src/fixes.txt
index 7000a66c51..e363273a75 100644
--- a/doc/src/fixes.txt
+++ b/doc/src/fixes.txt
@@ -59,6 +59,7 @@ Fixes :h1
    fix_langevin
    fix_langevin_drude
    fix_langevin_eff
+   fix_latte
    fix_lb_fluid
    fix_lb_momentum
    fix_lb_pc
diff --git a/doc/src/pair_eam.txt b/doc/src/pair_eam.txt
index a0026432ec..03e77f53ab 100644
--- a/doc/src/pair_eam.txt
+++ b/doc/src/pair_eam.txt
@@ -294,7 +294,7 @@ distribution have a ".cdeam" suffix.
 
 Style {eam/fs} computes pairwise interactions for metals and metal
 alloys using a generalized form of EAM potentials due to Finnis and
-Sinclair "(Finnis)"_#Finnis.  The total energy Ei of an atom I is
+Sinclair "(Finnis)"_#Finnis1.  The total energy Ei of an atom I is
 given by
 
 :c,image(Eqs/pair_eam_fs.jpg)
@@ -442,7 +442,7 @@ of Physics: Condensed Matter, 16, S2629 (2004).
 [(Daw)] Daw, Baskes, Phys Rev Lett, 50, 1285 (1983).
 Daw, Baskes, Phys Rev B, 29, 6443 (1984).
 
-:link(Finnis)
+:link(Finnis1)
 [(Finnis)] Finnis, Sinclair, Philosophical Magazine A, 50, 45 (1984).
 
 :link(Stukowski)

From fd3ecd04812090d6fd88e2220fa25b6a3f1b3962 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 26 Sep 2017 16:52:24 -0400
Subject: [PATCH 12/53] fix typo in formatting

---
 doc/src/fix_neb.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/src/fix_neb.txt b/doc/src/fix_neb.txt
index 52d8a7df84..73b3e31266 100644
--- a/doc/src/fix_neb.txt
+++ b/doc/src/fix_neb.txt
@@ -93,7 +93,7 @@ intermediate replica with the previous and the next image:
 
 Fnudge_parallel = {Kspring} * (|Ri+1 - Ri| - |Ri - Ri-1|) :pre
 
-Note that in this case the specified {Kspring) is in force/distance
+Note that in this case the specified {Kspring} is in force/distance
 units.
 
 With a value of {ideal}, the spring force is computed as suggested in
@@ -105,7 +105,7 @@ where RD is the "reaction coordinate" see "neb"_neb.html section, and
 RDideal is the ideal RD for which all the images are equally spaced.
 I.e. RDideal = (I-1)*meanDist when the climbing replica is off, where
 I is the replica number).  The meanDist is the average distance
-between replicas.  Note that in this case the specified {Kspring) is
+between replicas.  Note that in this case the specified {Kspring} is
 in force units.
 
 Note that the {ideal} form of nudging can often be more effective at

From bfdc4acb8bbe756c4911da4aa3de7f85627d9878 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 26 Sep 2017 16:53:36 -0400
Subject: [PATCH 13/53] add missing entry for pdf version of manual

---
 doc/src/lammps.book | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/src/lammps.book b/doc/src/lammps.book
index 86dfe78af3..b74ec49aed 100644
--- a/doc/src/lammps.book
+++ b/doc/src/lammps.book
@@ -187,6 +187,7 @@ fix_ipi.html
 fix_langevin.html
 fix_langevin_drude.html
 fix_langevin_eff.html
+fix_latte.html
 fix_lb_fluid.html
 fix_lb_momentum.html
 fix_lb_pc.html

From de45fa6e7107cd9587ea14ec96b53c9a2196fb39 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 26 Sep 2017 18:25:37 -0400
Subject: [PATCH 14/53] correct bogus links in LATTE docs

---
 doc/src/Section_packages.txt | 2 +-
 doc/src/fix_latte.txt        | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index 7539d99cd0..e08784bf6c 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -705,7 +705,7 @@ dynamics can be run with LAMMPS using density-functional tight-binding
 quantum forces calculated by LATTE.
 
 More information on LATTE can be found at this web site:
-"https://github.com/lanl/LATTE"_#latte_home.  A brief technical
+"https://github.com/lanl/LATTE"_latte_home.  A brief technical
 description is given with the "fix latte"_fix_latte.html command.
 
 :link(latte_home,https://github.com/lanl/LATTE)
diff --git a/doc/src/fix_latte.txt b/doc/src/fix_latte.txt
index 17b3335ad7..4edd610546 100644
--- a/doc/src/fix_latte.txt
+++ b/doc/src/fix_latte.txt
@@ -200,8 +200,8 @@ J. Sci. Comput. 36 (2), 147-170, (2014).
 :link(Niklasson2017)
 [(Niklasson2017)] A. M. N. Niklasson, J. Chem. Phys., 147, 054103 (2017).
 
-:link(Niklasson2012)
-[(Niklasson2017)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
+:link(Cawkwell2012)
+[(Cawkwell2012)] A. M. N. Niklasson, M. J. Cawkwell, Phys. Rev. B, 86
 (17), 174308 (2012).
 
 :link(Negre2016)

From 23e283f1355179168c2bcc2783d31ec3d6d67323 Mon Sep 17 00:00:00 2001
From: Anders Hafreager <andershaf@gmail.com>
Date: Wed, 27 Sep 2017 16:20:07 +0200
Subject: [PATCH 15/53] Fixed proper deletion of fixes if fix is NULL

---
 src/modify.cpp | 10 ++++++++--
 src/modify.h   |  1 +
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/modify.cpp b/src/modify.cpp
index 4516788aa9..b95d8868bf 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -110,7 +110,7 @@ Modify::~Modify()
   // delete all fixes
   // do it via delete_fix() so callbacks in Atom are also updated correctly
 
-  while (nfix) delete_fix(fix[0]->id);
+  while (nfix) delete_fix(0);
   memory->sfree(fix);
   memory->destroy(fmask);
 
@@ -944,7 +944,13 @@ void Modify::delete_fix(const char *id)
 {
   int ifix = find_fix(id);
   if (ifix < 0) error->all(FLERR,"Could not find fix ID to delete");
-  delete fix[ifix];
+  delete_fix(ifix);
+}
+
+void Modify::delete_fix(int ifix)
+{
+  if(fix[ifix])
+    delete fix[ifix];
   atom->update_callback(ifix);
 
   // move other Fixes and fmask down in list one slot
diff --git a/src/modify.h b/src/modify.h
index d825d5c4ef..4ec61f6d57 100644
--- a/src/modify.h
+++ b/src/modify.h
@@ -95,6 +95,7 @@ class Modify : protected Pointers {
   void add_fix(int, char **, int trysuffix=1);
   void modify_fix(int, char **);
   void delete_fix(const char *);
+  void delete_fix(int);
   int find_fix(const char *);
   int find_fix_by_style(const char *);
   int check_package(const char *);

From d898afaafb7ac183a8458e971b714ad2eeb79b02 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 29 Sep 2017 09:19:38 -0400
Subject: [PATCH 16/53] use <> for system includes not ""

---
 src/input.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/input.cpp b/src/input.cpp
index 7d11b8741b..23b89d3040 100644
--- a/src/input.cpp
+++ b/src/input.cpp
@@ -18,7 +18,7 @@
 #include <errno.h>
 #include <ctype.h>
 #include <unistd.h>
-#include "sys/stat.h"
+#include <sys/stat.h>
 #include "input.h"
 #include "style_command.h"
 #include "universe.h"

From d7aac2fed53cbdd13db313241c7b9a14563326a2 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 29 Sep 2017 13:26:02 -0600
Subject: [PATCH 17/53] Add sync/modify to nbin_kokkos

---
 src/KOKKOS/nbin_kokkos.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp
index b06d46d520..95ea105ad9 100644
--- a/src/KOKKOS/nbin_kokkos.cpp
+++ b/src/KOKKOS/nbin_kokkos.cpp
@@ -90,6 +90,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
 {
   last_bin = update->ntimestep;
 
+  k_bins.template sync<DeviceType>();
+  k_bincount.template sync<DeviceType>();
+  k_atom2bin.template sync<DeviceType>();
+
   h_resize() = 1;
 
   while(h_resize() > 0) {
@@ -119,6 +123,10 @@ void NBinKokkos<DeviceType>::bin_atoms()
       c_bins = bins;
     }
   }
+
+  k_bins.template modify<DeviceType>();
+  k_bincount.template modify<DeviceType>();
+  k_atom2bin.template modify<DeviceType>();
 }
 
 /* ---------------------------------------------------------------------- */

From 9f2740b7f18f29f30e30f6bc6db7bdb8a4a8173a Mon Sep 17 00:00:00 2001
From: Stan Moore <stanmoore1@gmail.com>
Date: Fri, 29 Sep 2017 13:41:35 -0600
Subject: [PATCH 18/53] Partially revert 01d0a5c, avoid atomics, safe because
 of the while loop. Worst case is the resize will happen again because max
 wasn't accurate

---
 src/KOKKOS/npair_kokkos.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index 2f9e6e0b43..fd89f5ef60 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -434,7 +434,7 @@ void NeighborKokkosExecute<DeviceType>::
   if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
   }
 
   neigh_list.d_ilist(i) = i;
@@ -644,7 +644,7 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
   if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
   }
   }
 }
@@ -767,7 +767,7 @@ void NeighborKokkosExecute<DeviceType>::
   if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+    if(n > new_maxneighs()) new_maxneighs() = n; // avoid atomics, safe because in while loop
   }
   neigh_list.d_ilist(i) = i;
 }

From a86572f4fcb47b817b173cd2e4d076b2af2aa897 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 29 Sep 2017 16:20:19 -0600
Subject: [PATCH 19/53] Reduce memory churn in Kokkos package

---
 src/KOKKOS/comm_kokkos.cpp  | 54 ++++++++++++++++++-------------------
 src/KOKKOS/comm_kokkos.h    |  1 +
 src/KOKKOS/npair_kokkos.cpp |  6 +++--
 3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index f5ed0f525f..ba44ea813f 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -46,7 +46,8 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
   if (sendlist) for (int i = 0; i < maxswap; i++) memory->destroy(sendlist[i]);
   memory->sfree(sendlist);
   sendlist = NULL;
-  k_sendlist = ArrayTypes<LMPDeviceType>::tdual_int_2d();
+  k_sendlist = DAT::tdual_int_2d();
+  k_total_send = DAT::tdual_int_scalar("comm::k_total_send");
 
   // error check for disallow of OpenMP threads?
 
@@ -57,12 +58,12 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
   memory->destroy(buf_recv);
   buf_recv = NULL;
 
-  k_exchange_sendlist = ArrayTypes<LMPDeviceType>::
+  k_exchange_sendlist = DAT::
     tdual_int_1d("comm:k_exchange_sendlist",100);
-  k_exchange_copylist = ArrayTypes<LMPDeviceType>::
+  k_exchange_copylist = DAT::
     tdual_int_1d("comm:k_exchange_copylist",100);
-  k_count = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_count",1);
-  k_sendflag = ArrayTypes<LMPDeviceType>::tdual_int_1d("comm:k_sendflag",100);
+  k_count = DAT::tdual_int_1d("comm:k_count",1);
+  k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);
 
   memory->destroy(maxsendlist);
   maxsendlist = NULL;
@@ -659,11 +660,11 @@ struct BuildBorderListFunctor {
   int iswap,maxsendlist;
   int nfirst,nlast,dim;
   typename AT::t_int_2d sendlist;
-  typename AT::t_int_1d nsend;
+  typename AT::t_int_scalar nsend;
 
   BuildBorderListFunctor(typename AT::tdual_x_array _x,
                          typename AT::tdual_int_2d _sendlist,
-                         typename AT::tdual_int_1d _nsend,int _nfirst,
+                         typename AT::tdual_int_scalar _nsend,int _nfirst,
                          int _nlast, int _dim,
                          X_FLOAT _lo, X_FLOAT _hi, int _iswap,
                          int _maxsendlist):
@@ -684,7 +685,7 @@ struct BuildBorderListFunctor {
     for (int i=teamstart + dev.team_rank(); i<teamend; i+=dev.team_size()) {
       if (x(i,dim) >= lo && x(i,dim) <= hi) mysend++;
     }
-    const int my_store_pos = dev.team_scan(mysend,&nsend(0));
+    const int my_store_pos = dev.team_scan(mysend,&nsend());
 
     if (my_store_pos+mysend < maxsendlist) {
     mysend = my_store_pos;
@@ -763,37 +764,34 @@ void CommKokkos::borders_device() {
       if (sendflag) {
         if (!bordergroup || ineed >= 2) {
           if (style == SINGLE) {
-            typename ArrayTypes<DeviceType>::tdual_int_1d total_send("TS",1);
-            total_send.h_view(0) = 0;
-            if(exec_space == Device) {
-              total_send.template modify<DeviceType>();
-              total_send.template sync<LMPDeviceType>();
-            }
+            k_total_send.h_view() = 0;
+            k_total_send.template modify<LMPHostType>();
+            k_total_send.template sync<LMPDeviceType>();
 
             BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
-                total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
+                k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
             Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
             Kokkos::parallel_for(config,f);
 
-            total_send.template modify<DeviceType>();
-            total_send.template sync<LMPHostType>();
+            k_total_send.template modify<DeviceType>();
+            k_total_send.template sync<LMPHostType>();
 
-            if(total_send.h_view(0) >= maxsendlist[iswap]) {
-              grow_list(iswap,total_send.h_view(0));
+            if(k_total_send.h_view() >= maxsendlist[iswap]) {
+              grow_list(iswap,k_total_send.h_view());
               k_sendlist.modify<DeviceType>();
-              total_send.h_view(0) = 0;
+              k_total_send.h_view() = 0;
               if(exec_space == Device) {
-                total_send.template modify<LMPHostType>();
-                total_send.template sync<LMPDeviceType>();
+                k_total_send.template modify<LMPHostType>();
+                k_total_send.template sync<LMPDeviceType>();
               }
               BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
-                  total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
+                  k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
               Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
               Kokkos::parallel_for(config,f);
-              total_send.template modify<DeviceType>();
-              total_send.template sync<LMPHostType>();
+              k_total_send.template modify<DeviceType>();
+              k_total_send.template sync<LMPHostType>();
             }
-            nsend = total_send.h_view(0);
+            nsend = k_total_send.h_view();
           } else {
             error->all(FLERR,"Required border comm not yet "
                        "implemented with Kokkos");
@@ -961,7 +959,7 @@ void CommKokkos::grow_send_kokkos(int n, int flag, ExecutionSpace space)
     buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
   }
   else {
-    k_buf_send = ArrayTypes<LMPDeviceType>::
+    k_buf_send = DAT::
       tdual_xfloat_2d("comm:k_buf_send",maxsend_border,atom->avec->size_border);
     buf_send = k_buf_send.view<LMPHostType>().ptr_on_device();
   }
@@ -975,7 +973,7 @@ void CommKokkos::grow_recv_kokkos(int n, ExecutionSpace space)
 {
   maxrecv = static_cast<int> (BUFFACTOR * n);
   int maxrecv_border = (maxrecv+BUFEXTRA+5)/atom->avec->size_border + 2;
-  k_buf_recv = ArrayTypes<LMPDeviceType>::
+  k_buf_recv = DAT::
     tdual_xfloat_2d("comm:k_buf_recv",maxrecv_border,atom->avec->size_border);
   buf_recv = k_buf_recv.view<LMPHostType>().ptr_on_device();
 }
diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h
index a8ae973124..4065efd000 100644
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@@ -53,6 +53,7 @@ class CommKokkos : public CommBrick {
 
  protected:
   DAT::tdual_int_2d k_sendlist;
+  DAT::tdual_int_scalar k_total_send;
   DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
   DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
   DAT::tdual_int_1d k_count;
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index b568bd5c93..727a81c87f 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -88,13 +88,15 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::copy_stencil_info()
 
   int maxstencil = ns->get_maxstencil();
 
-  k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
+  if (maxstencil > k_stencil.dimension_0())
+    k_stencil = DAT::tdual_int_1d("neighlist:stencil",maxstencil);
   for (int k = 0; k < maxstencil; k++)
     k_stencil.h_view(k) = ns->stencil[k];
     k_stencil.modify<LMPHostType>();
     k_stencil.sync<DeviceType>();
   if (GHOST) {
-    k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
+    if (maxstencil > k_stencilxyz.dimension_0())
+      k_stencilxyz = DAT::tdual_int_1d_3("neighlist:stencilxyz",maxstencil);
     for (int k = 0; k < maxstencil; k++) {
       k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
       k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];

From 4c71beb0240905f2e147eb4971520ac8fc1912c7 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sat, 30 Sep 2017 12:12:15 -0400
Subject: [PATCH 20/53] cleanup/simplification of compilation for fix phonon
 analysis tool "phana"

- include the used tricubic functions directly as static functions
- silence compiler warnings
- define f2c.h imported data types directly or use C equivalents
- since the direct LAPACK API was called and not cLAPACK, declare LAPACK interface and depend only on LAPACK
- add proper dependencies
- disable automatic minor version number generation. step version manually.
- comment out optional spglib functionality by default
---
 tools/phonon/Makefile        |  34 ++++++----
 tools/phonon/README          |  12 +---
 tools/phonon/disp.cpp        |   7 +-
 tools/phonon/dynmat.cpp      |  30 +++++----
 tools/phonon/dynmat.h        |   5 --
 tools/phonon/green.cpp       |   1 -
 tools/phonon/interpolate.cpp | 124 ++++++++++++++++++++++++++++++++++-
 tools/phonon/interpolate.h   |   7 +-
 tools/phonon/phonon.cpp      |  14 ++--
 tools/phonon/version.h       |   2 +-
 10 files changed, 182 insertions(+), 54 deletions(-)

diff --git a/tools/phonon/Makefile b/tools/phonon/Makefile
index 0aacb1e086..67f9b91fdf 100644
--- a/tools/phonon/Makefile
+++ b/tools/phonon/Makefile
@@ -1,7 +1,7 @@
 .SUFFIXES : .o .cpp
 # compiler and flags
-CC     = g++ -Wno-unused-result
-LINK   = $(CC) -static
+CC     = g++ -Wall
+LINK   = $(CC)
 CFLAGS = -O3 $(DEBUG) $(UFLAG)
 #
 OFLAGS = -O3 $(DEBUG)
@@ -9,18 +9,17 @@ INC    = $(LPKINC) $(TCINC) $(SPGINC)
 LIB    = $(LPKLIB) $(TCLIB) $(SPGLIB)
 #
 # cLapack library needed
-LPKINC = -I/opt/libs/clapack/3.2.1/include
-LPKLIB = -L/opt/libs/clapack/3.2.1/lib -lclapack -lblas -lf2c #-lm
+LPKINC = 
+LPKLIB =-llapack
 #
-# Tricubic library needed
-TCINC = -I/opt/libs/tricubic/1.0/include
-TCLIB = -L/opt/libs/tricubic/1.0/lib -ltricubic
 #
 # spglib 1.8.2, used to get the irreducible q-points
 # if UFLAG is not set, spglib won't be used.
-UFLAG  = -DUseSPG
-SPGINC = -I/opt/libs/spglib/1.8.2/include
-SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg
+
+# UFLAG  = -DUseSPG
+# SPGINC = -I/opt/libs/spglib/1.8.2/include
+# SPGLIB = -L/opt/libs/spglib/1.8.2/lib -lsymspg
+
 # if spglib other than version 1.8.2 is used, please 
 # modify file phonon.cpp, instruction can be found by searching 1.8.2
 
@@ -36,7 +35,7 @@ SRC = $(wildcard *.cpp)
 OBJ = $(SRC:.cpp=.o)
 
 #====================================================================
-all:  ver ${EXE}
+all:  ${EXE}
 
 ${EXE}: $(OBJ)
 	$(LINK) $(OFLAGS) $(OBJ) $(LIB) -o $@
@@ -59,3 +58,16 @@ ver:
 	$(CC) $(CFLAGS) -c $<
 .cpp.o:
 	$(CC) $(CFLAGS) $(INC) -c $<
+
+#====================================================================
+# dependencies
+disp.o: disp.cpp phonon.h dynmat.h memory.h interpolate.h green.h timer.h \
+ global.h
+dynmat.o: dynmat.cpp dynmat.h memory.h interpolate.h version.h global.h
+green.o: green.cpp green.h memory.h global.h
+interpolate.o: interpolate.cpp interpolate.h memory.h global.h
+main.o: main.cpp dynmat.h memory.h interpolate.h phonon.h
+memory.o: memory.cpp memory.h
+phonon.o: phonon.cpp phonon.h dynmat.h memory.h interpolate.h green.h \
+ timer.h global.h
+timer.o: timer.cpp timer.h
diff --git a/tools/phonon/README b/tools/phonon/README
index ae6383b6bd..b54d96d8a3 100644
--- a/tools/phonon/README
+++ b/tools/phonon/README
@@ -5,15 +5,9 @@
    analyse the phonon related information.
 #-------------------------------------------------------------------------------
 1. Dependencies
-   The clapack library is needed to solve the eigen problems,
-   which could be downloaded from:
-   http://www.netlib.org/clapack/
-   
-   The tricubic library is also needed to do tricubic interpolations,
-   which could be obtained from:
-      http://orca.princeton.edu/francois/software/tricubic/
-   or
-      http://1drv.ms/1J2WFYk
+   The LAPACK library is needed to solve the eigenvalue problems.
+   The reference implementation is available from http://www.netlib.org/lapack/
+   Intel MKL can be used as well.
    
    The spglib is optionally needed, enabling one to evaluate the
    phonon density of states or vibrational thermal properties
diff --git a/tools/phonon/disp.cpp b/tools/phonon/disp.cpp
index 2fa603916c..218e01e7fc 100644
--- a/tools/phonon/disp.cpp
+++ b/tools/phonon/disp.cpp
@@ -18,7 +18,8 @@ void Phonon::pdisp()
 {
   // ask the output file name and write the header.
   char str[MAXLINE];
-  for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n");
+  for (int ii = 0; ii < 80; ++ii) printf("=");
+  printf("\n");
 #ifdef UseSPG
   // ask method to generate q-lines
   int method = 2;
@@ -53,7 +54,6 @@ void Phonon::pdisp()
     while (1){
       for (int i = 0; i < 3; ++i) qstr[i] = qend[i];
   
-      int quit = 0;
       printf("\nPlease input the start q-point in unit of B1->B3, q to exit [%g %g %g]: ", qstr[0], qstr[1], qstr[2]);
       int n = count_words(fgets(str, MAXLINE, stdin));
       ptr = strtok(str, " \t\n\r\f");
@@ -2844,7 +2844,8 @@ void Phonon::pdisp()
     printf("\nPhonon dispersion data are written to: %s, you can visualize the results\n", fname);
     printf("by invoking: `gnuplot pdisp.gnuplot; gv pdisp.eps`\n");
   }
-  for (int ii = 0; ii < 80; ++ii) printf("="); printf("\n");
+  for (int ii = 0; ii < 80; ++ii) printf("=");
+  printf("\n");
 
   delete []fname;
   nodes.clear();
diff --git a/tools/phonon/dynmat.cpp b/tools/phonon/dynmat.cpp
index e82f473130..3b7bfe8268 100644
--- a/tools/phonon/dynmat.cpp
+++ b/tools/phonon/dynmat.cpp
@@ -3,6 +3,11 @@
 #include "version.h"
 #include "global.h"
 
+extern "C" void zheevd_(char *, char *, long int *, doublecomplex *,    // jobz, uplo, n, a
+                       long int *, double *, doublecomplex *,           // lda, w, work
+                       long int *, double *, long int *, long int *,    // lwork, rwork, lrwork, iwork
+                       long int *, long int *);                         // liwork, info -- NOTE(review): confirm the linked LAPACK's INTEGER is as wide as long int
+
 // to initialize the class
 DynMat::DynMat(int narg, char **arg)
 {
@@ -81,7 +86,8 @@ DynMat::DynMat(int narg, char **arg)
   printf("Number of atoms per unit cell     : %d\n", nucell);
   printf("System dimension                  : %d\n", sysdim);
   printf("Boltzmann constant in used units  : %g\n", boltz);
-  for (int i = 0; i < 80; ++i) printf("="); printf("\n");
+  for (int i = 0; i < 80; ++i) printf("=");
+  printf("\n");
   if (sysdim < 1||sysdim > 3||nx < 1||ny < 1||nz < 1||nucell < 1){
     printf("Wrong values read from header of file: %s, please check the binary file!\n", binfile);
     fclose(fp); exit(3);
@@ -117,11 +123,11 @@ DynMat::DynMat(int narg, char **arg)
   memory->create(attyp, nucell,         "DynMat:attyp");
   memory->create(M_inv_sqrt, nucell,    "DynMat:M_inv_sqrt");
   
-  if ( fread(&Tmeasure,      sizeof(double), 1,      fp) != 1     ){printf("\nError while reading temperature from file: %s\n",   binfile); fclose(fp); exit(3);}
-  if ( fread(&basevec[0],    sizeof(double), 9,      fp) != 9     ){printf("\nError while reading lattice info from file: %s\n",  binfile); fclose(fp); exit(3);}
-  if ( fread(basis[0],       sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n",    binfile); fclose(fp); exit(3);}
-  if ( fread(&attyp[0],      sizeof(int),    nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n",    binfile); fclose(fp); exit(3);}
-  if ( fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&Tmeasure,      sizeof(double), 1,      fp) != 1     ){printf("\nError while reading temperature from file: %s\n",   binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&basevec[0],    sizeof(double), 9,      fp) != 9     ){printf("\nError while reading lattice info from file: %s\n",  binfile); fclose(fp); exit(3);}
+  if ( (int) fread(basis[0],       sizeof(double), fftdim, fp) != fftdim){printf("\nError while reading basis info from file: %s\n",    binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&attyp[0],      sizeof(int),    nucell, fp) != nucell){printf("\nError while reading atom types from file: %s\n",    binfile); fclose(fp); exit(3);}
+  if ( (int) fread(&M_inv_sqrt[0], sizeof(double), nucell, fp) != nucell){printf("\nError while reading atomic masses from file: %s\n", binfile); fclose(fp); exit(3);}
   fclose(fp);
 
   car2dir();
@@ -229,9 +235,9 @@ return;
 int DynMat::geteigen(double *egv, int flag)
 {
   char jobz, uplo;
-  integer n, lda, lwork, lrwork, *iwork, liwork, info;
+  long int n, lda, lwork, lrwork, *iwork, liwork, info;
   doublecomplex *work;
-  doublereal *w = &egv[0], *rwork;
+  double *w = &egv[0], *rwork;
 
   n     = fftdim;
   if (flag) jobz = 'V';
@@ -338,7 +344,8 @@ void DynMat::EnforceASR()
   char *ptr = strtok(str," \t\n\r\f");
   if (ptr) nasr = atoi(ptr);
   if (nasr < 1){
-    for (int i=0; i<80; i++) printf("="); printf("\n");
+    for (int i=0; i<80; i++) printf("=");
+    printf("\n");
     return;
   }
 
@@ -404,7 +411,8 @@ void DynMat::EnforceASR()
     if (i == 99){ printf("...... (%d more skiped)", fftdim-100); break;}
   }
   printf("\n");
-  for (int i = 0; i < 80; ++i) printf("="); printf("\n\n");
+  for (int i = 0; i < 80; ++i) printf("=");
+  printf("\n\n");
 
 return;
 }
@@ -456,7 +464,7 @@ return;
  * --------------------------------------------------------------------*/
 void DynMat::GaussJordan(int n, double *Mat)
 {
-  int i,icol,irow,j,k,l,ll,idr,idc;
+  int i,icol=0,irow=0,j,k,l,ll,idr,idc;
   int *indxc,*indxr,*ipiv;
   double big, nmjk;
   double dum, pivinv;
diff --git a/tools/phonon/dynmat.h b/tools/phonon/dynmat.h
index 1d6e716584..f5bd4010b8 100644
--- a/tools/phonon/dynmat.h
+++ b/tools/phonon/dynmat.h
@@ -7,11 +7,6 @@
 #include "memory.h"
 #include "interpolate.h"
 
-extern "C"{
-#include "f2c.h"
-#include "clapack.h"
-}
-
 using namespace std;
 
 class DynMat {
diff --git a/tools/phonon/green.cpp b/tools/phonon/green.cpp
index 8f8946dc4f..35514c03fb 100644
--- a/tools/phonon/green.cpp
+++ b/tools/phonon/green.cpp
@@ -224,7 +224,6 @@ void Green::recursion()
 {
   // local variables
   std::complex<double> Z, rec_x, rec_x_inv;
-  std::complex<double> cunit = std::complex<double>(0.,1.);
 
   double w = wmin;
 
diff --git a/tools/phonon/interpolate.cpp b/tools/phonon/interpolate.cpp
index 8c0cbde1ce..954062d415 100644
--- a/tools/phonon/interpolate.cpp
+++ b/tools/phonon/interpolate.cpp
@@ -1,7 +1,125 @@
 #include "interpolate.h"
-#include "math.h"
+#include <math.h>
 #include "global.h"
 
+///////////////////////
+// tricubic interpolation code (coefficient matrix A and helper routines), formerly the external tricubic library
+static int A[64][64] = {
+{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 9,-9,-9, 9, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 6,-6, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 6,-6, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 4,-4,-4, 4, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3,-6,-3, 0, 0, 0, 0, 6,-6, 3,-3, 0, 0, 0, 0, 4, 2, 2, 1, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 3, 3, 0, 0, 0, 0,-4, 4,-2, 2, 0, 0, 0, 0,-2,-2,-1,-1, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 4, 2, 0, 0, 0, 0,-3, 3,-3, 3, 0, 0, 0, 0,-2,-1,-2,-1, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2,-2,-2, 0, 0, 0, 0, 2,-2, 2,-2, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0},
+{-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 9,-9, 0, 0,-9, 9, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 0, 0, 6,-6, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0, 0, 0,-1, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9,-9, 0, 0,-9, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 3, 0, 0,-6,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6,-6, 0, 0, 3,-3, 0, 0, 4, 2, 0, 0, 2, 1, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3,-3, 0, 0, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 4, 0, 0,-2, 2, 0, 0,-2,-2, 0, 0,-1,-1, 0, 0},
+{ 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 9, 0,-9, 0,-9, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 3, 0,-6, 0,-3, 0, 6, 0,-6, 0, 3, 0,-3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0, 2, 0, 2, 0, 1, 0},
+{-27,27,27,-27,27,-27,-27,27,-18,-9,18, 9,18, 9,-18,-9,-18,18,-9, 9,18,-18, 9,-9,-18,18,18,-18,-9, 9, 9,-9,-12,-6,-6,-3,12, 6, 6, 3,-12,-6,12, 6,-6,-3, 6, 3,-12,12,-6, 6,-6, 6,-3, 3,-8,-4,-4,-2,-4,-2,-2,-1},
+{18,-18,-18,18,-18,18,18,-18, 9, 9,-9,-9,-9,-9, 9, 9,12,-12, 6,-6,-12,12,-6, 6,12,-12,-12,12, 6,-6,-6, 6, 6, 6, 3, 3,-6,-6,-3,-3, 6, 6,-6,-6, 3, 3,-3,-3, 8,-8, 4,-4, 4,-4, 2,-2, 4, 4, 2, 2, 2, 2, 1, 1},
+{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 0,-3, 0, 3, 0, 3, 0,-4, 0, 4, 0,-2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-2, 0,-1, 0,-1, 0},
+{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6, 9,-9, 9,-9,-9, 9,-9, 9,12,-12,-12,12, 6,-6,-6, 6, 6, 3, 6, 3,-6,-3,-6,-3, 8, 4,-8,-4, 4, 2,-4,-2, 6,-6, 6,-6, 3,-3, 3,-3, 4, 2, 4, 2, 2, 1, 2, 1},
+{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-6, 6,-6, 6, 6,-6, 6,-6,-8, 8, 8,-8,-4, 4, 4,-4,-3,-3,-3,-3, 3, 3, 3, 3,-4,-4, 4, 4,-2,-2, 2, 2,-4, 4,-4, 4,-2, 2,-2, 2,-2,-2,-2,-2,-1,-1,-1,-1},
+{ 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{-6, 6, 0, 0, 6,-6, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 4,-4, 0, 0,-4, 4, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-6, 6, 0, 0, 6,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4,-2, 0, 0, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-3, 3, 0, 0,-3, 3, 0, 0,-2,-1, 0, 0,-2,-1, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4,-4, 0, 0,-4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0,-2,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,-2, 0, 0, 2,-2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0},
+{-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0,-6, 0, 6, 0, 6, 0,-6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-4, 0,-2, 0, 4, 0, 2, 0,-3, 0, 3, 0,-3, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0,-2, 0,-1, 0,-2, 0,-1, 0},
+{18,-18,-18,18,-18,18,18,-18,12, 6,-12,-6,-12,-6,12, 6,12,-12, 6,-6,-12,12,-6, 6, 9,-9,-9, 9, 9,-9,-9, 9, 8, 4, 4, 2,-8,-4,-4,-2, 6, 3,-6,-3, 6, 3,-6,-3, 6,-6, 3,-3, 6,-6, 3,-3, 4, 2, 2, 1, 4, 2, 2, 1},
+{-12,12,12,-12,12,-12,-12,12,-6,-6, 6, 6, 6, 6,-6,-6,-8, 8,-4, 4, 8,-8, 4,-4,-6, 6, 6,-6,-6, 6, 6,-6,-4,-4,-2,-2, 4, 4, 2, 2,-3,-3, 3, 3,-3,-3, 3, 3,-4, 4,-2, 2,-4, 4,-2, 2,-2,-2,-1,-1,-2,-2,-1,-1},
+{ 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+{ 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,-4, 0,-4, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0,-2, 0,-2, 0, 2, 0,-2, 0, 2, 0,-2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+{-12,12,12,-12,12,-12,-12,12,-8,-4, 8, 4, 8, 4,-8,-4,-6, 6,-6, 6, 6,-6, 6,-6,-6, 6, 6,-6,-6, 6, 6,-6,-4,-2,-4,-2, 4, 2, 4, 2,-4,-2, 4, 2,-4,-2, 4, 2,-3, 3,-3, 3,-3, 3,-3, 3,-2,-1,-2,-1,-2,-1,-2,-1},
+{ 8,-8,-8, 8,-8, 8, 8,-8, 4, 4,-4,-4,-4,-4, 4, 4, 4,-4, 4,-4,-4, 4,-4, 4, 4,-4,-4, 4, 4,-4,-4, 4, 2, 2, 2, 2,-2,-2,-2,-2, 2, 2,-2,-2, 2, 2,-2,-2, 2,-2, 2,-2, 2,-2, 2,-2, 1, 1, 1, 1, 1, 1, 1, 1}};
+
+static int ijk2n(int i, int j, int k) {
+  return i + 4*j + 16*k;  // flat index into a 4x4x4 table; i varies fastest
+}
+
+/* ---------------------------------------------------------------------------- */
+
+static void tricubic_get_coeff_stacked(double a[64], double x[64]) {
+  // a = A*x: multiply the 64-entry stacked input vector x by the constant
+  // integer matrix A to obtain the 64 coefficients a[].
+  for (int row = 0; row < 64; ++row) {
+    const int *weights = A[row];
+    a[row] = 0.0;
+    for (int col = 0; col < 64; ++col) a[row] += weights[col]*x[col];
+  }
+}
+
+static void tricubic_get_coeff(double a[64], double f[8], double dfdx[8], double dfdy[8], double dfdz[8], double d2fdxdy[8], double d2fdxdz[8], double d2fdydz[8], double d3fdxdydz[8]) {
+  // Stack the eight 8-element input arrays back to back into a single
+  // 64-element vector, in the order f, dfdx, dfdy, dfdz, d2fdxdy,
+  // d2fdxdz, d2fdydz, d3fdxdydz, and convert that vector into the
+  // 64 coefficients a[] via tricubic_get_coeff_stacked().
+  const double *blocks[8] = {f, dfdx, dfdy, dfdz,
+                             d2fdxdy, d2fdxdz, d2fdydz, d3fdxdydz};
+  double stacked[64];
+  for (int b = 0; b < 8; ++b) {
+    const double *src = blocks[b];
+    double *dst = stacked + 8*b;
+    for (int n = 0; n < 8; ++n) dst[n] = src[n];
+  }
+  tricubic_get_coeff_stacked(a,stacked);
+}
+
+static double tricubic_eval(double a[64], double x, double y, double z) {
+  /* Evaluate the tricubic polynomial
+       f(x,y,z) = sum_{i,j,k=0..3} a[ijk2n(i,j,k)] * x^i * y^j * z^k
+     at one point and return its value (no partial derivatives).
+     The 12 powers are tabulated once up front instead of calling pow()
+     three times for each of the 64 terms; the summation is unchanged. */
+  double xp[4], yp[4], zp[4], ret = 0.0;
+  for (int n = 0; n < 4; n++) {
+    xp[n] = pow(x,n);
+    yp[n] = pow(y,n);
+    zp[n] = pow(z,n);
+  }
+  for (int i = 0; i < 4; i++)
+    for (int j = 0; j < 4; j++)
+      for (int k = 0; k < 4; k++) ret += a[ijk2n(i,j,k)]*xp[i]*yp[j]*zp[k];
+  return ret;
+}
+
 /* ----------------------------------------------------------------------------
  * Constructor used to get info from caller, and prepare other necessary data
  * ---------------------------------------------------------------------------- */
@@ -274,7 +392,8 @@ void Interpolate::set_method()
 
   which =2-im%2;
   printf("Your  selection: %d\n", which);
-  for(int i=0; i<80; i++) printf("="); printf("\n\n");
+  for(int i=0; i<80; i++) printf("=");
+  printf("\n\n");
 
   if (which == 1) tricubic_init();
 
@@ -306,4 +425,3 @@ void Interpolate::reset_gamma()
 
 return;
 }
-/* ---------------------------------------------------------------------------- */
diff --git a/tools/phonon/interpolate.h b/tools/phonon/interpolate.h
index e192fcac87..04a358ae71 100644
--- a/tools/phonon/interpolate.h
+++ b/tools/phonon/interpolate.h
@@ -5,11 +5,8 @@
 #include "stdlib.h"
 #include "string.h"
 #include "memory.h"
-#include <tricubic.h>
-extern "C"{
-#include "f2c.h"
-#include "clapack.h"
-}
+
+extern "C" typedef struct { double r, i; } doublecomplex;  // complex double handed to LAPACK (zheevd_); presumably r = real part, i = imaginary part
 
 using namespace std;
 
diff --git a/tools/phonon/phonon.cpp b/tools/phonon/phonon.cpp
index 43bea111b4..065885cf3f 100644
--- a/tools/phonon/phonon.cpp
+++ b/tools/phonon/phonon.cpp
@@ -42,7 +42,8 @@ Phonon::Phonon(DynMat *dm)
     printf("\n");
     for (int i = 0; i < 37; ++i) printf("=");
     printf(" Menu ");
-    for (int i = 0; i < 37; ++i) printf("="); printf("\n");
+    for (int i = 0; i < 37; ++i) printf("=");
+    printf("\n");
     printf("  1. Phonon DOS evaluation;\n");
     printf("  2. Phonon dispersion curves;\n");
     printf("  3. Dynamical matrix at arbitrary q;\n");
@@ -60,7 +61,8 @@ Phonon::Phonon(DynMat *dm)
     printf("Your choice [0]: ");
     if (count_words(fgets(str,MAXLINE,stdin)) > 0) job = atoi(strtok(str," \t\n\r\f"));
     printf("\nYour  selection: %d\n", job);
-    for (int i = 0; i < 80; ++i) printf("=");printf("\n\n");
+    for (int i = 0; i < 80; ++i) printf("=");
+    printf("\n\n");
 
     // now to do the job according to user's choice
     if      (job == 1) pdos();
@@ -414,7 +416,8 @@ void Phonon::vfanyq()
     dynmat->geteigen(egvs, 0);
     printf("q-point: [%lg %lg %lg], ", q[0], q[1], q[2]);
     printf("vibrational frequencies at this q-point:\n");
-    for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]); printf("\n\n");
+    for (int i = 0; i < ndim; ++i) printf("%lg ", egvs[i]);
+    printf("\n\n");
   }
 
 return;
@@ -1001,7 +1004,8 @@ void Phonon::ShowCell()
   printf("\n");
   for (int i = 0; i < 30; ++i) printf("=");
   printf("   Unit Cell Info   ");
-  for (int i = 0; i < 30; ++i) printf("="); printf("\n");
+  for (int i = 0; i < 30; ++i) printf("=");
+  printf("\n");
   printf("Number of atoms in the unit cell: %d\n", dynmat->nucell);
   printf("Basis  vectors  of the unit cell:\n");
   printf("  %15.8f  %15.8f  %15.8f\n", dynmat->basevec[0],  dynmat->basevec[1],  dynmat->basevec[2]);
@@ -1091,7 +1095,7 @@ int Phonon::count_words(const char *line)
   strcpy(copy,line);
 
   char *ptr;
-  if (ptr = strchr(copy,'#')) *ptr = '\0';
+  if ((ptr = strchr(copy,'#'))) *ptr = '\0';
 
   if (strtok(copy," \t\n\r\f") == NULL) {
     memory->destroy(copy);
diff --git a/tools/phonon/version.h b/tools/phonon/version.h
index 8ed0e80aa7..decab631b0 100644
--- a/tools/phonon/version.h
+++ b/tools/phonon/version.h
@@ -1 +1 @@
-#define VERSION 7
+#define VERSION 8

From 091d0580904a43810c8c641ec91e1c82a0deb3bd Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Sat, 30 Sep 2017 17:44:15 -0400
Subject: [PATCH 21/53] Fix typo

---
 cmake/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 666b77ae3d..48557a43f3 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -76,7 +76,7 @@ add_definitions(-DLAMMPS_MEMALIGN=${LAMMPS_MEMALIGN})
 option(LAMMPS_EXCEPTIONS "enable the use of C++ exceptions for error messages (useful for library interface)" OFF)
 if(LAMMPS_EXCEPTIONS)
   add_definitions(-DLAMMPS_EXCEPTIONS)
-  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES -DLAMMPS_EXCEPTIONS")
+  set(LAMMPS_API_DEFINES "${LAMMPS_API_DEFINES} -DLAMMPS_EXCEPTIONS")
 endif()
 
 set(LAMMPS_MACHINE "" CACHE STRING "Suffix to append to lmp binary and liblammps (WON'T enable any features automatically")

From 6e342d2e45dcfd864b011578caf97f0b5be2443c Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:03:52 +0200
Subject: [PATCH 22/53] MAINT: bigint (int64) maps onto either long or long
 long, depending on platform. Automatically choose the correct one.

---
 src/USER-NETCDF/dump_netcdf.cpp       | 39 +++++++++++++++++++--------
 src/USER-NETCDF/dump_netcdf_mpiio.cpp | 35 +++++++++++++++++++-----
 2 files changed, 57 insertions(+), 17 deletions(-)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index 971f69f7cc..3193f3c365 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -607,6 +607,32 @@ void DumpNetCDF::closefile()
 
 /* ---------------------------------------------------------------------- */
 
+template <typename T>
+int nc_put_var1_x(int ncid, int varid, const size_t index[], const T* tp)
+{
+  return nc_put_var1_double(ncid, varid, index, tp);
+}
+
+template <>
+int nc_put_var1_x<int>(int ncid, int varid, const size_t index[], const int* tp)
+{
+  return nc_put_var1_int(ncid, varid, index, tp);
+}
+
+template <>
+int nc_put_var1_x<long>(int ncid, int varid, const size_t index[],
+                        const long* tp)
+{
+  return nc_put_var1_long(ncid, varid, index, tp);
+}
+
+template <>
+int nc_put_var1_x<long long>(int ncid, int varid, const size_t index[],
+                             const long long* tp)
+{
+  return nc_put_var1_longlong(ncid, varid, index, tp);
+}
+
 void DumpNetCDF::write()
 {
   // open file
@@ -638,13 +664,8 @@ void DumpNetCDF::write()
                   th->keyword[i] );
         }
         else if (th->vtype[i] == BIGINT) {
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-          NCERRX( nc_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( nc_put_var1_x(ncid, thermovar[i], start, &th->bivalue),
                   th->keyword[i] );
-#else
-          NCERRX( nc_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
-                  th->keyword[i] );
-#endif
         }
       }
     }
@@ -930,11 +951,7 @@ void DumpNetCDF::write_prmtop()
 
   fprintf(f, "%%FLAG POINTERS\n");
   fprintf(f, "%%FORMAT(10I8)\n");
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-  fprintf(f, "%8li", ntotalgr);
-#else
-  fprintf(f, "%8i", ntotalgr);
-#endif
+  fprintf(f, BIGINT_FORMAT, ntotalgr);
   for (int i = 0; i < 11; i++)
     fprintf(f, "%8i", 0);
   fprintf(f, "\n");
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
index 3b753b1b04..656da1b6df 100644
--- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -583,6 +583,34 @@ void DumpNetCDFMPIIO::closefile()
 
 /* ---------------------------------------------------------------------- */
 
+template <typename T>
+int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[],
+                     const T* tp)
+{
+  return ncmpi_put_var1_double(ncid, varid, index, tp);
+}
+
+template <>
+int ncmpi_put_var1_x<int>(int ncid, int varid, const MPI_Offset index[],
+                          const int* tp)
+{
+  return ncmpi_put_var1_int(ncid, varid, index, tp);
+}
+
+template <>
+int ncmpi_put_var1_x<long>(int ncid, int varid, const MPI_Offset index[],
+                           const long* tp)
+{
+  return ncmpi_put_var1_long(ncid, varid, index, tp);
+}
+
+template <>
+int ncmpi_put_var1_x<long long>(int ncid, int varid, const MPI_Offset index[],
+                                const long long* tp)
+{
+  return ncmpi_put_var1_longlong(ncid, varid, index, tp);
+}
+
 void DumpNetCDFMPIIO::write()
 {
   // open file
@@ -616,13 +644,8 @@ void DumpNetCDFMPIIO::write()
                   th->keyword[i] );
         }
         else if (th->vtype[i] == BIGINT) {
-#if defined(LAMMPS_SMALLBIG) || defined(LAMMPS_BIGBIG)
-          NCERRX( ncmpi_put_var1_long(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( ncmpi_put_var1_x(ncid, thermovar[i], start, &th->bivalue),
                   th->keyword[i] );
-#else
-          NCERRX( ncmpi_put_var1_int(ncid, thermovar[i], start, &th->bivalue),
-                  th->keyword[i] );
-#endif
         }
       }
     }

From 84378f8ae27fe4c181e0f676910fea3fbd9df940 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:05:12 +0200
Subject: [PATCH 23/53] MAINT: Renamed _put_var1_x to _put_var1_bigint

---
 src/USER-NETCDF/dump_netcdf.cpp       | 14 ++++----------
 src/USER-NETCDF/dump_netcdf_mpiio.cpp | 15 ++++-----------
 2 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index 3193f3c365..aa2a4700a3 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -608,26 +608,20 @@ void DumpNetCDF::closefile()
 /* ---------------------------------------------------------------------- */
 
 template <typename T>
-int nc_put_var1_x(int ncid, int varid, const size_t index[], const T* tp)
-{
-  return nc_put_var1_double(ncid, varid, index, tp);
-}
-
-template <>
-int nc_put_var1_x<int>(int ncid, int varid, const size_t index[], const int* tp)
+int nc_put_var1_bigint(int ncid, int varid, const size_t index[], const T* tp)
 {
   return nc_put_var1_int(ncid, varid, index, tp);
 }
 
 template <>
-int nc_put_var1_x<long>(int ncid, int varid, const size_t index[],
+int nc_put_var1_bigint<long>(int ncid, int varid, const size_t index[],
                         const long* tp)
 {
   return nc_put_var1_long(ncid, varid, index, tp);
 }
 
 template <>
-int nc_put_var1_x<long long>(int ncid, int varid, const size_t index[],
+int nc_put_var1_bigint<long long>(int ncid, int varid, const size_t index[],
                              const long long* tp)
 {
   return nc_put_var1_longlong(ncid, varid, index, tp);
@@ -664,7 +658,7 @@ void DumpNetCDF::write()
                   th->keyword[i] );
         }
         else if (th->vtype[i] == BIGINT) {
-          NCERRX( nc_put_var1_x(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( nc_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
                   th->keyword[i] );
         }
       }
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
index 656da1b6df..e054772c41 100644
--- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -584,28 +584,21 @@ void DumpNetCDFMPIIO::closefile()
 /* ---------------------------------------------------------------------- */
 
 template <typename T>
-int ncmpi_put_var1_x(int ncid, int varid, const MPI_Offset index[],
+int ncmpi_put_var1_bigint(int ncid, int varid, const MPI_Offset index[],
                      const T* tp)
-{
-  return ncmpi_put_var1_double(ncid, varid, index, tp);
-}
-
-template <>
-int ncmpi_put_var1_x<int>(int ncid, int varid, const MPI_Offset index[],
-                          const int* tp)
 {
   return ncmpi_put_var1_int(ncid, varid, index, tp);
 }
 
 template <>
-int ncmpi_put_var1_x<long>(int ncid, int varid, const MPI_Offset index[],
+int ncmpi_put_var1_bigint<long>(int ncid, int varid, const MPI_Offset index[],
                            const long* tp)
 {
   return ncmpi_put_var1_long(ncid, varid, index, tp);
 }
 
 template <>
-int ncmpi_put_var1_x<long long>(int ncid, int varid, const MPI_Offset index[],
+int ncmpi_put_var1_bigint<long long>(int ncid, int varid, const MPI_Offset index[],
                                 const long long* tp)
 {
   return ncmpi_put_var1_longlong(ncid, varid, index, tp);
@@ -644,7 +637,7 @@ void DumpNetCDFMPIIO::write()
                   th->keyword[i] );
         }
         else if (th->vtype[i] == BIGINT) {
-          NCERRX( ncmpi_put_var1_x(ncid, thermovar[i], start, &th->bivalue),
+          NCERRX( ncmpi_put_var1_bigint(ncid, thermovar[i], start, &th->bivalue),
                   th->keyword[i] );
         }
       }

From 100231bba8f19b4deb3c9e73911bc7bcaca8fbd8 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:21:09 +0200
Subject: [PATCH 24/53] ENH: Enable multi file writes.

---
 src/USER-NETCDF/dump_netcdf.cpp       | 35 ++++++++++++++++++++-----
 src/USER-NETCDF/dump_netcdf_mpiio.cpp | 37 +++++++++++++++++++++------
 2 files changed, 57 insertions(+), 15 deletions(-)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index aa2a4700a3..85a2d40935 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -88,8 +88,8 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
 
   if (multiproc)
     error->all(FLERR,"Multi-processor writes are not supported.");
-  if (multifile)
-    error->all(FLERR,"Multiple files are not supported.");
+  if (append_flag && multifile)
+    error->all(FLERR,"Cannot append when writing to multiple files.");
 
   perat = new nc_perat_t[nfield];
 
@@ -224,6 +224,24 @@ DumpNetCDF::~DumpNetCDF()
 
 void DumpNetCDF::openfile()
 {
+  char *filecurrent = filename;
+  if (multifile && !singlefile_opened) {
+    char *filestar = filecurrent;
+    filecurrent = new char[strlen(filestar) + 16];
+    char *ptr = strchr(filestar,'*');
+    *ptr = '\0';
+    if (padflag == 0)
+      sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
+              filestar,update->ntimestep,ptr+1);
+    else {
+      char bif[8],pad[16];
+      strcpy(bif,BIGINT_FORMAT);
+      sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]);
+      sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1);
+    }
+    *ptr = '*';
+  }
+
   if (thermo && !singlefile_opened) {
     if (thermovar)  delete [] thermovar;
     thermovar = new int[output->thermo->nfield];
@@ -268,14 +286,14 @@ void DumpNetCDF::openfile()
   ntotalgr = group->count(igroup);
 
   if (filewriter) {
-    if (append_flag && access(filename, F_OK) != -1) {
+    if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
       // Fixme! Perform checks if dimensions and variables conform with
       // data structure standard.
 
       if (singlefile_opened) return;
       singlefile_opened = 1;
 
-      NCERRX( nc_open(filename, NC_WRITE, &ncid), filename );
+      NCERRX( nc_open(filecurrent, NC_WRITE, &ncid), filecurrent );
 
       // dimensions
       NCERRX( nc_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
@@ -348,8 +366,8 @@ void DumpNetCDF::openfile()
       if (singlefile_opened) return;
       singlefile_opened = 1;
 
-      NCERRX( nc_create(filename, NC_64BIT_DATA, &ncid),
-          filename );
+      NCERRX( nc_create(filecurrent, NC_64BIT_DATA, &ncid),
+              filecurrent );
 
       // dimensions
       NCERRX( nc_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
@@ -601,7 +619,10 @@ void DumpNetCDF::closefile()
     // append next time DumpNetCDF::openfile is called
     append_flag = 1;
     // write to next frame upon next open
-    framei++;
+    if (multifile)
+      framei = 1;
+    else
+      framei++;
   }
 }
 
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
index e054772c41..271f963a4e 100644
--- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -88,8 +88,8 @@ DumpNetCDFMPIIO::DumpNetCDFMPIIO(LAMMPS *lmp, int narg, char **arg) :
 
   if (multiproc)
     error->all(FLERR,"Multi-processor writes are not supported.");
-  if (multifile)
-    error->all(FLERR,"Multiple files are not supported.");
+  if (append_flag && multifile)
+    error->all(FLERR,"Cannot append when writing to multiple files.");
 
   perat = new nc_perat_t[nfield];
 
@@ -217,6 +217,24 @@ DumpNetCDFMPIIO::~DumpNetCDFMPIIO()
 
 void DumpNetCDFMPIIO::openfile()
 {
+  char *filecurrent = filename;
+  if (multifile && !singlefile_opened) {
+    char *filestar = filecurrent;
+    filecurrent = new char[strlen(filestar) + 16];
+    char *ptr = strchr(filestar,'*');
+    *ptr = '\0';
+    if (padflag == 0)
+      sprintf(filecurrent,"%s" BIGINT_FORMAT "%s",
+              filestar,update->ntimestep,ptr+1);
+    else {
+      char bif[8],pad[16];
+      strcpy(bif,BIGINT_FORMAT);
+      sprintf(pad,"%%s%%0%d%s%%s",padflag,&bif[1]);
+      sprintf(filecurrent,pad,filestar,update->ntimestep,ptr+1);
+    }
+    *ptr = '*';
+  }
+
   if (thermo && !singlefile_opened) {
     if (thermovar)  delete [] thermovar;
     thermovar = new int[output->thermo->nfield];
@@ -260,7 +278,7 @@ void DumpNetCDFMPIIO::openfile()
   // get total number of atoms
   ntotalgr = group->count(igroup);
 
-  if (append_flag && access(filename, F_OK) != -1) {
+  if (append_flag && !multifile && access(filecurrent, F_OK) != -1) {
     // Fixme! Perform checks if dimensions and variables conform with
     // data structure standard.
 
@@ -270,8 +288,8 @@ void DumpNetCDFMPIIO::openfile()
     if (singlefile_opened) return;
     singlefile_opened = 1;
 
-    NCERRX( ncmpi_open(MPI_COMM_WORLD, filename, NC_WRITE, MPI_INFO_NULL,
-                       &ncid), filename );
+    NCERRX( ncmpi_open(MPI_COMM_WORLD, filecurrent, NC_WRITE, MPI_INFO_NULL,
+                       &ncid), filecurrent );
 
     // dimensions
     NCERRX( ncmpi_inq_dimid(ncid, NC_FRAME_STR, &frame_dim), NC_FRAME_STR );
@@ -344,8 +362,8 @@ void DumpNetCDFMPIIO::openfile()
     if (singlefile_opened) return;
     singlefile_opened = 1;
 
-    NCERRX( ncmpi_create(MPI_COMM_WORLD, filename, NC_64BIT_DATA,
-                         MPI_INFO_NULL, &ncid), filename );
+    NCERRX( ncmpi_create(MPI_COMM_WORLD, filecurrent, NC_64BIT_DATA,
+                         MPI_INFO_NULL, &ncid), filecurrent );
 
     // dimensions
     NCERRX( ncmpi_def_dim(ncid, NC_FRAME_STR, NC_UNLIMITED, &frame_dim),
@@ -577,7 +595,10 @@ void DumpNetCDFMPIIO::closefile()
     // append next time DumpNetCDFMPIIO::openfile is called
     append_flag = 1;
     // write to next frame upon next open
-    framei++;
+    if (multifile)
+      framei = 1;
+    else
+      framei++;
   }
 }
 

From 56d21bfb057b18c3f0056545cc6c2c52d8175c71 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:22:04 +0200
Subject: [PATCH 25/53] MAINT: Removed obsolete prmtop writer.

---
 src/USER-NETCDF/dump_netcdf.cpp | 58 ---------------------------------
 src/USER-NETCDF/dump_netcdf.h   |  1 -
 2 files changed, 59 deletions(-)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index 85a2d40935..be274b2052 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -947,64 +947,6 @@ int DumpNetCDF::modify_param(int narg, char **arg)
 
 /* ---------------------------------------------------------------------- */
 
-void DumpNetCDF::write_prmtop()
-{
-  char fn[1024];
-  char tmp[81];
-  FILE *f;
-
-  strcpy(fn, filename);
-  strcat(fn, ".prmtop");
-
-  f = fopen(fn, "w");
-  fprintf(f, "%%VERSION  LAMMPS\n");
-  fprintf(f, "%%FLAG TITLE\n");
-  fprintf(f, "%%FORMAT(20a4)\n");
-  memset(tmp, ' ', 76);
-  tmp[76] = '\0';
-  fprintf(f, "NASN%s\n", tmp);
-
-  fprintf(f, "%%FLAG POINTERS\n");
-  fprintf(f, "%%FORMAT(10I8)\n");
-  fprintf(f, BIGINT_FORMAT, ntotalgr);
-  for (int i = 0; i < 11; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-  for (int i = 0; i < 12; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-  for (int i = 0; i < 6; i++)
-    fprintf(f, "%8i", 0);
-  fprintf(f, "\n");
-
-  fprintf(f, "%%FLAG ATOM_NAME\n");
-  fprintf(f, "%%FORMAT(20a4)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%4s", "He");
-    if ((i+1) % 20 == 0)
-      fprintf(f, "\n");
-  }
-
-  fprintf(f, "%%FLAG CHARGE\n");
-  fprintf(f, "%%FORMAT(5E16.5)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%16.5e", 0.0);
-    if ((i+1) % 5 == 0)
-      fprintf(f, "\n");
-  }
-
-  fprintf(f, "%%FLAG MASS\n");
-  fprintf(f, "%%FORMAT(5E16.5)\n");
-  for (int i = 0; i < ntotalgr; i++) {
-    fprintf(f, "%16.5e", 1.0);
-    if ((i+1) % 5 == 0)
-        fprintf(f, "\n");
-  }
-  fclose(f);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void DumpNetCDF::ncerr(int err, const char *descr, int line)
 {
   if (err != NC_NOERR) {
diff --git a/src/USER-NETCDF/dump_netcdf.h b/src/USER-NETCDF/dump_netcdf.h
index b86f294d30..25d64efade 100644
--- a/src/USER-NETCDF/dump_netcdf.h
+++ b/src/USER-NETCDF/dump_netcdf.h
@@ -92,7 +92,6 @@ class DumpNetCDF : public DumpCustom {
   void closefile();
   virtual void write_header(bigint);
   virtual void write_data(int, double *);
-  void write_prmtop();
 
   virtual int modify_param(int, char **);
 

From da7be99cc4a31636b08411e8564860f1ef254f63 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:28:20 +0200
Subject: [PATCH 26/53] DOC: Added multi file example.

---
 doc/src/dump_netcdf.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/src/dump_netcdf.txt b/doc/src/dump_netcdf.txt
index 63568137a6..70111a36a8 100644
--- a/doc/src/dump_netcdf.txt
+++ b/doc/src/dump_netcdf.txt
@@ -25,7 +25,8 @@ args = list of atom attributes, same as for "dump_style custom"_dump.html :l,ule
 
 dump 1 all netcdf 100 traj.nc type x y z vx vy vz
 dump_modify 1 append yes at -1 thermo yes
-dump 1 all netcdf/mpiio 1000 traj.nc id type x y z :pre
+dump 1 all netcdf/mpiio 1000 traj.nc id type x y z
+dump 1 all netcdf 1000 traj.*.nc id type x y z :pre
 
 [Description:]
 
@@ -73,4 +74,3 @@ section for more info.
 [Related commands:]
 
 "dump"_dump.html, "dump_modify"_dump_modify.html, "undump"_undump.html
-

From fbe42cda2d2458e60ba3bc6906d4c6f62cac74d6 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:31:33 +0200
Subject: [PATCH 27/53] MAINT: Only set append flag when not in multifile mode.

---
 src/USER-NETCDF/dump_netcdf.cpp       | 7 ++++---
 src/USER-NETCDF/dump_netcdf_mpiio.cpp | 7 ++++---
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index be274b2052..7156b773b3 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -616,13 +616,14 @@ void DumpNetCDF::closefile()
   if (filewriter && singlefile_opened) {
     NCERR( nc_close(ncid) );
     singlefile_opened = 0;
-    // append next time DumpNetCDF::openfile is called
-    append_flag = 1;
     // write to next frame upon next open
     if (multifile)
       framei = 1;
-    else
+    else {
+      // append next time DumpNetCDF::openfile is called
+      append_flag = 1;
       framei++;
+    }
   }
 }
 
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
index 271f963a4e..29c2b6cb1f 100644
--- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -592,13 +592,14 @@ void DumpNetCDFMPIIO::closefile()
   if (singlefile_opened) {
     NCERR( ncmpi_close(ncid) );
     singlefile_opened = 0;
-    // append next time DumpNetCDFMPIIO::openfile is called
-    append_flag = 1;
     // write to next frame upon next open
     if (multifile)
       framei = 1;
-    else
+    else {
+      // append next time DumpNetCDFMPIIO::openfile is called
+      append_flag = 1;
       framei++;
+    }
   }
 }
 

From a7b0d1f521afac4f6b48dfbe6ba79c6054965f6c Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Sun, 1 Oct 2017 14:40:19 +0200
Subject: [PATCH 28/53] DOC: Corrected syntax for appending at certain frame
 (NetCDF only)

---
 doc/src/dump_modify.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt
index 2ea1da3db3..6ccf40a8c5 100644
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@@ -16,7 +16,7 @@ dump-ID = ID of dump to modify :ulb,l
 one or more keyword/value pairs may be appended :l
 these keywords apply to various dump styles :l
 keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
-  {append} arg = {yes} or {no} or {at} N
+  {append} arg = {yes} or {no} or {yes at} N
     N = index of frame written upon first dump
   {buffer} arg = {yes} or {no}
   {element} args = E1 E2 ... EN, where N = # of atom types

From cf24dd026520a5283589fd89150e49da3e4f0bc2 Mon Sep 17 00:00:00 2001
From: Michael Brown <michael.w.brown@intel.com>
Date: Mon, 2 Oct 2017 04:53:17 -0700
Subject: [PATCH 29/53] Adding pair style dpd/intel and dihedral style
 fourier/intel Adding raw performance numbers for Skylake xeon server. Fixes
 for using older Intel compilers and compiling without OpenMP. Fix adding in
 hooks for using USER-INTEL w/ minimization.

---
 doc/src/JPG/user_intel.png                    | Bin 20491 -> 19528 bytes
 doc/src/accelerate_intel.txt                  |   8 +-
 doc/src/dihedral_fourier.txt                  |   1 +
 doc/src/pair_dpd.txt                          |   1 +
 src/USER-INTEL/README                         |  16 +-
 src/USER-INTEL/TEST/README                    |  20 +-
 src/USER-INTEL/TEST/in.intel.dpd              |  48 ++
 src/USER-INTEL/dihedral_fourier_intel.cpp     | 441 +++++++++++++
 src/USER-INTEL/dihedral_fourier_intel.h       |  82 +++
 src/USER-INTEL/fix_intel.cpp                  |   1 +
 src/USER-INTEL/fix_intel.h                    |   3 +
 src/USER-INTEL/intel_buffers.cpp              |   3 +-
 src/USER-INTEL/intel_preprocess.h             |   9 +
 src/USER-INTEL/npair_full_bin_ghost_intel.cpp |   2 -
 src/USER-INTEL/npair_intel.cpp                |   2 -
 src/USER-INTEL/pair_dpd_intel.cpp             | 617 ++++++++++++++++++
 src/USER-INTEL/pair_dpd_intel.h               | 110 ++++
 17 files changed, 1345 insertions(+), 19 deletions(-)
 create mode 100644 src/USER-INTEL/TEST/in.intel.dpd
 create mode 100644 src/USER-INTEL/dihedral_fourier_intel.cpp
 create mode 100644 src/USER-INTEL/dihedral_fourier_intel.h
 create mode 100644 src/USER-INTEL/pair_dpd_intel.cpp
 create mode 100644 src/USER-INTEL/pair_dpd_intel.h

diff --git a/doc/src/JPG/user_intel.png b/doc/src/JPG/user_intel.png
index 7ec83b3207b06c4bbda7d56f2a7d9d94a15d115d..5061f1af2e26d9c2c1110390143d9ebf96946bd4 100755
GIT binary patch
literal 19528
zcmeHu1z1#V*X|$)f(jz3)Jvy?;2^D(fOLpRNeC+49ZJX0NP~1J1Bi5YNJtMYT?0rr
zbM}Dxs^9z7^`HNo|D5Z)E^%Pb-s@TGUTfWJt!MA;ColWpD%MRb5D0WtQbJS_1iFw0
z0-*$5x&R#MI`ZZOK2WR_9|(h>U8GCEA83a6W$uGO1;Lk(buI#bW135-S%E;;t&u+{
z69yFaAkf2cNzwbt_FAh60v~31!_tiyt2`78uciloegEKj*`p9)wwYMSXWiUChAid?
z=LOO`dM(K~v3n(XNo7I$phlc-L(`Zm*Z#o4A^qxm6n;?RcCw<4AOaQxg$sL~nr#mu
zP=ViT)8vyM6kq_q8K5_j10T_(_)fsbQ%iYx#njPCu+ZuL;k;^4deg?tf|==cxZ#{b
zAue!T#r=Zy3#zF#)k|^;@S2)Z4aDgI_4~1YJ0rfXI0ztN0^8r^!G6=JJ5~vF^w6bF
z5!NXV9L~dNs-~9puR02Wj;Qn<n${Z65-GTz9E$NES7mv=H}ArK(MG&r{bC`EB)!@i
zsT(^t!!8*vp_6J&u7B?G;UR%jVW>86x;+PQJlS8Ms=vwYLG@)=@Q)QPYiJ6=m`n6;
z9Zg?#C$_4jWhX%afhtLU*arON-gf`Ya5{|N4gWv&<>B-?Vt)a#uZ^H5@Aq*+4MFi2
z0D-nI4s6z(jJh>?9GaVfZr=lxbgu^fZ$hg3Kk>J9@AW(&OlDT%w0YjViKGy6vziHV
z01Kjl*e@{bzP-z^tNG>S_7L|8BjBo$<(>5;A?sCece4)&)C_7O$45p)rLNnP1EP+?
z4N8#v{c~g2FR8sBr?*llTS|XFHb1<74OGkU7UnVi7G@{<P8YE<#MX`ix<{{1=(KiK
z>&ogxWpL)(E{5HzYmB?gMw{O*_!2I_fDFV|-J1Tb+{d402)2x!FM*z3lZW5807Bfo
zz49q@Z;Fi*-j~>Y*j=aIOCVw_d?)$%VgHl$bu;JKi=Zo4{RJ4h_?ujhq$^zRlV~@f
zg90&;hq#(fkBqX{j-%e~?%xK3o=PBPW-#s^t0e{q2o9ryye|UQ8OG|$`BKO>;G}#%
zkfDnM0&%j779fs<P9<+n-*+kHehw^xbbX|x%1Fao!LbTl5nnXEKK!Fq`Yqx;v-WHI
zlmehybEN&!%J8W50X%5!7S7B&J0K0%K}eK2366I3jupR@!M1?hV86z&D^@E<AVGI)
z_xPn58pvJeEmc~M0WlC~+$nCMBCvg8KzjNI3g;7z>KDOz(|~q*zDxr(t(3;jfqqDB
zr0cG!<eZ;c==y>F>qTc!ZO9dc^g|*ZPe?F_Ou>TNA4Ebi7$D*V${b|`l~4W#2Vj1S
zkbjM`gR&E-HtWxm6$-R#OoNjeFhEzjen<rrsH*HUblf9zIwlnaJ(c&)QalMDt#59L
z&V-?YYDxdRUZwzXx>;=;ybi!B{_KaI4uZpOZmZhmnIgyYWT2<YKMTm4%+*HB@jEw=
zgV<R`l(SF8uytniI+Wp<IeQG-AoqeBDN;LZmX}B<^tN@og`CV@<;?&f=>HJjhV)_W
z*FTkzD<BejTjIJ$P60;&iQE>AT-Khw1{Q4QYSKChcUYzcfvz<CFb<EvX`O!StJ5yX
zUE$A_Vs$B;Z%_zD;(&<5e}Fd6xO*x$Kcgq`iv`(82(J8C{S`85TsVF#D%#|FK(ujj
zUkU(D;!nFXU<fTlvc@T+^D#T{&EucH*~*(RQ4vfXQ6f6M33^q64szEb{((V?ZVjRF
zDc?cxa;c)*c1TSLK)f<PD_#?2cUqg6Grs=-s7?ugTF>8f%B1BW=Sm5LVxsO3^o9tX
znAYh82Aq?7_JzH^(ZLTlNGrhiu((xRQZq*qTM#H77j0|>zyL>k{AnOs(`lA@%Am}>
zkFY+;o?1<S+{tGF!lYkK+yF9!B>o<2b}T;n&eQ~C@!SvlO<Jz&2|-pXB$2{ihl^t<
z(B{z*<sI#$(Lak!?Sj5(9&j<cC*&;DaFWO4_BQbQ9lq+WyTq#q=_NgM)pWkDTDdg*
zrbfApd!>B1l?`O?*|8ew<bmg%UvQofO88DBHT-mM+wHU6Tq^VG-|<4V^IST`T~TVO
zpE}*DzXskt53OTeZ8S}3zS?&KYh;kjZp;bU^%eta5YFq>qDvw$*m=4ESg8uqCPX^=
zM3fGgDC{&XiBt|>s#W?~5^53<bWOFpPdq7&;b-=u14;Up7^W5Q-If-6^#Z%ec0m;F
zUGy(X`Bj8Emq?Dzl67N3Zg=vc>oXkCmsh_ne4J+oioh5(!u?@eA)(W{RfX4)WA(9q
z@T3yz7pq@HjR1V6tt66-@fPiuwXJ~igpdBSAUu4h0~F0L6}X$5+YGe;DE(*co8KqF
z`;6Cd->`i_G`rnIy@&GiDjco_B{ffzj+Y!yUmUOcY;N@JgYZE=3+E(oI@sV>t1ROr
za^k@UnG+f3i4oT~pJ#bxF$`}r?u23Iw~NeJfpo0@8CE@iU9)f%_?djPU*x-dsA=A%
zLzyhZ_zh%}f)=?MRC)5>7Thv)IOY-rU99`bMa}q46LbfJ_ku(LxIn+awMy><y65n-
z&BzRAKrbGmx}u|Z&Zb%#a5uzpopuBLS63AG3s(xNPERNhG*^-FUvZJ<UJT){;eSd8
z?WZ6}oZ$FUu?uqUxHf8ppwiI;JiEa`$FRG(0H@i&Ss7g)<JceI17Wc@?vB(*z#anu
z<s5iJXgF^`NTs4`Gsx^1RsJ<)@zAMQeKVj7z?mxWq1#D27Z04Y0S;;;)^%FdrZHv_
zMqpXFI(w$H%7t_&!pm$^l~F%mABpZaxcN;h;GB9=EJlA{(i+$n&OmAh>gUAM5>=r5
z<QH7BVn}14vOLyx+uhmU+GMM_Uju3Go<@~-|JyTeD6D8^YX>cX=E&WHm)x9a4~Z~I
z!jF&KPV8<UmT)|v6vGY8W7$>LB%;--1iIR>bi``pBA}vl%5wRQlhNqZkQComBEqA|
zW1+ECiWM@3T@e1;zd8rgzlwQ;aG@#AEn|Ig>Zbe(%6{}WeK`DbRt&by^n!%_si?H~
z+sHxk0e|(C0y0e(zblzlZYqud&!5>^tQ)*uh59+<-@590Q?*ee4h4EJZ5J6r+<h2^
za)Q8_etU^4_+=6n&>^miZ?jzo(<#`D2cDXAC`c%X&Kftl(rHWyx-1COm%Lnajd}_&
zYsFiH3Wh&>qv+o6XI}vt8yQ|`;Hb}NkI+<~RlLw^x3GZ9VHn0q9<RJ7C0#k-u3wCA
zVcTt4Z&k91OKH#hY=ygmhkGhQhEw(B@buUOycsaKU9)@-*E@O<89y{R(72u&2oEv}
z;!|K|1kiVzyNp^(w(ENO-^exP_;4q{@-0)$yC5eIErqxj#3Itx$hf^8Y5xsyUx|?_
z2MW3S2^eg<(?&ze6Z&GLBsJDY+Q~yrt|$Ig07X9uFIz6<UL+`r@#gq$rJvys?13!j
zbQ~ut8YMgQQZI(L80b*|YFfH#yHC&6+<pu0V{fIpldB_7QLRCJ7F}75oT+XkR``ne
zxIC@}NyMu8F{=pyPDTwa)0$ly)y>9m(ED;8c|h9}Ij$ZcjiN<q6Mw`Squ?;fGsv>O
zYSY^4(}B)cBhn3vZYh3eh%jw*EKW9)df|(Sb}7TNYuB|)fpO>Z-<KwV@AMVzv5gf{
z%fqDS$n@dxlj7(69{LQsUoi8R=kJ+h=xhu1i2e#46;&p~E*O}TUa4?nyyU!^itwZR
z3j;Bz-T(gge!4h`pm(O&KxT_0@MO*d?k8})b=lkwOpj7rA0QI1{P7P|p=a1l8LUvO
zA?-tHcai%Cz{o1VOUhZLD(x{qyr6f`OFW1SvjMlys(^_hpt;9D7t1!}`#YMl^6-+$
z(ovy7K6&`j1>mYq6J@|ZodolrfA<fj|B2#Slv)4!-1wchgL`O?G;w1`vjBW!nLCny
z#}}yep9LP<h&?w=1^oCN%I@<QE00I5-`QWP?5fRS7PcZ80Fbis;`tk-y5xXS!PV^P
z06}sY1nG^kdeXn~hG9Vq&f*UGKh){m=l-9^#DReC(a^y@3GjqSDo@A!NVY<lQ;iqK
z8G7e9oG_<7XT2?n!161m8i<emapGIp48}G(66Jq<-d4E4;??)%R9qzr$VkfzNhxck
zIbo(5@mIEb`90CDH<9zB{LuVRRiZ~jKkSs?x`|j0&z<iHbskUuWIfRW3^{idRnK{F
zPA6C3DS;vvQB0bfm#BK^OWDrv36ui!p#Kwb1Qz46-*Gp8L?V0aXtaoWXgkMmN%9@A
z8exA&s<P-zdB^(arh3>WVoWVRlJYNP&gv!#FJY#ZL3lnAWRw0NhQF~Ri#{yC0V|z8
za2`DW`B9o>$9GPq@fWba@C$Z{60Gx#|55b+T|_7OYTSTLk6@wd>VvtuJ))w#<NX7^
zK^V#prdZUb6cY&5=ul!1(BUfK@F}K>)cesvD8P@rrA~sAglg(_<y2hW&9AN2IvRs*
ze@Pw%IU1rZd?`8NPLq!0T#32p?*&Ez|0q#S>cmX7!KPp@uCz4{;_^$b^y9TP@q6{+
ziCd9aJ!e+rG59k-tF=q}SK8OilSak*M*4mXM)HvzDlJv&AvMeP_b#7vA&|X)M*%9T
zFk7>U7A1K@{3BM}^a|SoH}`+z@)Tb{f30M!b!z64oRt+-I50Q$-27M51NEBg2ko4b
zN9}uhg#|eI%l}$V<G3^Iu5AfYREW&mGd~5}f_^5czhfp+&$H6|jr#wIL+kMV<aIYM
z{2ftC4!PuQ&z`YnQh+0aeo*t@Fxx-y)^BY3KV8wBkybATf1pZc<*;L<w*)#;4QKpy
z{GQ{Q^^%$>g#lRzbLvSW$T^q&^y3u&I$h4u&4Fn6repyHAqUG!-mwE9Tob8(gmDho
zlF~l~DC{{*9ru?wQWGVL8UH#j&XNAVYQRX=-_{ltlUW(AzlSK~Zws9KfcS)H>fb@=
z-&sJ#L?2h7)F`4D4AjP~mwLFC$X@t+$@_b1{THqR+&?ms)flZX&`dRp9GJfM5`a(Q
zp5p_3j)IB~wp8$q9nlXm$kGWMr%GWEF!)%c;Kz}g0}*RQjz7p!`+EMJ?CJ#_{<39L
zr87Jj`s#m+$TLO%a>@H!Dw(#g*b_BuO0r*4o24j3ywFiriT3E#6Bh_IRzjBd9PkHU
z20#-N6J_0Bv3Yxtqf~K^SPoh~s$3Mx-NVrS@%PYEAyE^>2XX@dmcu}~sNmAsaD*g<
z&&~tcntue+tB#wT+x+Yau?oTb+wEu|??)q$El3W8g~}h$M4cp(m@{YOvFF(wfoxfx
zS1<RoUTY42w+23U&h1;jC5gg-I3cbWkOa99slXd5CM?2dv#bKV4v$OQ<|WaknKl)>
zxE$~%-A%C=LU9!@9spc`o51Nnn*?NPfN#onbkCqTZ8HD>A?4ZFjqF2X^LUFcdv46;
zRLH21IN(82q%Z%L&Hxl*q6&^3&y7;A>xrw71cV?b@-uSnR81JUSZ{ARFrJYEsq_<9
z$nfep>(10f0i>@0W7WUl+jC-Op#m}+9T@t5jUi|MCr=Fk1A#L7i(AgiTYv;WA1EZ@
z48Qs!DTOfl2B2Fn-mi!PeJFqk6_TTBn`wT7F*EC;fjAI7+ek1t0eb<^knxkq{ugU;
zU;u&i-FtEhOGZG$e;o*mbNm5UpOuggzDW86=DjoA0LrWX9_0J@k<|uI2WbDlSjMHr
z+3loY^Uqt11-F@8I%{(t0ucv<Ex>31;K-39QGzfO$OBYh$Kx#XumN<w<<I(?bvD)^
zpeeh=%&oGMyL#6Q{OCb?-i^mLl0crHClmnrUw9Cx44Bm4!Ws3Rm->(i(~9<_b0v{O
zVS(VkuLs2*r6#_G&&wy1SEC`vDRlq71hUlrDAwH?*OL@d_9-s{F~AC7cVztu3ZVb7
zN{cEB1fT#Q*LeR4EL2o4e%wzb`sMEFm>1C_WB}<yd8>^s0uj2YN~T@F1#lWb9!r5K
zK-=;cSqKn<&NphV*3Nq3leBc>fc2>mm_*3qT%59|&3kJ9v!v+~UE<KgCK+W)1B$d*
z8W6ee5XlJQ^MS0%B=kDa0$L0_`S8YN1HG2)c8mIj!VT2BhN)TGcTnH4`t|BHlY)y?
zkR9kxDSE#DHcCayu=@x4g7Kdz5Z%yRjGX@Tagmkbsor&fq|WxkEY#D`n#dc!#6wbg
zhpxOYrvXf~g25gkTl1d?<9r%64Tu2luo1@>DkvYuzEX+$rzSXE^RNQ@OKEfFFNXnS
z*`cGSj%@sX&f2rxztUTZuXS;B&AY^bL~s72N_NJj96k_}-}&Thw{p5z0=UEITgO9!
zbUmB4%^B(ci-1%*4-}kLr+>}NjZKx7voiW0<$-<T{~-S_X2*ZR=34{spM?Dn+aSBM
z|6@E)wso@8(j1&rQ)bb^PeNfy)-}ukv>5gWIeNGXK&LaGUyzI3-u$_NKD(1KM#u;}
zv_+URst(*n-132r1JB5W@cmmg$t6#I>hy44FE5ZS+Hgk1^xvyc3Dbfx0qrtu&*qR%
z|A_9d<kc`>IL>zqA!bv9mb7W`2~ohvf3JRQE|L4GngS2(NO+#&J@v)+f+TtPv0eb~
zD(Zt3<?xTl6(ZGLWF`O*uA>0k*ZCV(>^)-c5V7bE@nt{VNm{72S<@5pJfx;wbveuz
zjE_mBt#O~~;nEd`I;Oz(uw1~CFu{TfIzez(B3A~mk^K89HDti9RBdy2ohT8Ym(FdH
zY<mBbQ9xFGBlvW}6=o%rg+^C?$*j2GslTiN^^RzP8{Y|it^PYGfz;Hg0L6v}`GjDZ
zTOR|<?(~r)Eg(jRmxOWsuBxB?tp^1{9)K1#34m=MREDtRC?i))l#WlItn4lc-XJ`Z
zdYTfzo&bTu%+_z=t#c~CyWb`<?3!gF(4vh8it3dZ{7+tw%Ejagyx*up(VSaZ09#JJ
z9(S!Q+~$M2!u!^4YHLGXs_0(%vDRCI%eO;^n=q+2jfE-<GCEAnz7m{c@MHCK^Kqs<
z0gF-kgNp?<f<lj>1#4+*<~+Qtg(hVW0*pq$S}!AuCcg<&o|3lkQCD1TV-8ySOlFc@
z!aer%8#YTqD0J#Jx%v3zCYJB^9o5IQlPG*gswgO;rpTghj*a?X6=XbIJvU7Y-J-s&
ze^jaSQ^(x;>W(XBg4KfbuP7Q~?NpZ{6$;^7bCt&Em{42$L~_)G4LCZwOPtw5f7wr!
zNen{s$>7)ZL$F@pjTN`M&<~KzoE{$6GVM-kPpDj4zQt`|Py<5&eAYMooHoR0bn^iz
zh1y(Db@gk~PlmtB_)0*YRI}&OzZ&BS6CP^VLVX9@Qi<OA_<R)VssVo+<MZCEhdTIH
ztn5dw<A|lffVk^31rYo<YDLnsOI#T-XF-UtRm99^VKmG3yJ$mX*Ru_6if5jMIL!HH
zY*T9}N;I1A@21NoRCIGPCnRp$i#jSaikKy;N*yxXaXP$p+n9?yE|yj8{$r6qW*=w4
zp0z>d`pZ0+Nfn5DV1pQ&E*me`Sn@q^Np@k^P%Dw7fC79{mLG+IHcC4rtZi{bPzz3U
zS9OuFBLm)R`{Wdt3hdZkpJyr5B*HvC+6Z0Mk@SkZE%(Sv->Wr$qk##Ja;2R@_G1hb
zx{;qaKh=lvJ`rMRR@!9abz30nS#4fg#Q6A63vr<g21c}%&(5{SCa^liP$#<`AGdY(
zHw)<BrigL9A|$hG<)s*1(QAomXR=OIS|ogOsb%HliI4D2<6|a=)@%v6)O+uq#05E-
z*hWJ#4CvKP=F8I%k%`KsGVS-jc-)@4+bwWmG}xQ^g&e6yYkrT+upl$D&*q7%=_5N#
zC&Ucw<|cs_&uprnt@p(eb`l}I*Tz@w8W`8fj84ZIQWTlSys)hnua*Fq0AZ_(nNL4k
za1obEM)Rgm3#&zNsz3?kp>YhwoJMl#qJ^EE_1N-rRTt42TZSY{Bdx@zCN@4T)C{s>
zvBA&ZS#n={(nVs@xhA}Dfen5+SAq$@yXxuwm$*O1EK0*iK^81d*|?N9z&D3r5xtDn
z+1Fl93h~l3KfrStTdf`Eb{RF24uPPI2AUXaVIjt*Mi$<0-PsgF5?BeIBq%iaHo=KO
z!DZuwphrTT!TR$=4w$ac&zi1=c+=yHqEkIB6`uQ|B|9NACq@VH{#;2tkLHR@*#evK
zI!`??wrhIBu?Fy-2xhVB6BpcAbXd~L)qGYs+4M()_{>cfqndQRb%Sr}td0j+2yU4y
zZA(27kH`GLac^!yP`#@rp|kW)uXZ_HfM1Rqwp2>XPD&nqm#Sinr)9Zyd3lkxva?Ls
z;sAABoE|G!aGusT*N1dn`_qDQnggnO<j~C*{5{V65t0aIfoyq!<T;NQ7ly^1U7WPY
zyV?zE0`7Z5n(sU&OyKFKU85koK9lNK%x_b&bF)ytG3E~vROPi*PD4k>!GoIzRA&s!
zBLj+bZFv})!3t@Z(PO*zVvUx&CZwU8M^52PhlyK3dOa<b%{?>rgowv@L=J(es@Cuk
zVHN9>+&dGC*uu2^?PM~C+DnpNhnQC>x}+bMmnXTvy2{Z7yAo!+u8=^C@>wjD>{`di
z@bey|a0t>g8{(}_dTN!;q=%|plML@gx@<OVAvC_EHt#2R)Go-M9xT~$u{?D9Bpa34
z@#b@XqYqVQ)$D}!*L2d;akK?SCRVpE<E=UjV`~-GTz8CBc@mq121SQLOB-Ch@FrhV
zpmUMPbjHo%ooP3Phc0NZ**M(E*I;X=HCG_%4Vfk%g~LDCN`4APH7ywtZxuIcwd)G5
zbK~#nEuOL6%r0&z??xLY{=+4_t8Dk_NEfd*wKmOJ<>7>uqeonOK^UvQ)0UnAc_vzI
zkfQ3}`CgZogpF8?c6(-zvg5pRmh`z@kS<>CdZ7V7!TG~l```egC2reun@XFrhq=k%
z5r-G#!%J;;tftc|XZdp|=ic^6lergbPIif@1`)Xiqx09D#8ra%>L!VO%dv0g+{P!T
zbs6S@tW)H*tnA%r!iE>m1iBrxXc{cK?fTR@GQP%tF^9>`;y3+lDAD>6IqQw$v`6N<
z^<t0cJ|W9C!uFdWOZ>F~*8@YvaoJM%t0Tf?vhvk%I|*KVGrU2O)J#62wH&_aMXD<X
zsR;@xHf^keQ`jZSx5Z65n};tGsl~9Im+Eijh@GI<w!|4aEOlqy;drRP#*}Yr!mVkH
zX^7~NT=m2aKu+MZ8A<Ur<<MapeL9@SunK6y`$8ht54Y9MI*o65JqgV;AI8{&Sd8;E
z2`=rnxODS}@G;ipJ3^yoh$w%S($qqqY9DR$7iMkZmtXygQAcviBw$<E*l!nIa=aKv
z{L{s;<eu=@U9vF{o~CD=@kdUn41a}8IdPMQz@PO9j{#Y3%u%WiVQYGm<X#>@AXdui
zXI=`<=Y}^vpglDk;V0=vo1{yY3KQztWRBJMKLj83>O5PYe;;t-PF|uQBE9BUKb>)q
z-!2BTaDHi=0a5Wgf_c=_DAH7{-6el09iS22vN4>sBX;HzC`=*>Q%QM(qibTT-cqaW
zyv77Yf^;#B1D<M&(M9Cm*HtfL9ofCyUN9KF<q3A;la&*d4D?Uxdh%e;TCA-`AHh8S
z$ev$^AHz&IYLYD{NJcLl+<R|61GcT#>_kJ*YrbJDn^zjUeNUkmp-|Y<*cDlL1&haV
zK3MI~<(hM<t3N|3BV~3k=IRSQ?Ah|NuxWdrCN9>!@IKY)!*zY_kBCT9fR|N2z~B-H
zem1#4$f$(J?3`(vjF3U)1ILs(d)|ya+TPcF52Lo-J{xoKv_*s)p!RUOeHQ)}zwgo5
zAMFg%gm2sxM3*0JA5sk|wq7fR3pQ5VsJaw(LL)XeFm|Vr&74nXYQ3L)q(ht$27mEl
zwC(W&#jlneE~pl&YfJ4-a;E97V^gJAxGJ<uZt3b+c750ly9+q{Y3MQNx~(JB(SxZ|
zUmYq={R%ipRw)y6P8<gD9@=I1op_?n9AY-Ll+O7d1-L)VF4bs1#vtg=^=i#+>7fyS
za7uQuEqeGlRi-k`7Ea%<?;QSVxy-Sg%rM04yzP8+!Pl$mmVQZ)W^8%mxaG!OLPMUI
zsE1kaBlY$uTk2j98P3s&J(IKH<}IqS_W2aL@?m`8Zbsj$rxo-%&yiN>-qp0N$Z=5^
z6mugJKKLvdMh(C03DlQcUYkpvMxIm2h4YKhKnwa3l<CL2n-8)!q&viKQlgon<&<{s
zf(`}MD}*(U(=ft0dybmk<O$yO3h>;|Jt>?X*hO38ePf8OXOQX_U!xc1Nz`8|H$G7Z
zmu!8P-d2L_#B0cyQI1ur^6#{1@CO~hLu*AtideIQ-<*)2c2+)o4`W=`?5y!-?2T$D
z*;pf7&BjLUU6HtC>+|{~MT>6*Pb5f$kvc=F7a_%s@89wWXU)N;5=XJ?+UVDYhb?Qd
z#)3=JxP?9B7Pqw@a28!k2yGdQ9Uvf3!|M!gciXr)-ieL+THUE+r)%W2h4?m!Oxg>r
z3;89}wHCx?+Zw@E0yLW@X~@pIMGB<XV=BYMb`ag2Xz<m4=)+_~joRI1mgHx0w2u=<
zf=(`PF&{GH8}BOnpxjtlB$l;=Ft<H&{liLPR!8p}l9d=yrX~!Aj~1~!_ghd2cU3<e
zqy;?a5PztY_0omxdHU!r-e+zO_iKhJR)E__pZZpVOSKdP3oMM*i#LzGhbX&{L7<V^
z(BNLBIZ|Gg@02scb#Ts6dbI8Z&r9gq#bm`xMoKDY3!#1(?S&*-ug!?~u}Mv%yLQ6y
zBUQJcI&9>2bGhBFEH}q>(&#CSO!#g9XoA^jyZWXqo%wx<@B63=QRN`<EWyW8>cv^J
zy9?CM^;lCwO;idB!lRbK9Wn)y@o2qkRhBK=QF-lmoLz1x4ypT}PA@IVx8q=}&&qtP
zP=!+{vzx9?+7f(Xdo*3n-`pXNH`TAZ><Gsmrr@w~!l+m{6fV0-J$cUh@6p}{j`rHk
zJh#kVVRwTXN{;Qo?e;4&xvb5?M1tbl$Vm(tnGPL?zUEN{Et!k7KUU|jdnC7>gTk{S
zq4l(8y&C6Qi%CruvHTWC{f-cJQ-ipQq-m~ox$4A#faxi5dG=>^lM4GU$qRvZne8+U
zYBDOT7v}s57)j<#hBgv&ywZa=@cCb>zrps=m&GJcfikqkKbJ$BY<2qbbx@I*G9pjL
zLX(`HY6<JjQWQ@Nzek@&r#9QBK?S|8ki`Nu0wZ|B;kFmsY`|HlEcBq|E2^zsGssC|
zsw`y}q;V}_Az{AMRWqC5SIxXXaaUdEVxQ&R3X{+s#hn;`hWDM2vW)Oys3v=9ttkE4
zqkA1UzI8$lU=-!sdF!S#MS>|`mwWd;8>=>rSy=b$FPcejyOfoPCo-^@KEcl9zLl?~
zF{tXCasg{a&!2_6PZet2yXOzt7@Z3!Grsp{jiywz7AJYJGCb+Cs%=d(Hbhf`*KatQ
zG#mBYbbu2WkG+c&=HouSyJ|YN`RU}ATO2~7lX^yDAn@d61>7m@UHG$KrraTda<Xvl
z`xc9DqejJ#icUhJY7BEw)R6Ptdk8&GA#EJ+$loKgTu)*-pJ{=5LD#&a&HdTx-=stl
zlwY-&L>pgYEPgyeksWY!lKX6QyF}Wt$Sk7_DJut}RedYZwpe%dgv^#tu+gcll}#3}
zziXiF!x?{ko1y(zuujH%?%!%GuyS?FZTXZu#6uc~4ir=!6MP=(%QdW<_Y1hMcRmzy
zoyttJpG$&8|Jo*$LPNlHU-7z&E6T|>_aQ8MR~5^265(|gz_>{~C?0BdI^|eNaIm^g
zz}L5X-^j;g%vHf9P!~t*9s-knILD~zsu%PvTgBT^96NhG<)iEx?KSE=cVGZ71-HMq
zSSq5LP;zuL^PL?V9gnnhySGlGO8;xmv&O(bj1fLY;sab$3!dmKk-J;5pg$~eHrah!
z<CfQ+?PQ!27?@$HNB5nJgsNtKJ>%l{eH6-XeIf8flKXF4CnX*#s%Wr%9-wEbZKgsF
z-7HFQ#7>@Dj}7n|2k7e+x6=dqp)dITWzq#~mbjliPZvym;je#}(uWwk8mjr*#tYEF
zB~?&<xD0m126O}-^$!1TfEBl>ZQbv?DNAl^-mR>^=t5cqb3zDio(5ApA@nWpkXLIy
zJM3Np9wYo^CAD-IVgb|H@jk)~oxCUfata2*ZUp-!0b)CuxxM9CbtmHEK5u4;`rC?2
zV``bxwZ~C6_U>2Tv&c1p^?bD6S+}*^@lf$!wx&msMdK8X{OBJC?XK^{*VKmvF_73%
zi6}-8E!`-yZEH=pGGz$t8u5$P+@5oQhWMuo(3r|@+v?4D3r}xc>mDIaSo9JL#!n)@
zo?a=@P!K$SI4_DJPDYBix)OXvGL$XMF-RNvAb;7h;YfgXjhkzmh3SVO(|$^eCTZOg
z>iVO*S9|=%CIv`eV%lB4Bdx{AFLP_dS&ilB#&&rksQAm(salIudmkDKs&Ep<@)13C
zOI<x<33M+p$6j00XuGd<qH)~drba93`%B6!>y#FHDP-2_Mmx7|ibSaoKd;cEeW@!E
z)=MSRvL>l#D*f?UzjyzKT|A%NkSo9#+CASiAuxaHhRccr*U*_w%Qb#u+~7xznWaKB
zIWWB=XL=htZw}>Dx5_ek5jfU$byzNMe=lfSK$*R@CZxQ_LNW20$o|NbjgTaW(!QbQ
zm7=8QElE@6D->^hN1o%fCBJWFStUBqPbrcK@+79_R>YTW!Xm$3BKm>%VMzX9*ls>#
zP?R$xPE=pQQ;G_63iiOtMLqdN1e=&pT}+82QbBV6lSdNcQ;xHSrS?T}(>sr$Q(v#>
z4~F|h4i0F{2=3q2@_uQuF}k|oyYsPPtc$Xtc87oa8h>l=AAES$FNv&3E8b;M^Ja57
zhk&ZRFAjX194h)`SXOW1HPygN1x~!$_BLO_P$L9XFE*XiMS_)DNJx|l=}b0eDlhIO
zzrJV8ukI2gqF~?a2*ppQk5!nq@)pga>9vCs-`9wA;3HX8%NR^m^S6eH+(z!fRP5{5
z2+X@yCTM&$kdFAA`T*bXL)NGHfr#PlmoK=Bnufh&Kex?MRiF<K+*z~xs%6|Tn2Nd{
z<5?<$dM8h_(SeuVk=K;oy?n+0<rpoz6g!8p38^VG9=3RwBrGjoqWeLn>R!RfwDX5v
zyh1`>g%=@`!^-ccC@s{|Lhf^ae5Say8O3WNr0Oq)XW~y!B|@cq-C-HFQ&ye(wS$3v
z>dTUek0Z7s9F@M39CtCSUrwXGGWNEU)3cWWg%iQrXz__QIkPvk({0Rp#C=W=cECPj
zP!FHVfhx4muel?t-d%UCdu*a$n#nNE_*EvkLjOtuBg<`O6!H%il&*a1a5YmJ#yI~w
z{4}fqX>t4ZC0F7{#RQFKnzQNOykZW9E$U2iEW@6KmJlS86s01sMmG5v47;b-pB|GN
zl2YeVzcLzw+L6YMOv{&8X<GM|`wp-uOPP>Z`dU1^xjD1)$<mkio%*<hd-%d*_JjyK
z$kpPqgp{cUCgEFcl!g(6F%s_d#u4q)C3uE;E4&Qx{sjCqrgB{Ox%>B0Gem|c7vFGY
z^ooAciV>m0CyJ4`(41ZMdm|-tb~UnkyAU2dGx6W6J?;f|lD-m~_UlYdncLs-;p;Z`
zFg8(WRJp`0T>Mz|+DpP>!IXqJ{q7T~QxQ`Z&08MzEv50G;-`Dj&d)X)N2{cl%ElxV
z+-^Ri27A#zc~42iz!G+impjI?5fvOOyNY#-!`5_GQIO!Flf+o`B3XG!ghlK4myb@c
z=0;H`EN;kr$Yp%O7kqSH+Jp{zDYsHtyo3Yp%w7Qc0>emM9P|`K83|Rz)0qXJK7vRB
z*kd65cul9P+d_6C`b59&bOXD*|1I=w7^90=NOJ4_Rx^|x%Fuy#aQF59>lNT^BfWe4
z+VDjZAq+fCG~g|zHsiC_D)Vpcw(8eKsGmN#dxYRHKS1D?ij|Ez$kjP7oCNbVEed5G
zsY?8^g%7}oSV!zk8hl;4_^xYJRF2h`d{Wy<`z3)_n3kWTv>2B~C%>#&$;?^>ZcVzq
zKUoDV06{8=h?3M<R{@m-gl0tZVdiW1m8Li!YI+VwKYRf^cjPisxP~<+#v1Lo=2uzL
zD|U#+jlU2?-)f)531ya^Q_+~Z>St#{BWu`+U-lIydYR&u9E+D9C8Lz#j3gZ#6>3zO
z%06Q|A3qWIoC+x0z#CJ*3-w!ZGN*^Yy_$zXR^-s}ERkvdm`2kD^UNT=P<PmsC*6`n
zpGC&*czk*f5nrE65h_dQGP5%`Al8M_*Phxg<f}fNn4g_+z<<P0#9+A8uQPr}YP9s}
zavAexvlKU;nMnGeWBi8vf`;nin&p~Dx_pqY;KU7XW9!c@t7=MAO$U9Hn1T{ww{$24
zd*(y9(hGZ(H_X9;RVjYXq5Xvp1(`RF4<#p9SmajXb-@im8J)o<cP-M$ik7EW#LH7y
zI4LV*8`7^A-tifE(>%Mn?-gv}GL*npoOE)CMK|7-pZ6ku=Ar58_8#4y=qNp=`JxcJ
znU&=_e>>B5`X#sGYL1F-MiXkC<ubO-^7=)Y<M3@Mf%-O)CZI_(wPfv4T*SGgl38}A
zCsYmZCci}}psS=<1FrUcSRHZL{VC%>!}_&(0>OffStP1SXQ;xHVP*r}WY7oIs?!5&
zzC(It-Xd2<)a1wopQu_vCc@N%qmrMf!);9$E$CmlxY(7F_7DUUG7e7hp1>v%S>Y|z
zEXG9mtDguUp(GSE?iJSNJu|M2wu8nU>G{GuktwKTFIJ%PBjW1a^i*uj*4%tD{Irfl
z#mQZU7ULpzQDioJ>Qim-1uMOjgF(YWTmD9i+Z;6Q0$#7s#2r3f4!F{mYz&s-5nZPI
zP^vTI?ZUT*Ru>-&IY82qrt^o%L<^w_vM)sZK9`+x$OKY8i|r|RI^3D#s(LaaW5O}k
zR5s+OuIpf;?dIfvQbb)ETDs_{<}k9dI_WC#B+>3<#Jk{<?<nS%rQ1EaL><ixO$5A{
z7y&zPY@!%gdi%FxT0db4JaZh0Fm`lEGdBl^kCO9#tX1-Y&b(4iXrc$l7S4AmZEX@I
z$zJu_@`J2(RnrOwcxW+ws;A_M=oAlDrqj0sW&qoiooJ=P4J?4C7XD}w%&%L)eZ?wy
z(n#C!jxn=DHKUlJ|I><rrex)G=G&o`hJ8&;?E>D_sx%_OwOsT1);OM3Xk;R2y$;;b
zOdLtT`o?omGwiWOTJ{2GXcU1so*XS+!g$jxkC}OQ!&SyQ;1Sj2M8<$mR7-S%iC5$X
z>39Tb#4eMtVOj$!QOI>S4BI>MU7wqAjDe^kt20n(UdF2aUTI3}beoG6c6Y{;`8L&G
z@4bgY;zYY6In~Bw-N?q5ym}7AFTjYfIF9D91+y~0N`JZIa#=;pyUNtJ63YI@UMt{k
zmoq~+2|2kZx1d6)mE1Bt6+{gvHc`o~#9`APgdKwAU%$|E20FGv1*+|_ux5boJ4Pk9
zY&)81&gnb}wKFR6jx*dqsj_=23fpwVyTv>e8xiDpDR16<A;u&m_m*F2SgD7_$qb{d
zq>!sCiBK&Y{=@qhOqy9$Z8^y*`0a!{>MfGyBql!NS8&fOK2GL0dXelJMf;{?qqg5l
zWHuqvlqOMr5&MQ|0R>ak3YORF8LeTgyllNSSm}MsVkdH{IZjVz<tuG-<-&atBFPb@
z38(!ny~7NeAGvs;OO@|GS106Kc*F`b8t8=~^Ixq1BjNLZPy7FGk$<6d0b;9LXo(j&
zx7efSRBCtt?<F{Y0B*$otM%i5N8LmYBfz`4#}tSaMAlnz!4r;}ImGKi#2)Ir+twz-
z-YJ_$^LA1n@V?Y9CA(0nAhY+h#O(-&puaU!r3fq$Pn20xvm4aUB$exgW3J)RZocgu
zWf3*W)A+b4@yU>ZUF~>Pv0lfZo*rLe2?fp+=3Q1wXvjhB-t~K93n6a#@w>7bM8#H`
zQ+_;r8PTa5IrknogkCnv{M;74#<3=C?K~J*sD@)BexS^?(?Y@2bTRZ3q`pku@6aJ0
z-+SKw#5kp);;|JIx&D-;;2XlW#hnj0ZFkn+nu8O0=tlC4D*p&`#MVMiGY6fvE-pZk
zqQxk4(5_JMArTa}Ad2cMBJF!6REIL^6u!=e!FYNfrWX{jYW6&ce)A&S7bDUobxgwj
z<dw`|VKtEy?BHp@3f9bCeq$@{Mhjo^$JUF>z(b&NQ)zQ1tV0CNhB677!JOpF@x9s4
zmxG0rW2Hmiu^%dp-5Yo0(+brctXt`^DTp%pXjT<oUWf>Zi6AVy=Tf<&yB1#-(Is9t
zMNF2EDT<Cvu`S*8#3Oov`TA*hr{@Ms)|C^^+urM^Sx@9jLm!F}a9-0giJ>wux~#%j
z8DPq35PbcmU70fc1+}?X;vs!r4K?dC8}mJ(Sg;Bvl-}Jf4o|iSCPz3^UcWDJh2zv+
zdVyamfF{c$9Cw7Qc-Gt~rQ5KRsK7=>P&x2Sb3tYFC{f2l=K^7$hM^Q$-oOkz=ty+A
z%~*yy-5VlFd7>eS1z`c_HLUbPiwFGvLiNr$cad7-G`TVdu9*v5UpRr*y{L9?*nA=2
zuzy1f@?<x(W#hK&yRcH|SnN&OHOWqOIo>jcw}n#!<Jm49uS%CJD+|HxeV&zOo}Vzk
zK#yih%aSJw_(>`Fu&d}%yUG$%ym`ozi8-`amfkT11f=yjJ<2ZBd5CTI$nkl<_vUk@
zIQy)-b4oq^W?%y=$+UPzJ9TZZ5`MYcU%g?Uz;t1VkbBPwxWI-{m<+di?<C|B6}p9O
z4wq7qDt*{}&`J+<F!1T{likvz4U36k%E#(kG8Q+eCeVphx0LkaE-6JeR`lAAkVod(
zHfWAjKZ8kq&V7TIBQo!@KD%uZw>p8+euvxl=9?Ng%7Sx5%A0#ifh+dr7R~+v_;k$l
ze)+UMBv{mibZESb>BSwEg0dSxEXP3(G_Ermw&NM`%#(k)H@f7#@N5p^7c_Zz&*^>Q
z(giOuEXv-kNGGNM!<pAZl9;Jc4xG**$k|*Tj@oT0lf0rJWPv&J()Fcrp0vt!r6>U=
z)m7}PSD~wGc8&x$zjQImjKxkD#_uMoO&2*SO0VcTze#y@^#zAvH-2iR1ck$4P&F_-
zia(=?8z4+KXNo;c#ffa74OLY1*5f-=AWWe~qsY{4ysnK}RSu_6dYq;-UGUDj*l;+5
zF&@HPt*5G3Pltr93T7lk*tm45MPlW7Ls6?!Zc7G}q&;OlxtJzrKlx{gfutu_F!_#N
z9I0K`kOOVN7wkf{_JbqrV?%-!xhmDAI-a>)25eo_&-z+kyMna@#!U@tBXnU1uRqTr
zAQF482CdMOjo#I7+Zfgrl5A>}IqutD-o61m+Ae9B=VTy%mxSn}-Y}yD^7c8B*t!uX
z+R7s;^3+LMiAGK=cka}`TV1_D!Dk;r?CMWqBfZ73G&S%hyaQbSi1sU6H!Rh%pWKs(
z(4T0qQ1~O9E%xMHyd^z~XWA<3T66M;as6TYI*8W_mW-9L7mIjo{EK)wi%`D)*#yB-
zgIS0lI4}&WX-OuTG&zx4W<YtY^ESy=8$RRubX?2xB3tx{wojkHBizHD4#8Ahr3z(X
z!cqbJZ|O`!MM#JG3A@EdIi}Wt0)1Xs|7Xle3zMgZD{P0CktIrLb4c6m#0?w7YFm$*
R(}z4HDJCll6@L2qe*uA?T_XSh

literal 20491
zcmeIacUV)~`Yntb1yKPN6{RXDC`c0w9TY`+=tWBCAWeD+p@@J8h=@pU79jK%sx)aL
zHA-j+O?nNTPy^ozsN3yv-?Pu}KHv9z_qpc}9}HP*&UcP?ykop`u7p4(uoU?*nqx#n
zMC3BkcU6dp{zxVwIuvsB58yY_wyO!ik3)_sQW8Wt?R3+?i^Jx(6>bv|<%b^MF+2ji
zCbN^i??^;++KKS*(4ZNk8xfJ>Yni*Z)!ZQ11ksn{(ghInYyF4Ra}1r18HfHce*ZR^
z?!8!Lw~?&Z-O4wPxUd^*bkAO_Ci>%105hH4jxO@(SK*275L$jBIsv+2C@&&o7q{TO
zx4m~-;^?KZH?P0!k?(DL`<;|H3cdPZ)fxx<?^C!TV$W-LYjc!wZ=*3Z$$Px#YY+83
zlAvLK`y`)SeQ1`LG;r@)9@?8LUHo0ryRzV%-B~O5upx%xBhyws{q3j;fp<c{HGX;M
zlH2IHBk6s)Vky)J_=T?9z^s04Ri|3b^kH|Lm+fVpxvMYwom_s{I~Vt1s%?#OYCi;u
z8ndCpU4zeRWHa|}A@=aw{FaYSN=WM)thLW>FhsYS?cuXvh^}d>hPm_>O;WN;ekVcO
z_mtYioZay?B5Mm4@*Y?6tTn#9VarlvCu8}=-FmLC03C|;xa*;;M1L8W)xCEPxb;Oq
zotYvKmIxgPMgV0v4^0L9qzq_FB)}qY07{)9f)xOVfg(;6m~Gka|J7Ru+lTfxbN9A}
z_DWc}c^B@ye@uvFkmCKu&~D4#>S419Piaj;n4kI#5#Az%2$x-Y=NKso!Bx<Mwj1}C
zWE;@7yY&CTW%%`#9wdP*;$AyD?A0WrpFi~EifAY$<dsuj^6NC%4{=Ct2Kiv6y2=F6
zn3=8vZyRZO<TK+pP<ixHGDRGI)DU6&htp(=AT?KqCh?ZByL0V4#P6ih2eUmlfE%}5
zEW6TzS;W2#sg;KoX8>)p7yvo(V7A7E2DDX?Rh6LkcJZ_~Zykkl1jKp{2JjJSEL1#S
z@M*Hf4Jm$!3#QHpq*P;_o0mL#iDbuQf85tQ7oTgYW<o%!dx;ZjtZD#bOc?j!+pA1I
z$9ZbN-DH>M7HE6su<?tN61anvd;=S49E)E)1>KPkMwae?K$rYB2yatb;2zAUx4avj
zv^Ch<8gVTkyHrmKtPgq*k)&O2joWzua7_%z6;d@8--|&53>*U}7CL#MO>7O<+3$#8
z^zuSN2p4VH;Jo)Ht6vw=Mn}53u>&I;Of6E)t@@oDA;Bu$bY=&rfg%B~bQg|7!31_X
zQ2ZtHBFSj3>KOhouOOuD6qUqNV$in1h<Ddl!8Wfs>?T)NOq{qkdlIm$Tl&faUyil5
zZWsEX;xWKHD?@L>%XgMtXAB+<0K_Y{!!aB#pWvvs5`TvduI93}o+nUl<Rcn(axS*y
zWG?=hnYee4nC)=&*7hb)B`4x5No#VT$Coh32e><`Nby3X2fmegM;BODp8$;VvEItL
zk)osyLixov{21kxnK)iX_>g!95bSaDh1+>ngAx@u-dN8qmfg>ND_j$8xxfu`BXgoh
zUD|Y>_ILr9?(tRbFuTnW!m0f+P_!b>*s76490ODc&(E<qhBNkgJof(j{G>$uk2n|)
z;`n(rT)QNdC-%S|5=WQ*ob40q_A2W_s+YJIbJEa90B_dzL0^NO3V%0j03Ymy`%Y+X
zZs*iy41gHL*B>#05u`ixz(9gNi1mxTH*GIL$`^oj&Ul_J_LMkkKm6muPF8~au>Og4
zGCI@tE5ISHsZ$b9sei%)zeR~z3LX6Ux1fW{|LG#n8SDt{dffsxzXyPm^2a-OVnwC#
zUQfjHfd%mYSimG7Ve-izlOy)U#r{X$Q_$2y^^9oz8_!M6<wf%yzZWHc0z*hcwCC>R
zLMLN+E|1fenKxX~1FN#PM!e+TY@`Qh?F-SLv#`Qg7`K<Lvs#Ju;$OIdVA$j|xKXsx
zS>1yb^W=D;2iV6Of_-fNnW2>?FKfr%^Rc=!VSNd(SV2U!+!_DC2RpnCy?o__Ybkyn
z|E)6|6s6_-=`6Wx<7$)0^ySPo|EoPqLK(iBM=sGXuY2zvXCHWIpgriLOBADK^YLfu
zH(uCEZ(3i+&uPx~mhf*6ReQf9Icjli&obKeBDI|v?HBD@;q378!hH8hXAeqwFLQ)#
zOy`v2C^+uD)k?T1lz6v`d*K%6*H+NmB0pDc)+G$(16(@ebCGwIDa9N)$sNVPZ}vTF
zIucC11h)S^q8oNT&eL+uSVxY$eP}q6>(pVJ)yDCcW5+jdK8nsKc-eHsFE<RviE0}x
zK{dROy7=z)f?@J)?ndvK&~xJ_>>B{5-AYDDR;b3K?-fQaf@d#7;@Z4Y(_Iul^-VjS
zS~z1~d@B1&1xhHe$LkgX$!MlYH`hOH%mdIU`|~u@;I(#ZW^Fk)xbx}U0T@C&WCsj=
zlAZX%W$KrEZ2;ZjQ-BA%!37W@4>*Psx@9VGv$6P(>-bIdZ?3mQxb76wr=F?qYt&Gb
zzT@=%scR5#-_!2XgBA((R@KGmTrbBxOztw3#C*zM!{07y^AfFDY9o@6K9j&Xk}&co
z!nrm$izizku{|~#3Cx?n-d4VDmZV)=v)j08WbWcFq5bx+Z*Qk30V3py`s<SM6F`Mt
zt61U{^)^wImir~j^})I9p*ODhRxxZD9ncz8hbv0+{&J0Y8k*YEy9BM6tp|d66|yE@
z5Ynp`ZiunDYelwD2p%cY@1JWV15}hx<`?j+;yh!+QpYM6#dmGW{P^zv@-}fSeo@Zl
zEKbKYpI2D)633UnMh+y4)lVe+7p9PU6zEsEdcT<V#k&#heoA~1`D-3+5mJsw@hl|i
zSH?=_<V(K>+c&o<wQ$rT78tjBZ$)2enb3mJ5Q(wnYZ#af!pY+&IsqCnR-M&1EO%7Q
zPH0Pf!2-oGJ%7+#p=$G$500}EN5cmbV_{++7A1*cqL<nh=OpRAyn@0pA1uJwbr6i(
zX@iWApAmGhz}ssp;39$*<vzsuy)gTEAeV)<-D3@;Fs}mE<ZFqQPnlzHEU3{6w50<J
zR}_cx_s$RAf$LXY<E}8N20;J$3VgjHZj_oxcxXU>Z(h`!HtEF^x1U3-i}l=nz3>2&
z$%um^aX@McYZ27;cU316EQ#TQMM<bRyr-BieY5Q-qKm470tu-<r4Zwq0j%8+v$vGA
z*GRR534MU8qwJ^L{UqvbWALF$@e@A^!!%!?9`X$cN|c7eK#&`DFb*Fhrg+pMsr#Hg
zE7<1SB+%$$%NY&8Ws}i8U#W@woaC*7OR3<XZ+M8v>j0)dQUtTbnP2tNs`e01(x*Gw
zaWb1@1RRffn%XMd1#|=4HeH?BfHxW<qAI;MK7RN-j_Pht>Sp$<|MJ^VTtVB@cRaS#
z#JyirkCI>X5>rNGR8=A<FNyyCdLZ;2g^imU)p6o=aL>M-I#<M7pA9C{vZpf!!A@J0
zUf~8xM3%#uB&p-{m=FK5a%!&v>ZC`KckX&yr;65^uCzR_b>3o15$e6%&L-t2nv{Hd
zM6*t3s<7fOi!nvm#&24Mg{h-75<KRXyJ~jc^2Dn@KME6*@vYB&Qu5gX`YjU~={l!!
z@p)#|>oKLr?M}(T!CakJ^3mb^C#z`&CE()&=^(q<0E^7jhmsYbI8WFsPuM`Qtu3zF
zd+#2(-*A=#f!UvYl`<300!q$ULN3IyKp-OkenkFRR}16sTacmi>9)v(_ika=gf=_J
zGkq0C{>{CB6IkJpxA-<pRb4#!aOqK8aa}d%>@@_L<~CP9h~KrNw(LUu5u>)IL@wXb
zXhZyS-hru?Fp4&~2O(IOa!(a=#u)MJ7BlhUl`zvRv0jD)=oo~$9-b_kxj!uLW`&+F
z1PdL|LH#3vQm@fSj)zn8j&auShbn2zBbaLw6)6)e`JZa9W!bS*6j=jaMIbuBK{jK6
zWaYNF71u2EIIQOW_|v>e)Ezj6{H=y4dpwyfM<1Z_gu-P=&2H}9XS1mGPJjQGb@Y_*
zhkt*0z|HMMZ}1+?K>|gcG6pUJE3b`N>@FCvNN_r+sElU>5(Ao24Xu6>Qmy>&FP^VN
zhws`Q!1+1d#WzhIdCa_4r^N;C-0>~xH;S168%(@An&oh}cV+sGDwaeUNOZejNhB6L
zn@o@@ph8WpB`W}(f$X+IYLkXaf)UN943Y2va1rg@Y<CR-szL2ypauAo6hLAvagswo
zCqVu0aYfI`@cl>^NqhAa|D5fnIuL}CN?<lnGo!^ms<hvP@f*H2AlT`UA3^L>5s0HO
z75f)mAw-QM1~e?(g}rx<w&6z}_VWZ}P8;Pi(s_^1bnquI0C!Z}s{2Qm{sp%Kpn#~e
zErjhYzl(5Pq51`&`UX4^d+Iy>8*U8?!zIuEQQ!o)aa+Ta>vy6Qbw=cfI?nuw5kb-b
zR)2{t{)-gumjuvTp#nBm)4g_Yaeb6A7RaPKh4XNqA~-@_VONH^q!s2b(zss>fPYUS
z7t_1A7$;ENah~t~<*o)q(J8|Tpq%#QRGkv2AK;z_#wvDiC*;Bi|DTNbo(vHk+EZrp
zZ@v*|%>W?KUR&bolYd$KZ!TPD6Z2g6A{Xy2dG-%3$1#heqr|IOLjb8d;P8)JAiMAR
z#lXHTUR_iR(7{h;;yV|U2CCG5l+V6h0p&J=#ymamm_k6vi{M~@lLIPIh+YXDy!qQW
z?7!@IZz{QK_*o1+C#d0-?jScCz=ps47C!1WN8X`3)AvuMdI}Q~7@-Ad3L|=MlXpAy
zo8*1|#zS>vkhv8zXY8a|RX_)VNq*bqM?3)Pv(+I;+u`3sWB)~je?|0F;#osX$ta<6
z1B;j-+hIv_GN^wZ)L#qo8`+kuZR#ag`din3UGQ&95lHH9L-~El0BO?YL$-NP0^mpE
zumq;68v$ZxhWJmg{69AAzVX~7@z&}Sb{@6{F-r8-ls`0c-+TZ&aw`UnyoElTfL#@3
zuPwA~B>2o}t9`r9Om4@IHilaEfxTty81#4P9|29rf0Aq3BlxXZP0Ni4v(WWO4Y9U#
z3Q*whs_t*{Sta8znX_(vQ>Z<gVuJejHTXBlwLQr<pRbBQ^A~1)v$$_n7(7uHv+!m$
z>nN&<o?ySSS%BAD<JvA^T$B#ZJ};gCFd`iG5Pfs#zXhp552IDNB2-1nf<TuX*($tu
zbLVZiqF&WOdqrr~4}DWLKvDmm<lB*9qQW+c10tTHFB+rT{zHs)K3d|8g&9u`W#>{9
z2mA-<{$$KQi`w4+`hP{W{M;=3huXCB)@7c9q7yK|XT^*u=KiX*r^4<2jplG!P7aRT
z0=kR(2iwX(uTyl2hfMPq&HqIL|1X-xxPV*@Q0gy{dFb#5fTNb?f6he_Tui^ST`qwb
zD5+ELuu{|ahdb}pU)KF@e$m#%fcqI0Ua`hqCN$m;udmMTD*$!Tipu1_f?b%JU@mMI
z+xX!B&$l^cgzSe1@ab8x-$6@VAL8$oKxm5rN<e1lfc`t^yp6al@)#V%#4;?43<4LO
z0<WC6dA*wT7YzB8DBl2$<hK1c9QV<}1L6;CYNP$wMcIYA?g_LR7y1C*!9AT?GsU++
z2Ql(dV(6BrAE6CywD3|E=5nUk&kgR~YL=ZPv(f3C)hwVD8s-cALIO;E-i9nrH1H$P
z>CzteBwjnI4YZSt@J&f<MW>(=&^Fxb3++JEriVP0DeXYKRkiU$iQJ<FLikJC$UE;m
zIOvxSQU|m^|EDbQN4p2d9EC%lfq=fI{()TY!DggiYl=24;B!!SAzq*jkbsnSAHalw
zKBYeTH|RgSqmDQWwC9Ao0EDzrxKY09w1g-jQlN(<^tZ;pfxe7fy-fo!YJ1~1wm|+L
zDD;JXr%AmwU_^JHmHjL3V<5GgIzXQb5UUDE{@a^2;^38F3^d*G^=T_Ylg=~o8&>rx
zby@oZuET$Bi;(&5LZ2IjwcjWL^osv@StQM$k9}u|`#)b8?bSQ%@e>CGnI?GbKU~`D
z6WQN|B*pyi7RTR|o9k^tc*+)fc4uuxDseG%|AFlJrx|AK2a3cGq_!}X{MDPY-?3UH
zbW}_q0YUnb4yv_act;Mvz=Jsr+9ugty9pe;SF!IOy!SydbmnTLca;Ap_zi*fpYpzo
z;>z}jRi=L?`3p(@5t^QTXE;RO32h3b2u17PeUpJccA%GpfU_-#O&|DwBMrd2#O#o#
zJb*X-qC{1uGmM|na|@EdYGXl4Pc*(Wy5r9+&o4F5cZ9sWV@CY*ocG>t2u<FnD|5FF
zf;TXW+_?M~3Q_wI|28?La<rMQkjC{7CZO$e`9hv=r^ZyHaw!IHa~hIn#AXAh5`XX_
z>RACx{WCRXN9q0-%aBxo_b|2sk0n3WYyQz<=DbMaKqmZm%>KKu_GPw2%5wI|ouSA!
zFY8I>k>JNOUrqlUiU$cN?78|v2{pnHPSEDRAq%YF&X2_~O>C+Wx`J(G0=|GP{x9mv
z<xf28MUntCxoyM=l?9bA25S26bg7A=%#S|JvbYvd?C42Vf|p{YE@%UC-bB!@zmpmC
zrKmjO1Eo2&;=xAPKj8Q4kq1rC0$`tM)Ebd^Ho=lPeu4*c3mEnt2k>BDc<i*x>jSnK
zB`WT3if<7ngX2I4_Vt_M`Byx?rPk5X_L~R&cdGva_@m<gON;;3)qxcpsONvxf?)3d
zmD^uT^`Fqf_GPu#?uOLFtR@3{8n}00_RKG4`MvW=uM9cZ)%>ney6xrJHXvlPyRq4m
zS4sF_RgcUseF4yL?{}rDFxrFjgWq)ap0L;ITnR#++C7Y@Oe&XkA-Vs_f+&7`x~s?H
ze-eaa9vXRI_}ZxbPE)X`<lFhhZ?*{hQXYuC6{NT_@xW3eqxV>L&TAKLdAG}*6eK^o
zxQ|z9iY!NnyU5C6XeVu=&gaQP>W|4n{V+8fZ9L&^RX0DEz%?qZQ+Gw3S7!WXULO}r
z3R4n3&mwZ#l5~{6Q6997lO}w6tdKL#)A=#a)FY%hF#eN2N7%(KZaxEs;~pm5S1o8(
zfUc$o2B}coj@ocOT}FR+_VQ^<T91rn{6ut<*`CXW$GzqA*j3+@u<7+X+yz<YEyx7R
zwyK@<PAS8LyUQ%taWW!U_2mXLMVhY)$jSAUlQv(ny*<Z6-lDlWV?8fsg-Z`6CnWmx
zkFHm{nIc%p7gHx!wxLgpwWjXnMYo->ZOzmvLbrtE>Te(mMnCY~;Kn9`V`I!H^bL0C
zSg3`L?+?I{bm$9~UHzCt4bnO31*t&r5FIaiZm2yj5xFpV4RePd(w$xZ*=^DXdOv)&
z&)rU-y>uj{4Ql0~$r;enXXK`Jf3mK}fDo_E{rHvQG?o0aSF5_1@m=^*oH0urz7L98
z*~E)T!yXiltQ>~~7YpZF8bLEpbPt{-)dr2%n<m8b%WJWQRmkXcG6XsHXK7xG9*XA2
z$@TdD9Ek<SqN&;H94N{^r{KLhsnr5!%WJKZOJ+^nkgRCmmJc4%-7?p0J0Q(AoTYE+
zf=3{rm2SQw?Lx1e1F#2IJ-oz`sKh9q>pDi2u>lQ1nisMv*)_jFr(aP~cg|@0IG=8c
zE3_)oR7;P)ICvi9jXjq`d62~fe$k{MByVB<vq1Ihx!U6+{J9Su@H&*;gAIG^vls96
z)Ftq3CkuK`GnLrHrc*(EaZr@%{cKU?6v>)qzEmvZ`sx<(K^Xl<xN}q_1C6FW$SPf-
zUu)&T6~b>ifBo!O<gIL;;KSQjyBm3Pn2jAPX(v<92Re_;jDt)x*UXfU%vd!to=3fW
zQcUHP2ENg9ZQ+f3SYK(9Y@VzHjer;VT4j7y3^Rt@UbZhy^zwD)6^&&3=mNcB$hY-Z
z9C?J<U0PG`cBK)nNb9dlEOt9Hn;G3C#nl{+t))I*C)!ebg-Mglev)FS{3dDhwrXHx
z04{Mtx>h``-E5ESVTGZWW2*_mTe$s?TRtOiqD<d=kO_Oo#16JUz2Cx^3~u>Y$ieco
z%S5oY9hC>?P(b1AD6C(3Ux!_pt4jNFfwoAT<>BZjmzWnZK_piXrw(NJ5=Z!EIjAr@
zL`aVyWKQG8y6UZ}#KM?0RiDGj*7TN-K5PgzEf?ttVkXnitdr}k{qU6|$i)rO?+oLj
z4s&*3&d1DdJ2QMMH2p)bZv~iFUGKDTx}1G-v(nU23p61~(pm>an##>TQLrC72E(X0
z4T5Qnuxwy{+gR!%ItLf<0;e&mo*ah|Wh}o%S1Iu2B6qu@d#x<x#8Sni1BXSX%g4vP
z>PCymxfty5P5sk1Mn0W|oOLmx&cDx~o%C@;y0)jWep(9Az-+_etDb<Z)UEGsvhlNo
z$1_S#gT4Wl^T$=jJt*|n0H>*N?%6=-*|2v;u=}R<2KIjRxWa}TPwFNVc#a04IBX5A
zF61C%q{jOi)PnKWnv|+NM9U#XiFAb&Ord5S3v9zTH7yp$*ebPI(CeQtgZtCRs~t<(
zJOauP>VuU{6@nA>2kQW>DjB^-B(k&;d$A;6qm|u9+rg4Opci5md|S{pR21v<IB@f|
z|GP`^AE}Q&i#!2qbxg@`@e=PVs?*A_#I3#+60nXq5unzXgCfkS4=+uQ&0M89v01bb
z<{`ycb1*NM^K$)hYc=QMb%!q&2_n@M0WsD%+eT#~x36cRJ|DCVuODAwW0GpX6*NDs
z896FEXOXLUONpY}AJu(BS2Gq^CFeHYnl-G;{zTVu2(&`z0a^3AnALJww>EVeiZM3w
z)UpZDg5nZ^my2sWtPv6xUOpii?KlXhP<Hra)H-)~+b(vFnkaGIi03{es^wX9iFI>>
zgf$l|gRJFLaZj`;b|xc(BpB5eHzCQ<+P?Ax16mS!IQ?oR%pe@h#aBPE!Bj0fi0E=Z
zc?Jefi|?FvjaZDG<)iy_HkIPqx$-lxmP%bTiPY&Tu?dsQA{`o@c7p`wC3H8fw7~WS
z+~v3dgxIWO<_1Md_FV|~lOjWd!8`LD#ZgBxM0T8R<&$)wQIOn61~;`%LG(J`9)5+e
zcLudL+API-W@e1aG!9X6rRI>5vh}Dayqc)X^ka9PT#U^{<wdbC?eb|<OF87OS<KMM
zxLp#8zdz=vLbL03&)KdU-PckRFgvCi--TnH)S|)f+t)6(Vs3NCY`!~(lW9HoI)hDD
z<CAmA5FML&Rk=$B3GyD?O}th%OA${wpOe`*zj<-mZEwnX3!%r*pVXmvWytn9A$M@p
zAFpw@oKKZ`o7!7YXD;U5Js#3_Wy{y|sq>Acn#;(mSSQq^<H(NjA%k~QgDbRDlgF?5
zkV)mDlCaBPcVBt2ZGK~X!a1TM(v09wc=ljL+R20-Lu+qOvd~p`4=deD()zLSNeX{=
z)8)Lu=i&U+_*30Y^?3FC)Xtp5Q@-Wv3hgz;%F+hcLU+6{j!X(GU5cW8nVxGks9V@c
z(}-{i-bVJc>hc{9trq)RrVc*U;C&+OJ9>S3<#mg?$PIYqglosk$i@x{&_jyZ{tYjh
z_{fH>F++SDe6B_)F!^)hLYYZfsLu6r3Y{Venqs)Z{hPgspvpDX78hrIsNIW+G1tiH
zE_xn;dGksgbYHaY*IcT7?ISFVdC5}IN*h8odDfCfA9Q-QWc>Bb`|@@l2m^CI9P&F~
zKT$v91xDE+P<xpXOs6EnTn{yjL7zkmDL1)z>f0~+#q-(pxUN>lo$Om{I<hNNL#S7j
z#UUl()448?u!$?m)!<ppCd+&(u9@)J%)}$JmtRVhmu6B;Cq}yFx{EsxPxUZcI9*~i
zi5bmFk}aOyW;+D+N4a=P4V9DS>m~g)T6?Qd_G=pH6`bU9QA>KgM+DirN4CbAjVccj
zh7}%2Tf^5WaM<eOz2?uJejyxWV?BX*KMQ|}-KrY-N+xK5$yu0!Y1;I#Qg6&2-tP>;
z{;*TiKjO$$!6V|_MTUwaVXzrJnTlj67oHdJr3&ray-n%3@xlvR_^KIs@tV1K{Y+5L
ztRuWeBo>zxuA88HKa0N?Bp&~iYV6qwR_$Wfc8|AUu@iplNi-=5^FtL>Ii95>JftlR
zwClWZOE1<0{c|eH^cIeobB$EI=h!AQP0QcsVgv;PwdqSTg@uOnY&GdFe?!rj`W9P}
zznV|MNvmWjL(!UEE9k>iG7FPy51&0ev*k6Q!7vk6naPbFvhTY<=hfeuSz4b1A9)0#
z(A(KXEovuPm$p!i()zY@+cPw)u`srLiYya24MKL+bCzapezJs(vh9#jUsyJm94+;j
zept|*IL~&{U4o`FRCw;%APpocN`TR0BVl8@a{BAEv;A9p2^uy^W0Ud%Tj(Mat5JEu
zyX^s^KD(3OkbJI3WQxAh&CD<ZOS#eBWkq+cE@HWTt~*<xYNTJCsPTDR``4*aGyR5}
zIFYmXoYS-uH%fZNG^t=|;ZL*l*%!i`>e1U@MI!__hm>~)x|^6=bbR(TnDf!vD2G&i
zN5OR?qd!2TPq^N*xA^sX56aC^G#4r7>y(Ua)TtEjG+~CX=ZMn_Kbj_P@Yf3GPpJ9Q
zsG#d_-(hM89>_zz74?uZd);r<qWWclE#FXV(nVI0i8bY=<%uRXZ5*cmO6XUy9j)Sk
zb$(kPu3@E<NV(L->t;`RWjEVeq9lvNqEb;pP!lNoeu4HL$@8ihKUxe=vAlnGf~;5T
zN&QP6<|v`EsQ7fqxw)=vk=;Be4k?;v=L9RS@UJ~<KnPv<S~PuvLhK=%9I9A;!>O1r
zvLqtC^pT`ZBOxHexR(A@Y9{(FHF)+F^Kps>P9ua!a6gGo$%lx=twXYkJm!Lxk($IB
z$`RY01Y^`1q}i`wmqW>tR+l)amVFv=?x*5d&amHEGLoRT?APFp8qGmx>OikL$79li
zd4o3_HU`J)BY6cjN&|Uq5}XV!gsV2V<fER$-wvr8Au59XIJsuhIe8-?9N}~2#x}%j
z1;E@R$-FapE0=i^LBXOtN`j}gTr!g62@aWGadY1`=KjzUA9Og}hk986ITfVW(|Bk`
zX|y=@S|t1|%$c1s<+-Y0k>ERSOk8}g0|_Z3zrD3}<TG=bNuBeJ$zi>moi6)iXkWf3
z6iL0NHv4S6m9jj2{aJG6{jEo*HEV;}qrU{SA)1F*EnhO~H#3jB@{U5*!mk0g$r@Qg
zLMUN?c(aW<&kbPO!jaA<u}X9A#VC{lXiB<T9X#3o7J;*@2`|ibipSOR_0q6zY+HI)
z;>Saru5)Fi=a^py+<MC(qP##zV(rld&02VPR2GGBX@vJ(PZKnw4s2^J`ReZixKE;?
zznkgW(^7xqO}|{b8rN~$j0S@~hxc;cawm$eEYmHl*oB3-0+diw)2P6pmvrsT-Q{AZ
zlYNpjWAUCq2%r|2jMROm1|6!5pc|?T4V)7``~aNtZbDriC+NiJJ+U!hwcS%|t=39&
z%sVKv!Nkdj|B{UHPkE<L1RpBaV3$~WXc}`B{yc8HbuHscZ?=woR)}PA;isdd+#%G|
zMhb&YBvL>|nwP%vZeB-xTBPmGEWcQi(w7Jy*<w2sN<66}P(D=uq)Z+a-{(;vn-Tmf
za2?J=$H^=wF|ST{yHaI?swy7DnC;MS2!>9y@(GZK<W3sXNSZoSf~)-XY;-GRr|;`p
z#P^D4ix2O!r7Dxi(n@dhvUTKE4sJqwiwDPme23ocRG;HwKB5^sR%=Sf*JT~;t?1vX
z%f4{OPj9*sV>tJ~7S2@(OY`1lU7)W1K2kPOcc_wJf!@?M#*jV!>&F1zm6*)lH8tJq
zFXgE1JBv(Oc7)kXCM=<h7w3g+yPg~CY<I5?7c)M<l{YLX3J^B53AWdX?l(;bvQ-^=
zuR6hitz;c{!6~{i^)Z$P)df^&U0(L+Ww;>K(K0PsR+*L8(~YrNsS7L(N#GsYR7*X~
zV8KmYsw(_~Sn@`WYM@|>n|J?5Sc87~Wfr8=d6rPeF$c{U!51GF(hux5)I%(8JOX3X
zK+|3KcA-_y6tWm;TA;-k>`4LG?N}hj^cpxK`rb;hHLk$NSof9ZpuDfqR_PV9pf;b)
zug*JUgl&7tO8q2450cJt?t1<`u>8VEHlx1&rq!Vn?Q(+!Dv6JPeIHd(h>OnZg1yB3
zv<9yg9S0|xEF_&W*UeqOgB|S13Ys&w6T)G(5)gN&LX;DFzD2bpvf4Ng%mjVwlI_lG
ztlm!5EYV=sw5VZUxa6O)U_=rpS{BAC26RTUScgs7<SW(;G(EiXvcZ@|@n;)7<u<^U
zS43GyEjO#)6qcxTZsz?Am4j2Peydtn4#e`Wk%u(Mb7{GI$_;&BgeSW`(qaiN+dm<=
zYolCg$!m*TOQ%6ZS6;-)p&IGC?8m_?HM1L3`yJh6Akw_NLKOzd@7RhAW5;mE_wbd8
zzK;CtO9(x=c&;R!nzD2E{Tua^Acv%54B-I_MuBhYJ^J6St_Ke#+3gGFeijliQCn+y
zA`{rr{X20*xuD#9&7JQEjjf=sb9GhaK_IhhS>Z~F>3GipuaXV?w+ysSBGe(7p2g<+
zbNg>+corQ~Eb!L$t_6->cCjLBo7Pm8q;-B$5?F(88&=?S1b7j>D4j<C?BF1PA_)n+
zI6=igR^<EbY+-oi0`S7HmidhfH_&^E^v~_#7rc$Bfv-LR-y`(iS(19#3Nu(9HuH88
zvxo=VUM*stUQajwem!A3n=q(kDb=x{iu`$Up~^Re10s0h_v0PJJF~saUClEs)iN6h
zj}{qmFx(%El?Fb0wSUa?5on-qx9#SR3}t2I3E$>}wI=LAE|n!ndn4X#=me;ES54-M
zVT3Hiu(cZx1+}`OtG%>54_$Ie#OW$R?%gPt`AB!A!{Uvq@Jn#fq3;Vc1HSE<o1{M~
zz9A0E(HC#QcL;AP8LhBA56^iIbyixQwJK0eJ!>z>XZki->^(Q#aHH(B$D7>h4W(pW
zvx)HTkL`94m_aA^im5K99YP_NsV9?rp2%F)OA?ln%zeT&v!#)=W!o4{LOpx<8Jn~J
zct?HFK-i*Uw~SFdGhPISUA~W%HEIA~Oa^nWSR(fHNj9-$y8TXXN}cewJAs%u&We{3
ztu&!LbcS?tD;nvt0dl@86oE6M=@ic!6-mQJ`&bo7RqlLzeVa!!kv)7+pEI?zAPg$8
zlqfJ~_c^EoEx}){67zwqp={-uDc2J2Lc68!oX%xgrd;!w!jh;^O3NPGyENr5CR&eR
z2ZSLv`?EQ@7}*yn&1iL{H$Ge}*=_c^rS{o@eEsqc%K7m%Q~bjG_((XWt3I{ut58j=
ztp!e1<*s}EVWq14)<_H5Q^I6V93KfLbsvJ?&W~A#D4<2A9h7Pce8<ZkQMIX#c^TNK
z2qw7c&{@Vrk+LUJAM<?c(77Cx!5c&tP?lqpAYR64J9WP&OFT`q0`dO%8q6`8Ht29?
z*ZPTEy&8D)b~pfJ5vkyv)blw5FOO3%H3>CE+diBxQh4NIBs0g~NoM9Hhe%Q_uns0;
zvvPzy?{6&(kJb}4z+b;rH7qi1R~5KMQP)SyubJp6V1DVvkgU$*5j+2h9V4;ppw3UV
z<o=L&(Rb~Iu`dVpu}+U>F*j!qRph75K8t~EZoGE6an{5_&>gb#go$TiBXPLsnRi-+
zc&v?`jPP;R3-{mix_NVh=0ExSc^80hJQ-+*^c{Y{dTxymI#TFd3y?P(wEgmpNl3Y_
z;iv7M9+jCQ;#8Rt<&#oTtPY84^wtVu;8=d9=dp8R?c+6;B$n2tkR|Q->-17|xgx6{
z#wBv;sM5YP2<TxyxF<2|cIU8wc!i%!qe3c0Jd}iGkPFGwY&Fx*;}y29U<Kn~thpDu
zGh2`W&D-w)%nI!%l&#sFHxt#^rd4W*XAUEdSq(w@{9d_=<{LUcj=_2tB&COHM6-<S
z35Im(1>c6Uj?s3B_H^1Ne-NP<{0vL=Z3_>kzEFhR8mmKlDb!iKmyoYkn^NC9bmivx
zvzlQnYU``^%CuP#mQ(kuD&@t}^vo(i*otjuo3Ev#5!U)?Vf6>0z>EAf(@QVrT(cRB
zS$rgv-!YTM8GhboRAKh9F0ji+yfNc)P;Pt1d(CLb>9alc>?4%HQk6(p|KRIR#`|8p
zcb>jVMrnL(SZz*phu4aR7g|*(sxDQiOckjrUiL>9L?$_WRL*%2S_-ayUn|$a*u^QE
zqQbA)pKC=qckB$+Vr>>De-hh*q8V*`H@H1$`Sh#I!iq#fAVYl^t2jj~TNYdeXfe*%
zjJ?e(?Y43Zs>R2?HR6nVb4yF}p~g6Frcmqi50&XWg)k1QTRffM0*1(vRMCd95d(_~
zht=W!*)3T_*VMFxBK)&?BxoA%kprcoU72p!W);X5^<6oRw`2%^OLKi)sFk&-0ar1-
z5rnFP+DvK8m)h~1U{4osXl&N8M#k}*((2}hD;qXfe!f31u{09t=bZ;;S)90`GBnH0
z*afc3w2CIJojJ^#3xahBv)Vpj>v;U8wWUMBR7|B4SMN_#trnoI7Z<$lZ{(a?2N0y1
zCZ;hGW_M+X-vUXZQf`)(yJk{cL)rq1;SC)Kx3Rl4%-dw!tYgS|9bU7w#xN1?{Pygm
z@C|#hQO3^h?sqssZ4r8t{|$X7xR8_kopgGcYDLlS0!%o)o?d;TBK*=T$^EAX0VSwu
z@)VcLc(Jt~;vW!)XY}6j(F1(v<Z{Ss`pJ(Kq%14p|M>p>(-&G<tn`-WOw3Q72Zr;%
z!i24Eck+IJs^HQ&=N;&M$Z)sZKNsIt+gQ4kysUk>dxLbk@G}rT9P{YAXYr<?_pa=f
zZPAod-DR%t-%PT2(HA&8M6Y#E#hY_byscY?x2}w}ICnw|K}`y27pBND4;?i<cB4x*
zk!yAQQx`GybhKZFuem@jxl3ya%hW2v7y?7AUboQ6Iq$CUW~+F?$W_paGyc|1dk#x3
z>wzUn6`hCJ@vlKHHxrH&UD$>RZkRxW`D=E(+!72Xy3F<%_cpyV!iEw*oVC?Q&HG&v
zt$S7N<%Qp2ho3ClUUSsT*|#@g);4~{e#;8T&_>L(HebXNMskeDs(2%6uHg(bt@&}!
zO>>$MV)(H7F+yya<ha)HG})!aNa+Pbu{2SYn>%2vLf>H(k-(HTd!P}ckH4u1ACT8n
z(0QNHEchwHkF|Kdlv}lF0MpXIG0t&=J^hM@Hy3r%^@Y{l=2_ZiTHet((TB}FxlZN|
zLv#UV4iFWJ8>*??Rw5tm{Oqr{WtuqjTMS-O7JSVkz#vY4_qlD7laOgdw$+8`_O;PT
zWY@7v$job@FVCC7SH?Z-&1~F9WZ#$vAzIa>W>ppKd_if`$@y0lB|DtlLstZ`VHv|E
zmXxf>zMDaZdoFVYCV_^8trUdu$h7!(x>@1b!jKcxFLhv>rSOW(7<Lk?mtB<}9(fh6
z$EQ`QD5H%hO8B<Zr&HQI5{&-1I;3T$sM_&W8jquxByv2RO$_!m;$5hWEpYA_kG9G%
zuj%%9VWh_BAvk>tdK&ubu9sf#iN^X&zS3o|?$uQ?EDuxHWJ#v4dFgFhwKmn}iJd@`
z?he>1N=x62;dRRvS_%yk*-Qu1?uIP^E6z%Gc&^o`cgqz26}q{BrR11F8LCLDzP^?^
z`f)vNg{#@u%aw*&nkJje<CUp&^_ngLecdKUn^#q-sgAr-PtwbqV~f%nPfje*<$0aE
zIJB6c)>=DJk9qIH(cm{C$wdE<L57Q^>8>CxDWiUql}z%$E$G*n75~I9aGh|otW-5p
zylyv1Dzv?~o|`m$c3QNawJ7t+P<G}054S8Lu8?zD_|B3klCU3<E7uS-FMUm3gcnW}
zjnXa0Va8R8C;Pj=wpTo0T&zj#wlePz<9Ac6iGe2x#h5v{Cr+Q}OJpwWEUgtK=eyjB
zFtc}pG7ZU~mRfW`^aBFH_Qh+{&G@dq<&@~F>ou<|gl!^883pvGh(`|7UR?^Cc^wsu
z`TC+PY_QG)TyD96$BbL_l;~QdlU2bA8`k`KvRn&EEsY<7t%T=Gr7bb?FP~TH_J?6>
z3{qYfb~k0ukB>Qy>%ASKb{)A4w6xZH;dpt^!dkvld&u;xdF>O;aV$e=^JQj&>SkgP
z{B@(k=sY1kj4{JC2$3LdU+mJPtx-VN7vN~tQ^dj^eXfgFleALz3khV!0<u+I>pUEb
zw9q84)`uY;Q7P#@>F~R6H)@2}mEXAaCOW@}w`We#3XWov4W5b3D~}2nE+7vYGYiVs
zuFV9x3<D_jnJ3pKCy!qS9*DL+iLESMwqyvBM?dw98q*<-Ckb934;;gtn^s#s)3T<9
zd<(vmI)mcqxyUayrhK@itRa6bI0ogPNC8*mIuoTYaQ0nSjcR;1iSWenqQD!KjbvYP
zXuP6|xphG)#aHEu%HHQu#q;G-&s5>+)$#1kH^Qu0ISe7aIbGwB0Kr`Hu%J#z?JSIb
z!gt&z%T30`YxEBSUMXnGn3MWSU@p>+JZ0|!<*#Z%G$;g8J?=7~x%*QD{Cl$<BA4?}
ze#7CtGfpjrlO@f((=>_;?aVDrAu~evr?d@4jA;_unc6GjONHE?gkubPbmh|oQ%hR>
z7{B^rj;q(%c8~;bT5DT!N%Dknl$+GHW%|3{g2lK5$&n#1$6d4aFDorBg7?pIQx{6k
zpG%i0f4R5?3D`7&_8gPZraXL^?!XxT*~a64*O>mfu`oQu0UIV#7r?<JKJh<xiKkrw
z{xXju;jzF!GN#`;@LURdk8Q7ND{RI4!g98B$?nv@xYS#MOKoBZE_J7&e^Z})-=(Ov
z?sZTff$#bI2-RI+Nj2_?^^A=sX`p7QHY=ZQdkLMnn^^D+xqXJF<z|rKp0lg=6tH1a
zwfXRU7Y8ju@8Og7wsQCI-nqo1(2t*eyR#Jvgbgjyyzx6*o0<<TA3Uh`irh|dU4BzP
z;hF3^A@-iGx_j<Ov_R1Y7twTXVSYA<(Q%XYbi)y{5dT%l+YIt?I<ha>{S!rQuqXJ~
zOT2m6e*@A$LtW^9Gc8E-N>t6tNKzNW`N^hrcGuyHnn5Oe&x6_H3M;A0TYz2$F8!7Y
zQr03&t;97wkN_%82WP}&M)-ZgeJYFhRuSgKaFMNHiwIBOfwpONtr{P-S4Qz{@<N?6
znA%T{ZkN8tXAR$6miCI)ETr-LbMEIUP&wR<X*&zMeKT@Cm#z5{7xNK#>>4zzCf)^U
z*<@`Q-m6xrR!myvkHdRRJsuxx>8_o5ymr_Z<qwI{{j@+0>2R>!vjLRorE2)A;_)l=
z!1jsQcQVw}3|I9!zVvk^tdYmCKp}4Q@KA%$h89#a*J|CUlCHCzl?qPURevmbxL|U5
zT4HWt+#{jW^s>drRIn#MWl|@@`P4vo!Pcqy6F@~iv&E?%3RF2wb!+qE4V_f;<izA*
zIs@4(=m!fsG@XlHC6^YCy05D;FJg_`_B_pAsQr%X28pCQLKfdYeknjs{^_NB<PFzQ
zgufW%lbFAZ;k0X(od#{1T6UUDYQ7|{Vw1Fp^hvo&4D?!$W}#Kn^{XEz&^f_ti|NZb
zke>O5_kwV}_=3b<e?F7ecDm}vDCYt$)$+qKP(Nc26=5I#3`J9;$pRNbU+sZNLgf?o
zXU2{!ts)xI!hz^6;`yW$_m9wtMX$lEdG&z-DNjv%%No0E3$045IKD}O5nPc6ic?~7
z6Kn-<Cn+e6B)dpt^qggTY4M;+I%1C4Lk-sKEU7r)|4#SwVA*K1^Rkm}Rwy#Ub?tpI
zS2t471{xJof9&O~$Q=bzbIxIpgt<?T_yqECY&h{|)}qNy+iKWpz?nd3z?lwjs=jpC
z8lAP0vBo}PTS9#qzn@_)zpbEV-rm<WAbMgqQ`EjxJLQoF(`XP*@cxv{12JD4O<oy<
z=`Pq_rQD=6!Av{Cgdrv`$2CsT(%7w2z37G3kpCUs&cV{ra_4gUP!W&9ip-uO`+To<
zi<UVo%Q=&-KoL&{(Zn@&XO*aQ<!ga6<7J!4AQLHG=R5Z@d`StgjV8^R_wzf(KHZzV
zuHni-GWo3K%gn<rnCd|0`*L5*1-t9zOsQ07S+m@`fuP1OfO3jG<aA{*A`x$W!ZEQK
z8#yM<KnI38$Di!5j-;gofRagd$c923_6VHI1Z3C|(}4=pk{QQ49flqyP4Zy7VkvQB
zV9+qm)`N|bMT_7^8h{^7Sdy?ovo7KEj(rgN%2#ULjQE6&X(s2zWz1Jmx~Q^BGtWr3
z<hnl1d{v$kYUEh(xgoIYxWBw#EbEX^vy6>`q@n(@wENj46j<fhuoimwD7;9o32@0(
zbu+FqU`Gu=<DLJo-GbV@Uond)oPB|@>M^XjoRj%l1CH}y6w(6D)4*rC$3{=sE(V&3
z_j7~=OpkvMU~%P>*-oc9udhLIb>WUey4Odi*Incp0G~W5oe#a{Ry<+eSi?HDYP#_u
z9eM7(ogS!%T8V|}Li;8=gl;^`MF=~mTKPn@iSKRB7bIyBb;oPj>&c43zU-JH6ImM`
z<H0QbxeH?IgI%*36p178An-MnD~BW7EQL^gEzWwSk5`-vE}As^Au%&F&#f%Q(kfq&
z1b60`#eV>{{+e-k?`_7tz^W>O+$7tyQ@dwC){6YImdCMS`JYuI6m`>aD;O0+o~wz-
zpybk6vm(}xuZ3Bi3$@Ce4>T9t(kok|8#Ab3PeKW#p7CWBNVDl52TqQzJWcS+5a}Fd
zFJ<o2HgmUl<rnV|sc4BzuJO>XhcHhj%6<j%mI-{X`d_y>zjc#;eqQ&_ZL_r+zd4z-
m!AP(@hNdL%-Oas2H$O~2cP3(Q1^$2^k&GnxZjOY(<NpUozK@mw

diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index 83e17b4f27..c858ca0940 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -25,12 +25,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
 [Currently Available USER-INTEL Styles:]
 
 Angle Styles: charmm, harmonic :ulb,l
-Bond Styles: fene, harmonic :l
+Bond Styles: fene, fourier, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
 Fixes: nve, npt, nvt, nvt/sllod :l
 Improper Styles: cvff, harmonic :l
 Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
-buck, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
+buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
 lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
 sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
@@ -82,6 +82,10 @@ this order :l
 The {newton} setting applies to all atoms, not just atoms shared
 between MPI tasks :l
 Vectorization can change the order for adding pairwise forces :l
+Unless specified otherwise at build time, the random number
+generator for dissipative particle dynamics uses the Mersenne
+Twister generator, which should be more robust than the standard
+generator :l
 :ule
 
 The precision mode (described below) used with the USER-INTEL
diff --git a/doc/src/dihedral_fourier.txt b/doc/src/dihedral_fourier.txt
index da892b59da..0accbb22bf 100644
--- a/doc/src/dihedral_fourier.txt
+++ b/doc/src/dihedral_fourier.txt
@@ -7,6 +7,7 @@
 :line
 
 dihedral_style fourier command :h3
+dihedral_style fourier/intel command :h3
 dihedral_style fourier/omp command :h3
 
 [Syntax:]
diff --git a/doc/src/pair_dpd.txt b/doc/src/pair_dpd.txt
index 8d194bb092..9e29e93430 100644
--- a/doc/src/pair_dpd.txt
+++ b/doc/src/pair_dpd.txt
@@ -8,6 +8,7 @@
 
 pair_style dpd command :h3
 pair_style dpd/gpu command :h3
+pair_style dpd/intel command :h3
 pair_style dpd/omp command :h3
 pair_style dpd/tstat command :h3
 pair_style dpd/tstat/gpu command :h3
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index 3b84446057..35cde38f15 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -30,14 +30,15 @@ be added or changed in the Makefile depending on the version:
 
 2017 update 2         - No changes needed
 2017 updates 3 or 4   - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
-2018 or newer         - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
+2018 initial release  - Use -xCOMMON-AVX512 and not -xHost or -xCORE-AVX512
+2018u1 or newer       - Use -xHost or -xCORE-AVX512 and -qopt-zmm-usage=high 
 
 -----------------------------------------------------------------------------
 
 When using the suffix command with "intel", intel styles will be used if they
 exist. If the suffix command is used with "hybrid intel omp" and the USER-OMP 
-USER-OMP styles will be used whenever USER-INTEL styles are not available. This
-allow for running most styles in LAMMPS with threading.
+package is installed, USER-OMP styles will be used whenever USER-INTEL styles
+are not available. This allows running most styles in LAMMPS with threading.
 
 -----------------------------------------------------------------------------
 
@@ -52,6 +53,15 @@ need to be changed.
 
 -----------------------------------------------------------------------------
 
+The random number generator for Dissipative Particle Dynamics (DPD) in the 
+Intel package uses the Mersenne Twister pseudorandom number generator as 
+implemented in the Intel Math Kernel Library (MKL). This generator is faster
+and more robust with a significantly longer period than the default DPD
+generator. However, if MKL is not installed, the standard random number
+generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG".
+
+-----------------------------------------------------------------------------
+
 In order to use offload to Intel(R) Xeon Phi(TM) coprocessors, the flag 
 -DLMP_INTEL_OFFLOAD should be set in the Makefile. Offload requires the use of 
 Intel compilers.
diff --git a/src/USER-INTEL/TEST/README b/src/USER-INTEL/TEST/README
index 434189dd26..62602d5920 100644
--- a/src/USER-INTEL/TEST/README
+++ b/src/USER-INTEL/TEST/README
@@ -9,6 +9,7 @@
 # in.intel.tersoff -    Silicon benchmark with Tersoff
 # in.intel.water -      Coarse-grain water benchmark using Stillinger-Weber
 # in.intel.airebo -     Polyethelene benchmark with AIREBO
+# in.intel.dpd -        Dissipative Particle Dynamics
 #
 #############################################################################
 
@@ -16,16 +17,17 @@
 # Expected Timesteps/second with turbo on and HT enabled, LAMMPS June-2017
 #  - Compiled w/ Intel Parallel Studio 2017u2 and Makefile.intel_cpu_intelmpi
 #
-#                     Xeon E5-2697v4     Xeon Phi 7250
+#                     Xeon E5-2697v4     Xeon Phi 7250    Xeon Gold 6148
 #                    
-# in.intel.lj -            199.5               282.3
-# in.intel.rhodo -          12.4                17.5
-# in.intel.lc -	            19.0                25.7
-# in.intel.eam -            59.4                92.8
-# in.intel.sw -	           132.4               161.9
-# in.intel.tersoff -        83.3               101.1
-# in.intel.water -          53.4                90.3
-# in.intel.airebo -          7.3                11.8
+# in.intel.lj -            199.5               282.3            317.3
+# in.intel.rhodo -          12.4                17.5             24.4
+# in.intel.lc -	            19.0                25.7             26.8
+# in.intel.eam -            59.4                92.8            105.6 
+# in.intel.sw -	           132.4               161.9            213.8
+# in.intel.tersoff -        83.3               101.1            109.6
+# in.intel.water -          53.4                90.3            105.5
+# in.intel.airebo -          7.3                11.8             17.6
+# in.intel.dpd -            74.5               100.4            148.1
 #
 #############################################################################
 
diff --git a/src/USER-INTEL/TEST/in.intel.dpd b/src/USER-INTEL/TEST/in.intel.dpd
new file mode 100644
index 0000000000..e257d91f84
--- /dev/null
+++ b/src/USER-INTEL/TEST/in.intel.dpd
@@ -0,0 +1,48 @@
+# DPD benchmark
+# Run-control settings (each comment names the setting)
+variable        N index on      # Newton Setting
+variable	w index 10	# Warmup Timesteps
+variable	t index 4000	# Main Run Timesteps
+variable	m index 1	# Main Run Timestep Multiplier
+variable	n index 0	# Use NUMA Mapping for Multi-Node
+variable	p index 0	# Use Power Measurement
+# Problem size multipliers for the three box edges
+variable	x index 4
+variable	y index 2
+variable	z index 2
+# Derived values: box edges (20 times each multiplier) and main-run step count
+variable	xx equal 20*$x
+variable	yy equal 20*$y
+variable	zz equal 20*$z
+variable	rr equal floor($t*$m)
+
+newton          $N
+if "$n > 0"	then "processors * * * grid numa"
+
+units		lj
+atom_style	atomic
+comm_modify     mode single vel yes
+# Single atom type of mass 1.0 created on an fcc lattice filling the box
+lattice		fcc 3.0
+region		box block 0 ${xx} 0 ${yy} 0 ${zz}
+create_box	1 box
+create_atoms	1 box
+mass		1 1.0
+
+velocity	all create 1.0 87287 loop geom
+
+pair_style	dpd 1.0 1.0 928948
+pair_coeff	1 1 25.0 4.5
+
+neighbor	0.5 bin
+neigh_modify    delay 0 every 1
+
+fix		1 all nve
+timestep	0.04
+
+thermo			1000
+# Switch the run style only when power measurement was requested
+if "$p > 0"		then "run_style verlet/power"
+# Optional warmup run, followed by the main run
+if "$w > 0"		then "run $w"
+run    	 ${rr}
diff --git a/src/USER-INTEL/dihedral_fourier_intel.cpp b/src/USER-INTEL/dihedral_fourier_intel.cpp
new file mode 100644
index 0000000000..805ffc0e25
--- /dev/null
+++ b/src/USER-INTEL/dihedral_fourier_intel.cpp
@@ -0,0 +1,441 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include "dihedral_fourier_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "memory.h"
+#include "neighbor.h"
+#include "domain.h"
+#include "force.h"
+#include "pair.h"
+#include "update.h"
+#include "error.h"
+
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define PTOLERANCE (flt_t)1.05   // eval() warns when c exceeds this bound
+#define MTOLERANCE (flt_t)-1.05  // eval() warns when c drops below this bound
+typedef struct { int a,b,c,d,t;  } int5_t;  // four atom indices and a type
+
+/* ---------------------------------------------------------------------- */
+// build on DihedralFourier and mark this style with the INTEL suffix flag
+DihedralFourierIntel::DihedralFourierIntel(class LAMMPS *lmp)
+  : DihedralFourier(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+}
+
+/* ---------------------------------------------------------------------- */
+// entry point: select the templated compute() for the fix precision mode
+void DihedralFourierIntel::compute(int eflag, int vflag)
+{
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (_use_base) {  // set by init_style() when offload_balance() is nonzero
+    DihedralFourier::compute(eflag, vflag);
+    return;
+  }
+  #endif
+  // template arguments are <flt_t,acc_t>; MIXED pairs float with double
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+}
+
+/* ---------------------------------------------------------------------- */
+// set up tallying, then call eval<EFLAG,VFLAG,NEWTON_BOND> for these flags
+template <class flt_t, class acc_t>
+void DihedralFourierIntel::compute(int eflag, int vflag,
+				   IntelBuffers<flt_t,acc_t> *buffers,
+				   const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag,vflag);
+  } else evflag = 0;
+  // the run-time flags are mapped onto compile-time template arguments
+  if (evflag) {
+    if (vflag && !eflag) {  // virial requested without energy
+      if (force->newton_bond)
+        eval<0,1,1>(vflag, buffers, fc);
+      else
+        eval<0,1,0>(vflag, buffers, fc);
+    } else {  // energy requested: both tally paths are enabled
+      if (force->newton_bond)
+        eval<1,1,1>(vflag, buffers, fc);
+      else
+        eval<1,1,0>(vflag, buffers, fc);
+    }
+  } else {  // forces only
+    if (force->newton_bond)
+      eval<0,0,1>(vflag, buffers, fc);
+    else
+      eval<0,0,0>(vflag, buffers, fc);
+  }
+}
+
+template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+void DihedralFourierIntel::eval(const int vflag,
+				IntelBuffers<flt_t,acc_t> *buffers,
+				const ForceConst<flt_t> &fc)
+// Fourier dihedral forces plus optional energy/virial for the dihedral list
+{
+  const int inum = neighbor->ndihedrallist;
+  if (inum == 0) return;  // nothing to evaluate
+  // positions come from the intel buffers; fc holds the packed coefficients
+  ATOM_T * _noalias const x = buffers->get_x(0);
+  const int nlocal = atom->nlocal;
+  const int nall = nlocal + atom->nghost;
+  // the force stride covers ghost atoms too when NEWTON_BOND is on
+  int f_stride;
+  if (NEWTON_BOND) f_stride = buffers->get_stride(nall);
+  else f_stride = buffers->get_stride(nlocal);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(0, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+
+  acc_t oedihedral, ov0, ov1, ov2, ov3, ov4, ov5;
+  if (EFLAG) oedihedral = (acc_t)0.0;
+  if (VFLAG && vflag) {
+    ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0.0;
+  }
+
+  #if defined(_OPENMP)
+  #pragma omp parallel default(none) \
+    shared(f_start,f_stride,fc)           \
+    reduction(+:oedihedral,ov0,ov1,ov2,ov3,ov4,ov5)
+  #endif
+  {
+    int nfrom, npl, nto, tid;
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    IP_PRE_omp_range_id(nfrom, nto, tid, inum, nthreads);
+    #else
+    IP_PRE_omp_stride_id(nfrom, npl, nto, tid, inum, nthreads);
+    #endif
+    // each thread writes forces into its own f_stride-sized slice
+    FORCE_T * _noalias const f = f_start + (tid * f_stride);
+    if (fix->need_zero(tid))
+      memset(f, 0, f_stride * sizeof(FORCE_T));
+    // read the dihedral list as packed {a,b,c,d,t} integer records
+    const int5_t * _noalias const dihedrallist =
+      (int5_t *) neighbor->dihedrallist[0];
+
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    acc_t sedihedral, sv0, sv1, sv2, sv3, sv4, sv5;
+    if (EFLAG) sedihedral = (acc_t)0.0;
+    if (VFLAG && vflag) {
+      sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0.0;
+    }
+    #pragma simd reduction(+:sedihedral, sv0, sv1, sv2, sv3, sv4, sv5)
+    for (int n = nfrom; n < nto; n ++) {
+    #else
+    for (int n = nfrom; n < nto; n += npl) {
+    #endif
+      const int i1 = dihedrallist[n].a;
+      const int i2 = dihedrallist[n].b;
+      const int i3 = dihedrallist[n].c;
+      const int i4 = dihedrallist[n].d;
+      const int type = dihedrallist[n].t;
+
+      // 1st bond
+
+      const flt_t vb1x = x[i1].x - x[i2].x;
+      const flt_t vb1y = x[i1].y - x[i2].y;
+      const flt_t vb1z = x[i1].z - x[i2].z;
+
+      // 2nd bond
+
+      const flt_t vb2xm = x[i2].x - x[i3].x;
+      const flt_t vb2ym = x[i2].y - x[i3].y;
+      const flt_t vb2zm = x[i2].z - x[i3].z;
+
+      // 3rd bond
+
+      const flt_t vb3x = x[i4].x - x[i3].x;
+      const flt_t vb3y = x[i4].y - x[i3].y;
+      const flt_t vb3z = x[i4].z - x[i3].z;
+
+      // c,s calculation (c is treated as a cosine: checked and clamped below)
+
+      const flt_t ax = vb1y*vb2zm - vb1z*vb2ym;
+      const flt_t ay = vb1z*vb2xm - vb1x*vb2zm;
+      const flt_t az = vb1x*vb2ym - vb1y*vb2xm;
+      const flt_t bx = vb3y*vb2zm - vb3z*vb2ym;
+      const flt_t by = vb3z*vb2xm - vb3x*vb2zm;
+      const flt_t bz = vb3x*vb2ym - vb3y*vb2xm;
+
+      const flt_t rasq = ax*ax + ay*ay + az*az;
+      const flt_t rbsq = bx*bx + by*by + bz*bz;
+      const flt_t rgsq = vb2xm*vb2xm + vb2ym*vb2ym + vb2zm*vb2zm;
+      const flt_t rg = sqrt(rgsq);
+
+      flt_t rginv, ra2inv, rb2inv;
+      rginv = ra2inv = rb2inv = (flt_t)0.0;  // inverses stay 0 for zero lengths
+      if (rg > 0) rginv = (flt_t)1.0/rg;
+      if (rasq > 0) ra2inv = (flt_t)1.0/rasq;
+      if (rbsq > 0) rb2inv = (flt_t)1.0/rbsq;
+      const flt_t rabinv = sqrt(ra2inv*rb2inv);
+
+      flt_t c = (ax*bx + ay*by + az*bz)*rabinv;
+      const flt_t s = rg*rabinv*(ax*vb3x + ay*vb3y + az*vb3z);
+
+      // error check
+      #ifndef LMP_INTEL_USE_SIMDOFF
+      if (c > PTOLERANCE || c < MTOLERANCE) {
+        int me = comm->me;
+
+        if (screen) {
+          char str[128];
+          sprintf(str,"Dihedral problem: %d/%d " BIGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT " "
+                  TAGINT_FORMAT " " TAGINT_FORMAT,
+                  me,tid,update->ntimestep,
+                  atom->tag[i1],atom->tag[i2],atom->tag[i3],atom->tag[i4]);
+          error->warning(FLERR,str,0);
+          fprintf(screen,"  1st atom: %d %g %g %g\n",
+                  me,x[i1].x,x[i1].y,x[i1].z);
+          fprintf(screen,"  2nd atom: %d %g %g %g\n",
+                  me,x[i2].x,x[i2].y,x[i2].z);
+          fprintf(screen,"  3rd atom: %d %g %g %g\n",
+                  me,x[i3].x,x[i3].y,x[i3].z);
+          fprintf(screen,"  4th atom: %d %g %g %g\n",
+                  me,x[i4].x,x[i4].y,x[i4].z);
+        }
+      }
+      #endif
+      // clamp c to [-1,1]
+      if (c > (flt_t)1.0) c = (flt_t)1.0;
+      if (c < (flt_t)-1.0) c = (flt_t)-1.0;
+
+      flt_t deng;
+      flt_t df = (flt_t)0.0;
+      if (EFLAG) deng = (flt_t)0.0;
+      // sum over the Fourier terms stored for this dihedral type
+      for (int j = 0; j < nterms[type]; j++) {
+	const flt_t tcos_shift = fc.bp[j][type].cos_shift;
+	const flt_t tsin_shift = fc.bp[j][type].sin_shift;
+	const flt_t tk = fc.bp[j][type].k;
+	const int m = fc.bp[j][type].multiplicity;
+
+	flt_t p = (flt_t)1.0;
+	flt_t ddf1, df1;
+	ddf1 = df1 = (flt_t)0.0;
+	// angle-addition recurrence: m passes combine (p,df1) with (c,s)
+	for (int i = 0; i < m; i++) {
+	  ddf1 = p*c - df1*s;
+	  df1 = p*s + df1*c;
+	  p = ddf1;
+	}
+	// rotate by the stored phase shift, then scale the derivative term by -m
+	p = p*tcos_shift + df1*tsin_shift;
+	df1 = df1*tcos_shift - ddf1*tsin_shift;
+	df1 *= -m;
+	p += (flt_t)1.0;
+	// m == 0: constant term that contributes nothing to df
+	if (m == 0) {
+	  p = (flt_t)1.0 + tcos_shift;
+	  df1 = (flt_t)0.0;
+	}
+
+        if (EFLAG) deng += tk * p;
+        df -= tk * df1;
+      }
+      // projections and scale factors used to spread df over the four atoms
+      const flt_t fg = vb1x*vb2xm + vb1y*vb2ym + vb1z*vb2zm;
+      const flt_t hg = vb3x*vb2xm + vb3y*vb2ym + vb3z*vb2zm;
+      const flt_t fga = fg*ra2inv*rginv;
+      const flt_t hgb = hg*rb2inv*rginv;
+      const flt_t gaa = -ra2inv*rg;
+      const flt_t gbb = rb2inv*rg;
+
+      const flt_t dtfx = gaa*ax;
+      const flt_t dtfy = gaa*ay;
+      const flt_t dtfz = gaa*az;
+      const flt_t dtgx = fga*ax - hgb*bx;
+      const flt_t dtgy = fga*ay - hgb*by;
+      const flt_t dtgz = fga*az - hgb*bz;
+      const flt_t dthx = gbb*bx;
+      const flt_t dthy = gbb*by;
+      const flt_t dthz = gbb*bz;
+
+      const flt_t sx2 = df*dtgx;
+      const flt_t sy2 = df*dtgy;
+      const flt_t sz2 = df*dtgz;
+
+      flt_t f1x = df*dtfx;
+      flt_t f1y = df*dtfy;
+      flt_t f1z = df*dtfz;
+
+      const flt_t f2x = sx2 - f1x;
+      const flt_t f2y = sy2 - f1y;
+      const flt_t f2z = sz2 - f1z;
+
+      flt_t f4x = df*dthx;
+      flt_t f4y = df*dthy;
+      flt_t f4z = df*dthz;
+
+      const flt_t f3x = -sx2 - f4x;
+      const flt_t f3y = -sy2 - f4y;
+      const flt_t f3z = -sz2 - f4z;
+
+      if (EFLAG || VFLAG) {
+        #ifdef LMP_INTEL_USE_SIMDOFF
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, sedihedral, f, NEWTON_BOND, nlocal,
+                              sv0, sv1, sv2, sv3, sv4, sv5);
+        #else
+        IP_PRE_ev_tally_dihed(EFLAG, VFLAG, eatom, vflag, deng, i1, i2, i3, i4,
+                              f1x, f1y, f1z, f3x, f3y, f3z, f4x, f4y, f4z,
+                              vb1x, vb1y, vb1z, -vb2xm, -vb2ym, -vb2zm, vb3x,
+                              vb3y, vb3z, oedihedral, f, NEWTON_BOND, nlocal,
+                              ov0, ov1, ov2, ov3, ov4, ov5);
+        #endif
+      }
+      // without NEWTON_BOND only atoms with index < nlocal receive forces
+      #ifdef LMP_INTEL_USE_SIMDOFF
+      #pragma simdoff
+      #endif
+      {
+        if (NEWTON_BOND || i1 < nlocal) {
+          f[i1].x += f1x;
+          f[i1].y += f1y;
+          f[i1].z += f1z;
+        }
+
+        if (NEWTON_BOND || i2 < nlocal) {
+          f[i2].x += f2x;
+          f[i2].y += f2y;
+          f[i2].z += f2z;
+        }
+
+        if (NEWTON_BOND || i3 < nlocal) {
+          f[i3].x += f3x;
+          f[i3].y += f3y;
+          f[i3].z += f3z;
+        }
+
+        if (NEWTON_BOND || i4 < nlocal) {
+          f[i4].x += f4x;
+          f[i4].y += f4y;
+          f[i4].z += f4z;
+        }
+      }
+    } // for n
+    #ifdef LMP_INTEL_USE_SIMDOFF
+    if (EFLAG) oedihedral += sedihedral;
+    if (VFLAG && vflag) {
+        ov0 += sv0; ov1 += sv1; ov2 += sv2;
+        ov3 += sv3; ov4 += sv4; ov5 += sv5;
+    }
+    #endif
+  } // omp parallel
+  // add the reduced energy/virial to the style-wide accumulators
+  if (EFLAG) energy += oedihedral;
+  if (VFLAG && vflag) {
+    virial[0] += ov0; virial[1] += ov1; virial[2] += ov2;
+    virial[3] += ov3; virial[4] += ov4; virial[5] += ov5;
+  }
+
+  fix->set_reduce_flag();
+}
+
+/* ---------------------------------------------------------------------- */
+// locate the intel fix and pack coefficients for the active precision mode
+void DihedralFourierIntel::init_style()
+{
+  DihedralFourier::init_style();
+  // the fix with ID "package_intel" must exist, otherwise abort with an error
+  int ifix = modify->find_fix("package_intel");
+  if (ifix < 0)
+    error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  // offload builds: use the base-class compute() if offload_balance() != 0
+  #ifdef _LMP_INTEL_OFFLOAD
+  _use_base = 0;
+  if (fix->offload_balance() != 0.0) {
+    _use_base = 1;
+    return;
+  }
+  #endif
+
+  fix->bond_init_check();
+  // MIXED and single precision share force_const_single (float constants)
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    pack_force_const(force_const_double, fix->get_double_buffers());
+  else
+    pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+// copy the per-type Fourier coefficients into fc using precision flt_t
+template <class flt_t, class acc_t>
+void DihedralFourierIntel::pack_force_const(ForceConst<flt_t> &fc,
+					    IntelBuffers<flt_t,acc_t> *buffers)
+{
+  const int bp1 = atom->ndihedraltypes + 1;  // types are indexed from 1
+  fc.set_ntypes(bp1, setflag, nterms, memory);
+  // note the transposed layout: source is [type][term], bp is [term][type]
+  for (int i = 1; i < bp1; i++) {
+    if (setflag[i]) {
+      for (int j = 0; j < nterms[i]; j++) {
+        fc.bp[j][i].cos_shift = cos_shift[i][j];
+	fc.bp[j][i].sin_shift = sin_shift[i][j];
+	fc.bp[j][i].k = k[i][j];
+	fc.bp[j][i].multiplicity = multiplicity[i][j];
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+// size bp as [max terms][nbondtypes]; 0 frees bp and memory may then be NULL
+template <class flt_t>
+void DihedralFourierIntel::ForceConst<flt_t>::set_ntypes(const int nbondtypes,
+                                                         int *setflag,
+                                                         int *nterms,
+                                                         Memory *memory) {
+  if (memory != NULL) _memory = memory;  // must be set before any create()
+  int maxnterms = 1;
+  for (int i = 1; i < nbondtypes; i++)  // valid types are 1..nbondtypes-1
+    if (setflag[i]) maxnterms = MAX(maxnterms, nterms[i]);
+  // also reallocate when a type gained terms, else packing would overrun bp
+  if (nbondtypes != _nbondtypes ||
+      (nbondtypes > 0 && maxnterms > _maxnterms)) {
+    if (_nbondtypes > 0) _memory->destroy(bp);
+    if (nbondtypes > 0) {
+      _maxnterms = maxnterms;
+      _memory->create(bp, _maxnterms, nbondtypes, "dihedralfourierintel.bp");
+    }
+  }
+  _nbondtypes = nbondtypes;
+}
diff --git a/src/USER-INTEL/dihedral_fourier_intel.h b/src/USER-INTEL/dihedral_fourier_intel.h
new file mode 100644
index 0000000000..a775e129f4
--- /dev/null
+++ b/src/USER-INTEL/dihedral_fourier_intel.h
@@ -0,0 +1,82 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+------------------------------------------------------------------------- */
+
+#ifdef DIHEDRAL_CLASS
+
+DihedralStyle(fourier/intel,DihedralFourierIntel)
+
+#else
+
+#ifndef LMP_DIHEDRAL_FOURIER_INTEL_H
+#define LMP_DIHEDRAL_FOURIER_INTEL_H
+
+#include "dihedral_fourier.h"
+#include "fix_intel.h"
+
+namespace LAMMPS_NS {
+
+class DihedralFourierIntel : public DihedralFourier {
+  // USER-INTEL variant of DihedralFourier with selectable precision
+ public:
+  DihedralFourierIntel(class LAMMPS *lmp);
+  virtual void compute(int, int);
+  void init_style();
+
+ private:
+  FixIntel *fix;   // the "package_intel" fix, looked up in init_style()
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int EFLAG, int VFLAG, int NEWTON_BOND, class flt_t, class acc_t>
+  void eval(const int vflag, IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc);
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  int _use_base;   // nonzero: compute() defers to DihedralFourier::compute()
+  #endif
+  // per-type, per-term coefficients stored in precision flt_t
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t cos_shift, sin_shift, k;
+      int multiplicity; } fc_packed1;
+
+    fc_packed1 **bp;   // indexed bp[term][type]
+    // start empty with every member initialized
+    ForceConst() : bp(NULL), _nbondtypes(0), _maxnterms(0), _memory(NULL) {}
+    ~ForceConst() { set_ntypes(0, NULL, NULL, NULL); }
+
+    void set_ntypes(const int nbondtypes, int *setflag, int *nterms,
+                    Memory *memory);
+
+   private:
+    int _nbondtypes, _maxnterms;   // current dimensions of bp
+    Memory *_memory;               // allocator used for bp
+  };
+  ForceConst<float> force_const_single;
+  ForceConst<double> force_const_double;
+};
+
+}
+
+#endif
+#endif
diff --git a/src/USER-INTEL/fix_intel.cpp b/src/USER-INTEL/fix_intel.cpp
index 637fc0d06e..eac48b8510 100644
--- a/src/USER-INTEL/fix_intel.cpp
+++ b/src/USER-INTEL/fix_intel.cpp
@@ -285,6 +285,7 @@ int FixIntel::setmask()
 {
   int mask = 0;
   mask |= PRE_REVERSE;
+  mask |= MIN_PRE_REVERSE;
   #ifdef _LMP_INTEL_OFFLOAD
   mask |= POST_FORCE;
   mask |= MIN_POST_FORCE;
diff --git a/src/USER-INTEL/fix_intel.h b/src/USER-INTEL/fix_intel.h
index 068e5ed890..d7093e79bb 100644
--- a/src/USER-INTEL/fix_intel.h
+++ b/src/USER-INTEL/fix_intel.h
@@ -43,6 +43,7 @@ class FixIntel : public Fix {
   virtual int setmask();
   virtual void init();
   virtual void setup(int);
+  inline void min_setup(int in) { setup(in); }  // forwards to setup()
   void setup_pre_reverse(int eflag = 0, int vflag = 0);
 
   void pair_init_check(const bool cdmessage=false);
@@ -50,6 +51,8 @@ class FixIntel : public Fix {
   void kspace_init_check();
 
   void pre_reverse(int eflag = 0, int vflag = 0);
+  inline void min_pre_reverse(int eflag = 0, int vflag = 0)
+    { pre_reverse(eflag, vflag); }  // forwards to pre_reverse()
 
   // Get all forces, calculation results from coprocesser
   void sync_coprocessor();
diff --git a/src/USER-INTEL/intel_buffers.cpp b/src/USER-INTEL/intel_buffers.cpp
index b4b664cb94..ac208f5a0c 100644
--- a/src/USER-INTEL/intel_buffers.cpp
+++ b/src/USER-INTEL/intel_buffers.cpp
@@ -409,6 +409,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
   IP_PRE_get_stride(_ccache_stride3, nsize * 3, sizeof(acc_t), 0);
   lmp->memory->create(_ccachef, _ccache_stride3 * nt, "_ccachef");
   #endif
+  memset(_ccachei, 0, vsize * sizeof(int));
   memset(_ccachej, 0, vsize * sizeof(int));
 
   #ifdef _LMP_INTEL_OFFLOAD
@@ -425,7 +426,7 @@ void IntelBuffers<flt_t, acc_t>::grow_ccache(const int off_flag,
       #pragma offload_transfer target(mic:_cop) \
         nocopy(ccachex,ccachey:length(vsize) alloc_if(1) free_if(0)) \
         nocopy(ccachez,ccachew:length(vsize) alloc_if(1) free_if(0)) \
-        nocopy(ccachei:length(vsize) alloc_if(1) free_if(0)) \
+        in(ccachei:length(vsize) alloc_if(1) free_if(0)) \
         in(ccachej:length(vsize) alloc_if(1) free_if(0))
     }
     #ifdef LMP_USE_AVXCD
diff --git a/src/USER-INTEL/intel_preprocess.h b/src/USER-INTEL/intel_preprocess.h
index a7663d54a6..d49d0d8b00 100644
--- a/src/USER-INTEL/intel_preprocess.h
+++ b/src/USER-INTEL/intel_preprocess.h
@@ -292,6 +292,15 @@ enum {TIME_PACK, TIME_HOST_NEIGHBOR, TIME_HOST_PAIR, TIME_OFFLOAD_NEIGHBOR,
     ito = inum;                                                 \
   }
 
+#define IP_PRE_omp_stride_id_vec(ifrom, ip, ito, tid, inum,     \
+                                 nthr, vecsize)                 \
+  { /* single range: tid 0 covers [0,inum) with stride 1 */     \
+    tid = 0;							\
+    ifrom = 0;							\
+    ip = 1;							\
+    ito = inum;	/* nthr and vecsize are not used here */	\
+  }
+
 #endif
 
 #define IP_PRE_fdotr_acc_force_l5(lf, lt, minlocal, nthreads, f_start,  \
diff --git a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
index 12101712f1..e6d45d7b2c 100644
--- a/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
+++ b/src/USER-INTEL/npair_full_bin_ghost_intel.cpp
@@ -319,7 +319,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
 	      const int bstart = binhead[ibin + binstart[k]];
 	      const int bend = binhead[ibin + binend[k]];
               #if defined(LMP_SIMD_COMPILER)
-              #pragma vector aligned
               #pragma simd
               #endif
               for (int jj = bstart; jj < bend; jj++)
@@ -341,7 +340,6 @@ void NPairFullBinGhostIntel::fbi(const int offload, NeighList * list,
 	      const int bstart = binhead[ibin + stencil[k]];
 	      const int bend = binhead[ibin + stencil[k] + 1];
               #if defined(LMP_SIMD_COMPILER)
-              #pragma vector aligned
               #pragma simd
               #endif
               for (int jj = bstart; jj < bend; jj++)
diff --git a/src/USER-INTEL/npair_intel.cpp b/src/USER-INTEL/npair_intel.cpp
index 79dc75366e..0068e02635 100644
--- a/src/USER-INTEL/npair_intel.cpp
+++ b/src/USER-INTEL/npair_intel.cpp
@@ -273,7 +273,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             const int bstart = binhead[ibin + binstart[k]];
             const int bend = binhead[ibin + binend[k]];
             #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = bstart; jj < bend; jj++)
@@ -307,7 +306,6 @@ void NPairIntel::bin_newton(const int offload, NeighList *list,
             const int bstart = binhead[ibin];
             const int bend = binhead[ibin + 1];
             #if defined(LMP_SIMD_COMPILER)
-            #pragma vector aligned
             #pragma simd
             #endif
             for (int jj = bstart; jj < bend; jj++) {
diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp
new file mode 100644
index 0000000000..0b5760a7b0
--- /dev/null
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@@ -0,0 +1,617 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   This software is distributed under the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+                        Shun Xu (Computer Network Information Center, CAS)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "pair_dpd_intel.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "suffix.h"
+using namespace LAMMPS_NS;
+
+#define LMP_MKL_RNG VSL_BRNG_MT19937  // MKL generator id; use is outside this view
+#define FC_PACKED1_T typename ForceConst<flt_t>::fc_packed1  // packed coeffs
+#define IEPSILON 1.0e10  // NOTE(review): used outside this view - confirm meaning
+
+/* ---------------------------------------------------------------------- */
+// build on PairDPD, set the INTEL suffix flag, start with no thread RNGs
+PairDPDIntel::PairDPDIntel(LAMMPS *lmp) :
+  PairDPD(lmp)
+{
+  suffix_flag |= Suffix::INTEL;
+  respa_enable = 0;
+  random_thread = NULL;
+  _nrandom_thread = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+// release the per-thread random number generators and the array holding them
+PairDPDIntel::~PairDPDIntel()
+{
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_NO_MKL_RNG
+    // NOTE(review): slot 0 is skipped - presumably owned elsewhere; confirm
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #else
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #endif
+  }
+  #endif
+  delete []random_thread;
+}
+
+/* ---------------------------------------------------------------------- */
+// entry point: select the templated compute() for the fix precision mode
+void PairDPDIntel::compute(int eflag, int vflag)
+{
+  if (fix->precision() == FixIntel::PREC_MODE_MIXED)
+    compute<float,double>(eflag, vflag, fix->get_mixed_buffers(),
+                          force_const_single);
+  else if (fix->precision() == FixIntel::PREC_MODE_DOUBLE)
+    compute<double,double>(eflag, vflag, fix->get_double_buffers(),
+                           force_const_double);
+  else
+    compute<float,float>(eflag, vflag, fix->get_single_buffers(),
+                         force_const_single);
+
+  fix->balance_stamp();
+  vflag_fdotr = 0;  // always cleared once the templated compute() returns
+}
+
+template <class flt_t, class acc_t>
+void PairDPDIntel::compute(int eflag, int vflag,
+                           IntelBuffers<flt_t,acc_t> *buffers,
+                           const ForceConst<flt_t> &fc)
+{
+  if (eflag || vflag) {
+    ev_setup(eflag, vflag);
+  } else evflag = vflag_fdotr = 0;
+  // eval() gets [0,offload_end) for offload, [host_start,inum) for the host
+  const int inum = list->inum;
+  const int nthreads = comm->nthreads;
+  const int host_start = fix->host_start_pair();
+  const int offload_end = fix->offload_end_pair();
+  const int ago = neighbor->ago;
+  // repack atom data unless ago == 0 or the fix uses separate buffers
+  if (ago != 0 && fix->separate_buffers() == 0) {
+    fix->start_watch(TIME_PACK);
+    // pack with all threads only when there are more than INTEL_HTHREADS
+    int packthreads;
+    if (nthreads > INTEL_HTHREADS) packthreads = nthreads;
+    else packthreads = 1;
+    #if defined(_OPENMP)
+    #pragma omp parallel if(packthreads > 1)
+    #endif
+    {
+      int ifrom, ito, tid;
+      IP_PRE_omp_range_id_align(ifrom, ito, tid, atom->nlocal + atom->nghost,
+                                packthreads, sizeof(ATOM_T));
+      buffers->thr_pack(ifrom,ito,ago);
+    }
+    fix->stop_watch(TIME_PACK);
+  }
+  // ovflag for eval(): 2 if vflag_fdotr is set, 1 if only vflag, else 0
+  int ovflag = 0;
+  if (vflag_fdotr) ovflag = 2;
+  else if (vflag) ovflag = 1;
+  if (_onetype) {  // template arguments are <ONETYPE,EFLAG,NEWTON_PAIR>
+    if (eflag) {
+      if (force->newton_pair) {
+        eval<1,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+        eval<1,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<1,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<1,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  } else {
+    if (eflag) {
+      if (force->newton_pair) {
+        eval<0,1,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<0,1,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,1,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    } else {
+      if (force->newton_pair) {
+        eval<0,0,1>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,1>(0, ovflag, buffers, fc, host_start, inum);
+      } else {
+        eval<0,0,0>(1, ovflag, buffers, fc, 0, offload_end);
+        eval<0,0,0>(0, ovflag, buffers, fc, host_start, inum);
+      }
+    }
+  }
+}
+
+template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+void PairDPDIntel::eval(const int offload, const int vflag,
+                        IntelBuffers<flt_t,acc_t> *buffers,
+                        const ForceConst<flt_t> &fc,
+                        const int astart, const int aend)
+{
+  const int inum = aend - astart;
+  if (inum == 0) return;
+  int nlocal, nall, minlocal;
+  fix->get_buffern(offload, nlocal, nall, minlocal);
+
+  const int ago = neighbor->ago;
+  IP_PRE_pack_separate_buffers(fix, buffers, ago, offload, nlocal, nall);
+
+  ATOM_T * _noalias const x = buffers->get_x(offload);
+  typedef struct { double x, y, z; } lmp_vt;
+  lmp_vt *v = (lmp_vt *)atom->v[0];
+  const flt_t dtinvsqrt = 1.0/sqrt(update->dt);
+
+  const int * _noalias const numneigh = list->numneigh;
+  const int * _noalias const cnumneigh = buffers->cnumneigh(list);
+  const int * _noalias const firstneigh = buffers->firstneigh(list);
+  const FC_PACKED1_T * _noalias const param = fc.param[0];
+  const flt_t * _noalias const special_lj = fc.special_lj;
+  int * _noalias const rngi_thread = fc.rngi;
+  const int rng_size = buffers->get_max_nbors();
+
+  const int ntypes = atom->ntypes + 1;
+  const int eatom = this->eflag_atom;
+
+  // Determine how much data to transfer
+  int x_size, q_size, f_stride, ev_size, separate_flag;
+  IP_PRE_get_transfern(ago, NEWTON_PAIR, EFLAG, vflag,
+                       buffers, offload, fix, separate_flag,
+                       x_size, q_size, ev_size, f_stride);
+
+  int tc;
+  FORCE_T * _noalias f_start;
+  acc_t * _noalias ev_global;
+  IP_PRE_get_buffers(offload, buffers, fix, tc, f_start, ev_global);
+  const int nthreads = tc;
+  int *overflow = fix->get_off_overflow_flag();
+  {
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime();
+    #endif
+
+    IP_PRE_repack_for_offload(NEWTON_PAIR, separate_flag, nlocal, nall,
+                              f_stride, x, 0);
+
+    acc_t oevdwl, ov0, ov1, ov2, ov3, ov4, ov5;
+    if (EFLAG) oevdwl = (acc_t)0;
+    if (vflag) ov0 = ov1 = ov2 = ov3 = ov4 = ov5 = (acc_t)0;
+
+    // loop over neighbors of my atoms
+    #if defined(_OPENMP)
+    #pragma omp parallel reduction(+:oevdwl,ov0,ov1,ov2,ov3,ov4,ov5)
+    #endif
+    {
+      int iifrom, iip, iito, tid;
+      IP_PRE_omp_stride_id(iifrom, iip, iito, tid, inum, nthreads);
+      iifrom += astart;
+      iito += astart;
+
+      #ifdef LMP_NO_MKL_RNG
+      RanMars *my_random = random_thread[tid];
+      #else
+      VSLStreamStatePtr *my_random = &(random_thread[tid]);
+      #endif
+      flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
+      int rngi = rngi_thread[tid];
+
+      int foff;
+      if (NEWTON_PAIR) foff = tid * f_stride - minlocal;
+      else foff = -minlocal;
+      FORCE_T * _noalias const f = f_start + foff;
+      if (NEWTON_PAIR) memset(f + minlocal, 0, f_stride * sizeof(FORCE_T));
+
+      flt_t icut, a0, gamma, sigma;
+      if (ONETYPE) {
+        icut = param[3].icut;
+        a0 = param[3].a0;
+        gamma = param[3].gamma;
+        sigma = param[3].sigma;
+      }
+      for (int i = iifrom; i < iito; i += iip) {
+        int itype, ptr_off;
+        const FC_PACKED1_T * _noalias parami;
+        if (!ONETYPE) {
+          itype = x[i].w;
+          ptr_off = itype * ntypes;
+          parami = param + ptr_off;
+        }
+
+        const int * _noalias const jlist = firstneigh + cnumneigh[i];
+        const int jnum = numneigh[i];
+
+        acc_t fxtmp, fytmp, fztmp, fwtmp;
+        acc_t sevdwl, sv0, sv1, sv2, sv3, sv4, sv5;
+
+        const flt_t xtmp = x[i].x;
+        const flt_t ytmp = x[i].y;
+        const flt_t ztmp = x[i].z;
+	const flt_t vxtmp = v[i].x;
+	const flt_t vytmp = v[i].y;
+	const flt_t vztmp = v[i].z;
+        fxtmp = fytmp = fztmp = (acc_t)0;
+        if (EFLAG) fwtmp = sevdwl = (acc_t)0;
+        if (NEWTON_PAIR == 0)
+          if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
+
+	if (rngi + jnum > rng_size) {
+          #ifdef LMP_NO_MKL_RNG
+          for (int jj = 0; jj < rngi; jj++)
+            my_rand_buffer[jj] = my_random->gaussian();
+          #else
+	  if (sizeof(flt_t) == sizeof(float))
+	    vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
+			  (float*)my_rand_buffer, (float)0.0, (float)1.0 );
+	  else
+	    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
+	  		  (double*)my_rand_buffer, 0.0, 1.0 );
+          #endif
+	  rngi = 0;
+	}
+
+        #if defined(LMP_SIMD_COMPILER)
+	#pragma vector aligned
+	#pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \
+	                         sv0, sv1, sv2, sv3, sv4, sv5)
+        #endif
+        for (int jj = 0; jj < jnum; jj++) {
+          flt_t forcelj, evdwl;
+          forcelj = evdwl = (flt_t)0.0;
+
+          int j, jtype, sbindex;
+          if (!ONETYPE) {
+            sbindex = jlist[jj] >> SBBITS & 3;
+            j = jlist[jj] & NEIGHMASK;
+          } else
+            j = jlist[jj];
+
+          const flt_t delx = xtmp - x[j].x;
+          const flt_t dely = ytmp - x[j].y;
+          const flt_t delz = ztmp - x[j].z;
+          if (!ONETYPE) {
+            jtype = x[j].w;
+            icut = parami[jtype].icut;
+          }
+          const flt_t rsq = delx * delx + dely * dely + delz * delz;
+	  const flt_t rinv = (flt_t)1.0/sqrt(rsq);
+
+          if (rinv > icut) {
+            flt_t factor_dpd;
+            if (!ONETYPE) factor_dpd = special_lj[sbindex];
+
+	    flt_t delvx = vxtmp - v[j].x;
+	    flt_t delvy = vytmp - v[j].y;
+	    flt_t delvz = vztmp - v[j].z;
+	    flt_t dot = delx*delvx + dely*delvy + delz*delvz;
+	    flt_t randnum = my_rand_buffer[jj];
+
+	    flt_t iwd = rinv - icut;
+	    if (rinv > (flt_t)IEPSILON) iwd = (flt_t)0.0;
+
+	    if (!ONETYPE) {
+	      a0 = parami[jtype].a0;
+	      gamma = parami[jtype].gamma;
+	      sigma = parami[jtype].sigma;
+	    }
+	    flt_t fpair = a0 - iwd * gamma * dot + sigma * randnum * dtinvsqrt;
+	    if (!ONETYPE) fpair *= factor_dpd;
+	    fpair *= iwd;
+
+            const flt_t fpx = fpair * delx;
+            fxtmp += fpx;
+            if (NEWTON_PAIR) f[j].x -= fpx;
+            const flt_t fpy = fpair * dely;
+            fytmp += fpy;
+            if (NEWTON_PAIR) f[j].y -= fpy;
+            const flt_t fpz = fpair * delz;
+            fztmp += fpz;
+            if (NEWTON_PAIR) f[j].z -= fpz;
+
+            if (EFLAG) {
+	      flt_t cut = (flt_t)1.0/icut;
+	      flt_t r = (flt_t)1.0/rinv;
+	      evdwl = (flt_t)0.5 * a0 * (cut - (flt_t)2.0*r + rsq * icut);
+	      if (!ONETYPE) evdwl *= factor_dpd;
+              sevdwl += evdwl;
+              if (eatom) {
+                fwtmp += (flt_t)0.5 * evdwl;
+                if (NEWTON_PAIR)
+                  f[j].w += (flt_t)0.5 * evdwl;
+              }
+            }
+
+            if (NEWTON_PAIR == 0)
+              IP_PRE_ev_tally_nborv(vflag, delx, dely, delz, fpx, fpy, fpz);
+          } // if rsq
+        } // for jj
+        if (NEWTON_PAIR) {
+          f[i].x += fxtmp;
+          f[i].y += fytmp;
+          f[i].z += fztmp;
+        } else {
+          f[i].x = fxtmp;
+          f[i].y = fytmp;
+          f[i].z = fztmp;
+        }
+
+        IP_PRE_ev_tally_atom(NEWTON_PAIR, EFLAG, vflag, f, fwtmp);
+	rngi += jnum;
+      } // for ii
+
+      IP_PRE_fdotr_reduce_omp(NEWTON_PAIR, nall, minlocal, nthreads, f_start,
+                              f_stride, x, offload, vflag, ov0, ov1, ov2, ov3,
+                              ov4, ov5);
+      rngi_thread[tid] = rngi;
+    } // end omp
+
+    IP_PRE_fdotr_reduce(NEWTON_PAIR, nall, nthreads, f_stride, vflag,
+                        ov0, ov1, ov2, ov3, ov4, ov5);
+
+    if (EFLAG) {
+      if (NEWTON_PAIR == 0) oevdwl *= (acc_t)0.5;
+      ev_global[0] = oevdwl;
+      ev_global[1] = (acc_t)0.0;
+    }
+    if (vflag) {
+      if (NEWTON_PAIR == 0) {
+        ov0 *= (acc_t)0.5;
+        ov1 *= (acc_t)0.5;
+        ov2 *= (acc_t)0.5;
+        ov3 *= (acc_t)0.5;
+        ov4 *= (acc_t)0.5;
+        ov5 *= (acc_t)0.5;
+      }
+      ev_global[2] = ov0;
+      ev_global[3] = ov1;
+      ev_global[4] = ov2;
+      ev_global[5] = ov3;
+      ev_global[6] = ov4;
+      ev_global[7] = ov5;
+    }
+    #if defined(__MIC__) && defined(_LMP_INTEL_OFFLOAD)
+    *timer_compute = MIC_Wtime() - *timer_compute;
+    #endif
+  } // end offload
+
+  if (offload)
+    fix->stop_watch(TIME_OFFLOAD_LATENCY);
+  else
+    fix->stop_watch(TIME_HOST_PAIR);
+
+  if (EFLAG || vflag)
+    fix->add_result_array(f_start, ev_global, offload, eatom, 0, vflag);
+  else
+    fix->add_result_array(f_start, 0, offload);
+}
+
+/* ----------------------------------------------------------------------
+   global settings: drop the old per-thread RNGs, call PairDPD::settings(), then create one RNG per comm->nthreads
+   NOTE(review): delete[] below is guarded by _OPENMP but new[] is not -- check non-OpenMP builds */
+
+void PairDPDIntel::settings(int narg, char **arg) {
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_NO_MKL_RNG
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #else
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #endif
+  }
+  delete []random_thread;
+  #endif
+  PairDPD::settings(narg,arg);
+  _nrandom_thread = comm->nthreads;
+
+  #ifdef LMP_NO_MKL_RNG
+
+  random_thread =new RanMars*[comm->nthreads];
+  random_thread[0] = random;
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    if (tid > 0)
+      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
+  }
+  #endif
+
+  #else
+
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
+		 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+
+  #endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairDPDIntel::init_style()
+{
+  // the request adjusted below is the last entry of neighbor->requests
+  PairDPD::init_style();
+  const int ireq = neighbor->nrequest - 1;
+  if (!force->newton_pair) {   // newton off: ask for a full list, not half
+    neighbor->requests[ireq]->half = 0;
+    neighbor->requests[ireq]->full = 1;
+  }
+  neighbor->requests[ireq]->intel = 1;
+
+  // look up the fix registered under the id "package_intel"
+  const int ifix = modify->find_fix("package_intel");
+  if (ifix < 0) error->all(FLERR,
+               "The 'package intel' command is required for /intel styles");
+  fix = static_cast<FixIntel *>(modify->fix[ifix]);
+  fix->pair_init_check();
+
+  #ifdef _LMP_INTEL_OFFLOAD
+  if (fix->offload_balance() != 0.0) error->all(FLERR,
+          "Offload for dpd/intel is not yet available. Set balance to 0.");
+  #endif
+
+  // pack the coefficients into the buffers matching the selected precision
+  const int prec = fix->precision();
+  if (prec == FixIntel::PREC_MODE_DOUBLE) pack_force_const(force_const_double, fix->get_double_buffers());
+  else if (prec == FixIntel::PREC_MODE_MIXED) pack_force_const(force_const_single, fix->get_mixed_buffers());
+  else pack_force_const(force_const_single, fix->get_single_buffers());
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <class flt_t, class acc_t>
+void PairDPDIntel::pack_force_const(ForceConst<flt_t> &fc,
+                                    IntelBuffers<flt_t,acc_t> *buffers)
+{
+  // Copy the pair coefficients into the packed, precision-specific
+  // ForceConst tables and refresh the neighbor cutoffs held by the buffers.
+  const int ntypes = atom->ntypes;
+  const int tp1 = ntypes + 1;
+
+  // the single-type flag is only raised for non-molecular systems
+  _onetype = (ntypes == 1 && !atom->molecular) ? 1 : 0;
+
+  fc.set_ntypes(tp1,comm->nthreads,buffers->get_max_nbors(),memory,_cop);
+  buffers->set_ntypes(tp1);
+  flt_t **cutneighsq = buffers->get_cutneighsq();
+
+  // Repeat cutsq calculation because done after call to init_style
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) {
+      // decide before init_one() whether the cutoff tables get refreshed
+      const bool refresh = setflag[i][j] != 0 ||
+                           (setflag[i][i] != 0 && setflag[j][j] != 0);
+      const double cut = init_one(i,j);
+      if (refresh) {
+        const double cutneigh = cut + neighbor->skin;
+        cutsq[i][j] = cutsq[j][i] = cut*cut;
+        cutneighsq[i][j] = cutneighsq[j][i] = cutneigh * cutneigh;
+      }
+      // the inverse cutoff is stored for every (i,j), mirrored to (j,i)
+      fc.param[i][j].icut = fc.param[j][i].icut = 1.0 / cut;
+    }
+  }
+
+  // entry 0 is forced to 1.0; entries 1-3 mirror force->special_lj
+  fc.special_lj[0] = 1.0;
+  for (int k = 1; k < 4; k++) fc.special_lj[k] = force->special_lj[k];
+
+  for (int i = 0; i < tp1; i++)
+    for (int j = 0; j < tp1; j++) {
+      fc.param[i][j].a0 = a0[i][j];
+      fc.param[i][j].gamma = gamma[i][j];
+      fc.param[i][j].sigma = sigma[i][j];
+    }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Allocate/free the packed parameter table and per-thread RNG buffers; a no-op
+// unless ntypes changed. ntypes == 0 only frees (the destructor relies on it).
+template <class flt_t>
+void PairDPDIntel::ForceConst<flt_t>::set_ntypes(const int ntypes,
+    const int nthreads, const int max_nbors, Memory *memory, const int cop) {
+  if (ntypes != _ntypes) {
+    if (_ntypes > 0) {   // release through the Memory object saved earlier
+      _memory->destroy(param);
+      _memory->destroy(rand_buffer_thread);
+      _memory->destroy(rngi);
+    }
+    if (ntypes > 0) {
+      _cop = cop;
+      memory->create(param,ntypes,ntypes,"fc.param");
+      memory->create(rand_buffer_thread,nthreads,max_nbors,
+                     "fc.rand_buffer_thread");
+      memory->create(rngi,nthreads,"fc.rngi");   // was mislabelled "fc.param"
+      // start each thread at max_nbors so its first non-empty use refills
+      for (int i = 0; i < nthreads; i++) rngi[i] = max_nbors;
+    }
+  }
+  _ntypes = ntypes;
+  _memory = memory;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts (delegated to PairDPD::read_restart_settings)
+   the per-thread RNG array is freed before and rebuilt after that call, mirroring settings() */
+
+void PairDPDIntel::read_restart_settings(FILE *fp)
+{
+  #if defined(_OPENMP)
+  if (_nrandom_thread) {
+    #ifdef LMP_NO_MKL_RNG
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
+    #else
+    for (int i = 0; i < _nrandom_thread; i++)
+      vslDeleteStream(&random_thread[i]);
+    #endif
+  }
+  delete []random_thread;
+  #endif
+  PairDPD::read_restart_settings(fp);
+  _nrandom_thread = comm->nthreads;
+
+  #ifdef LMP_NO_MKL_RNG
+
+  random_thread =new RanMars*[comm->nthreads];
+  random_thread[0] = random;
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    if (tid > 0)
+      random_thread[tid] = new RanMars(lmp, seed+comm->me+comm->nprocs*tid);
+  }
+  #endif
+
+  #else
+
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
+		 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+
+  #endif
+}
diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h
new file mode 100644
index 0000000000..9181ff38f4
--- /dev/null
+++ b/src/USER-INTEL/pair_dpd_intel.h
@@ -0,0 +1,110 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: W. Michael Brown (Intel)
+                        Shun Xu (Computer Network Information Center, CAS)
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/intel,PairDPDIntel)
+
+#else
+
+#ifndef LMP_PAIR_DPD_INTEL_H
+#define LMP_PAIR_DPD_INTEL_H
+
+#include "pair_dpd.h"
+#include "fix_intel.h"
+
+#ifdef LMP_NO_MKL_RNG
+#include "random_mars.h"
+#else
+#include "mkl_vsl.h"
+#endif
+
+namespace LAMMPS_NS {
+
+class PairDPDIntel : public PairDPD {
+
+ public:
+  PairDPDIntel(class LAMMPS *);
+  ~PairDPDIntel();
+
+  virtual void compute(int, int);
+  void settings(int, char **);
+  void init_style();
+  void read_restart_settings(FILE *);
+ 
+ private:
+  FixIntel *fix;
+  int _cop, _onetype, _nrandom_thread;
+
+  #ifdef LMP_NO_MKL_RNG
+  RanMars **random_thread;
+  #else
+  VSLStreamStatePtr *random_thread;
+  #endif
+
+  template <class flt_t> class ForceConst;
+  template <class flt_t, class acc_t>
+  void compute(int eflag, int vflag, IntelBuffers<flt_t,acc_t> *buffers,
+               const ForceConst<flt_t> &fc);
+  template <int ONETYPE, int EFLAG, int NEWTON_PAIR, class flt_t, class acc_t>
+  void eval(const int offload, const int vflag,
+            IntelBuffers<flt_t,acc_t> * buffers,
+            const ForceConst<flt_t> &fc, const int astart, const int aend);
+
+  template <class flt_t, class acc_t>
+  void pack_force_const(ForceConst<flt_t> &fc,
+                        IntelBuffers<flt_t, acc_t> *buffers);  // fills fc from the pair coefficients
+
+  // ForceConst: packed per-type-pair coefficients plus per-thread random-number scratch for one flt_t
+
+  template <class flt_t>
+  class ForceConst {
+   public:
+    typedef struct { flt_t icut, a0, gamma, sigma; } fc_packed1;  // icut holds 1/cutoff
+
+    _alignvar(flt_t special_lj[4],64);
+    fc_packed1 **param;  // [ntypes][ntypes], allocated in set_ntypes()
+    flt_t **rand_buffer_thread;  // [nthreads][max_nbors] Gaussian random numbers, one row per thread
+    int *rngi;  // [nthreads] random numbers consumed by each thread since its last refill
+
+    ForceConst() : _ntypes(0)  {}
+    ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
+
+    void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, 
+		    Memory *memory, const int cop);
+
+   private:
+    int _ntypes, _cop;
+    Memory *_memory;  // Memory object saved by the last set_ntypes() call
+  };
+  ForceConst<float> force_const_single;  // used for single and mixed precision
+  ForceConst<double> force_const_double;  // used for double precision
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: The 'package intel' command is required for /intel styles
+
+Self-explanatory.
+
+*/

From 529eeb603923964e3853fbb272187d47042a93f6 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Mon, 2 Oct 2017 09:31:39 -0600
Subject: [PATCH 30/53] Reduce GPU data transfer

---
 src/KOKKOS/comm_kokkos.cpp     | 8 +++++---
 src/KOKKOS/neighbor_kokkos.cpp | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index ba44ea813f..a8b591e210 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -523,7 +523,7 @@ void CommKokkos::exchange_device()
           k_exchange_copylist.h_view(i) = sendpos;
           sendpos--;
         } else
-        k_exchange_copylist.h_view(i) = -1;
+          k_exchange_copylist.h_view(i) = -1;
       }
 
       k_exchange_copylist.modify<LMPHostType>();
@@ -916,8 +916,10 @@ void CommKokkos::borders_device() {
 
   if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
   atomKK->modified(exec_space,ALL_MASK);
-  atomKK->sync(Host,TAG_MASK);
-  if (map_style) atom->map_set();
+  if (map_style) {
+    atomKK->sync(Host,TAG_MASK);
+    atom->map_set();
+  }
 }
 /* ----------------------------------------------------------------------
    realloc the size of the send buffer as needed with BUFFACTOR and bufextra
diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp
index 9a40808052..f34b149864 100644
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@@ -310,9 +310,9 @@ void NeighborKokkos::build_kokkos(int topoflag)
   // build pairwise lists for all perpetual NPair/NeighList
   // grow() with nlocal/nall args so that only realloc if have to
 
-  atomKK->sync(Host,ALL_MASK);
   for (i = 0; i < npair_perpetual; i++) {
     m = plist[i];
+    if (!lists[m]->kokkos) atomKK->sync(Host,ALL_MASK);
     if (!lists[m]->copy) lists[m]->grow(nlocal,nall);
     neigh_pair[m]->build_setup();
     neigh_pair[m]->build(lists[m]);

From 8d384b9149c71d576fcea8f1b3f7cef54d3ec2ec Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 2 Oct 2017 15:03:48 -0400
Subject: [PATCH 31/53] whitespace cleanup

---
 src/dump.cpp   |  6 +++---
 src/modify.cpp | 56 +++++++++++++++++++++++++-------------------------
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/src/dump.cpp b/src/dump.cpp
index 44098298ba..ddd958c25c 100644
--- a/src/dump.cpp
+++ b/src/dump.cpp
@@ -238,7 +238,7 @@ void Dump::init()
     int gcmcflag = 0;
     for (int i = 0; i < modify->nfix; i++)
       if ((strcmp(modify->fix[i]->style,"gcmc") == 0))
-	gcmcflag = 1;
+        gcmcflag = 1;
 
     if (sortcol == 0 && atom->tag_consecutive() && !gcmcflag) {
       tagint *tag = atom->tag;
@@ -898,7 +898,7 @@ void Dump::modify_params(int narg, char **arg)
     } else if (strcmp(arg[iarg],"fileper") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       if (!multiproc)
-	error->all(FLERR,"Cannot use dump_modify fileper "
+        error->all(FLERR,"Cannot use dump_modify fileper "
                    "without % in dump file name");
       int nper = force->inumeric(FLERR,arg[iarg+1]);
       if (nper <= 0) error->all(FLERR,"Illegal dump_modify command");
@@ -973,7 +973,7 @@ void Dump::modify_params(int narg, char **arg)
     } else if (strcmp(arg[iarg],"nfile") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal dump_modify command");
       if (!multiproc)
-	error->all(FLERR,"Cannot use dump_modify nfile "
+        error->all(FLERR,"Cannot use dump_modify nfile "
                    "without % in dump file name");
       int nfile = force->inumeric(FLERR,arg[iarg+1]);
       if (nfile <= 0) error->all(FLERR,"Illegal dump_modify command");
diff --git a/src/modify.cpp b/src/modify.cpp
index 4516788aa9..f723eb38fa 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -863,9 +863,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix)
       fix[ifix]->restart(state_restart_global[i]);
       used_restart_global[i] = 1;
       if (comm->me == 0) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"Resetting global fix info from restart file:\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"Resetting global fix info from restart file:\n");
         if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
                             fix[ifix]->style,fix[ifix]->id);
@@ -885,9 +885,9 @@ void Modify::add_fix(int narg, char **arg, int trysuffix)
         fix[ifix]->unpack_restart(j,index_restart_peratom[i]);
       fix[ifix]->restart_reset = 1;
       if (comm->me == 0) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"Resetting peratom fix info from restart file:\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"Resetting peratom fix info from restart file:\n");
         if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
                             fix[ifix]->style,fix[ifix]->id);
@@ -1409,24 +1409,24 @@ void Modify::restart_deallocate(int flag)
     if (flag && comm->me == 0) {
       int i;
       for (i = 0; i < nfix_restart_global; i++)
-	if (used_restart_global[i] == 0) break;
+        if (used_restart_global[i] == 0) break;
       if (i == nfix_restart_global) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"All restart file global fix info "
                   "was re-assigned\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"All restart file global fix info "
                   "was re-assigned\n");
       } else {
-	if (screen) fprintf(screen,"Unused restart file global fix info:\n");
-	if (logfile) fprintf(logfile,"Unused restart file global fix info:\n");
-	for (i = 0; i < nfix_restart_global; i++) {
-	  if (used_restart_global[i]) continue;
-	  if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
-			      style_restart_global[i],id_restart_global[i]);
-	  if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
-			       style_restart_global[i],id_restart_global[i]);
-	}
+        if (screen) fprintf(screen,"Unused restart file global fix info:\n");
+        if (logfile) fprintf(logfile,"Unused restart file global fix info:\n");
+        for (i = 0; i < nfix_restart_global; i++) {
+          if (used_restart_global[i]) continue;
+          if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
+                              style_restart_global[i],id_restart_global[i]);
+          if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
+                               style_restart_global[i],id_restart_global[i]);
+        }
       }
     }
 
@@ -1445,24 +1445,24 @@ void Modify::restart_deallocate(int flag)
     if (flag && comm->me == 0) {
       int i;
       for (i = 0; i < nfix_restart_peratom; i++)
-	if (used_restart_peratom[i] == 0) break;
+        if (used_restart_peratom[i] == 0) break;
       if (i == nfix_restart_peratom) {
-	if (screen) 
+        if (screen)
           fprintf(screen,"All restart file peratom fix info "
                   "was re-assigned\n");
-	if (logfile) 
+        if (logfile)
           fprintf(logfile,"All restart file peratom fix info "
                   "was re-assigned\n");
       } else {
-	if (screen) fprintf(screen,"Unused restart file peratom fix info:\n");
-	if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n");
-	for (i = 0; i < nfix_restart_peratom; i++) {
-	  if (used_restart_peratom[i]) continue;
-	  if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
-			      style_restart_peratom[i],id_restart_peratom[i]);
-	  if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
-			       style_restart_peratom[i],id_restart_peratom[i]);
-	}
+        if (screen) fprintf(screen,"Unused restart file peratom fix info:\n");
+        if (logfile) fprintf(logfile,"Unused restart file peratom fix info:\n");
+        for (i = 0; i < nfix_restart_peratom; i++) {
+          if (used_restart_peratom[i]) continue;
+          if (screen) fprintf(screen,"  fix style: %s, fix ID: %s\n",
+                              style_restart_peratom[i],id_restart_peratom[i]);
+          if (logfile) fprintf(logfile,"  fix style: %s, fix ID: %s\n",
+                               style_restart_peratom[i],id_restart_peratom[i]);
+        }
       }
     }
 

From 2a24cbfe0c2f4158aeac7fa833f59f918dcfe811 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 2 Oct 2017 21:13:51 -0400
Subject: [PATCH 32/53] reverse logic for using MKL pRNG: this way, make serial
 and make mpi will compile LAMMPS with USER-INTEL installed

---
 src/MAKE/OPTIONS/Makefile.intel_coprocessor   |  2 +-
 src/MAKE/OPTIONS/Makefile.intel_cpu           |  3 +-
 src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi  |  3 +-
 .../OPTIONS/Makefile.intel_knl_coprocessor    |  6 +-
 src/USER-INTEL/pair_dpd_intel.cpp             | 90 +++++++++----------
 src/USER-INTEL/pair_dpd_intel.h               | 16 ++--
 6 files changed, 62 insertions(+), 58 deletions(-)
 mode change 100755 => 100644 src/MAKE/OPTIONS/Makefile.intel_cpu

diff --git a/src/MAKE/OPTIONS/Makefile.intel_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_coprocessor
index a717be93ff..75e4d89170 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_coprocessor
+++ b/src/MAKE/OPTIONS/Makefile.intel_coprocessor
@@ -10,7 +10,7 @@ CC =		mpiicpc
 MIC_OPT =       -qoffload-option,mic,compiler,"-fp-model fast=2 -mGLOB_default_function_attrs=\"gather_scatter_loop_unroll=4\""
 CCFLAGS =	-g -O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
                 -xHost -fno-alias -ansi-alias -restrict -DLMP_INTEL_USELRT \
-                -qoverride-limits $(MIC_OPT)
+                -qoverride-limits $(MIC_OPT) -DLMP_USE_MKL_RNG
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu
old mode 100755
new mode 100644
index b7db064574..2c3cc51249
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu
@@ -9,7 +9,8 @@ SHELL = /bin/sh
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+                -fno-alias -ansi-alias -restrict $(OPTFLAGS) \
+		-DLMP_USE_MKL_RNG
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
index 8a45b781f8..ff2d0cc5c2 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@@ -9,7 +9,8 @@ SHELL = /bin/sh
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
 CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+                -fno-alias -ansi-alias -restrict $(OPTFLAGS) \
+		-DLMP_USE_MKL_RNG -DLMP_INTEL_USELRT
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
index 406e98b36d..769c166105 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
+++ b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
@@ -9,8 +9,10 @@ SHELL = /bin/sh
 CC =		mpiicpc 
 MIC_OPT =       -qoffload-arch=mic-avx512 -fp-model fast=2
 CCFLAGS =	-O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-                -xHost -fno-alias -ansi-alias -restrict \
-                -qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT
+		-xHost -fno-alias -ansi-alias -restrict \
+		-qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT \
+		-DLMP_USE_MKL_RNG
+
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/USER-INTEL/pair_dpd_intel.cpp b/src/USER-INTEL/pair_dpd_intel.cpp
index 0b5760a7b0..c7cddfccc1 100644
--- a/src/USER-INTEL/pair_dpd_intel.cpp
+++ b/src/USER-INTEL/pair_dpd_intel.cpp
@@ -47,12 +47,12 @@ PairDPDIntel::~PairDPDIntel()
 {
   #if defined(_OPENMP)
   if (_nrandom_thread) {
-    #ifdef LMP_NO_MKL_RNG
-    for (int i = 1; i < _nrandom_thread; i++)
-      delete random_thread[i];
-    #else
+    #ifdef LMP_USE_MKL_RNG
     for (int i = 0; i < _nrandom_thread; i++)
       vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
     #endif
   }
   #endif
@@ -216,10 +216,10 @@ void PairDPDIntel::eval(const int offload, const int vflag,
       iifrom += astart;
       iito += astart;
 
-      #ifdef LMP_NO_MKL_RNG
-      RanMars *my_random = random_thread[tid];
-      #else
+      #ifdef LMP_USE_MKL_RNG
       VSLStreamStatePtr *my_random = &(random_thread[tid]);
+      #else
+      RanMars *my_random = random_thread[tid];
       #endif
       flt_t *my_rand_buffer = fc.rand_buffer_thread[tid];
       int rngi = rngi_thread[tid];
@@ -264,16 +264,16 @@ void PairDPDIntel::eval(const int offload, const int vflag,
           if (vflag==1) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0;
 
 	if (rngi + jnum > rng_size) {
-          #ifdef LMP_NO_MKL_RNG
-          for (int jj = 0; jj < rngi; jj++)
-            my_rand_buffer[jj] = my_random->gaussian();
-          #else
+          #ifdef LMP_USE_MKL_RNG
 	  if (sizeof(flt_t) == sizeof(float))
 	    vsRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
 			  (float*)my_rand_buffer, (float)0.0, (float)1.0 );
 	  else
 	    vdRngGaussian(VSL_RNG_METHOD_GAUSSIAN_ICDF, *my_random, rngi, 
 	  		  (double*)my_rand_buffer, 0.0, 1.0 );
+          #else
+          for (int jj = 0; jj < rngi; jj++)
+            my_rand_buffer[jj] = my_random->gaussian();
           #endif
 	  rngi = 0;
 	}
@@ -420,12 +420,12 @@ void PairDPDIntel::eval(const int offload, const int vflag,
 void PairDPDIntel::settings(int narg, char **arg) {
   #if defined(_OPENMP)
   if (_nrandom_thread) {
-    #ifdef LMP_NO_MKL_RNG
-    for (int i = 1; i < _nrandom_thread; i++)
-      delete random_thread[i];
-    #else
+    #ifdef LMP_USE_MKL_RNG
     for (int i = 0; i < _nrandom_thread; i++)
       vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
     #endif
   }
   delete []random_thread;
@@ -433,7 +433,19 @@ void PairDPDIntel::settings(int narg, char **arg) {
   PairDPD::settings(narg,arg);
   _nrandom_thread = comm->nthreads;
 
-  #ifdef LMP_NO_MKL_RNG
+  #ifdef LMP_USE_MKL_RNG
+
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
+		 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+
+  #else
 
   random_thread =new RanMars*[comm->nthreads];
   random_thread[0] = random;
@@ -446,18 +458,6 @@ void PairDPDIntel::settings(int narg, char **arg) {
   }
   #endif
 
-  #else
-
-  random_thread=new VSLStreamStatePtr[comm->nthreads];
-  #if defined(_OPENMP)
-  #pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
-		 seed + comm->me + comm->nprocs * tid );
-  }
-  #endif
-
   #endif
 }
 
@@ -575,12 +575,12 @@ void PairDPDIntel::read_restart_settings(FILE *fp)
 {
   #if defined(_OPENMP)
   if (_nrandom_thread) {
-    #ifdef LMP_NO_MKL_RNG
-    for (int i = 1; i < _nrandom_thread; i++)
-      delete random_thread[i];
-    #else
+    #ifdef LMP_USE_MKL_RNG
     for (int i = 0; i < _nrandom_thread; i++)
       vslDeleteStream(&random_thread[i]);
+    #else
+    for (int i = 1; i < _nrandom_thread; i++)
+      delete random_thread[i];
     #endif
   }
   delete []random_thread;
@@ -588,7 +588,19 @@ void PairDPDIntel::read_restart_settings(FILE *fp)
   PairDPD::read_restart_settings(fp);
   _nrandom_thread = comm->nthreads;
 
-  #ifdef LMP_NO_MKL_RNG
+  #ifdef LMP_USE_MKL_RNG
+
+  random_thread=new VSLStreamStatePtr[comm->nthreads];
+  #if defined(_OPENMP)
+  #pragma omp parallel
+  {
+    int tid = omp_get_thread_num();
+    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
+		 seed + comm->me + comm->nprocs * tid );
+  }
+  #endif
+
+  #else
 
   random_thread =new RanMars*[comm->nthreads];
   random_thread[0] = random;
@@ -601,17 +613,5 @@ void PairDPDIntel::read_restart_settings(FILE *fp)
   }
   #endif
 
-  #else
-
-  random_thread=new VSLStreamStatePtr[comm->nthreads];
-  #if defined(_OPENMP)
-  #pragma omp parallel
-  {
-    int tid = omp_get_thread_num();
-    vslNewStream(&random_thread[tid], LMP_MKL_RNG, 
-		 seed + comm->me + comm->nprocs * tid );
-  }
-  #endif
-
   #endif
 }
diff --git a/src/USER-INTEL/pair_dpd_intel.h b/src/USER-INTEL/pair_dpd_intel.h
index 9181ff38f4..416d873c00 100644
--- a/src/USER-INTEL/pair_dpd_intel.h
+++ b/src/USER-INTEL/pair_dpd_intel.h
@@ -28,10 +28,10 @@ PairStyle(dpd/intel,PairDPDIntel)
 #include "pair_dpd.h"
 #include "fix_intel.h"
 
-#ifdef LMP_NO_MKL_RNG
-#include "random_mars.h"
-#else
+#ifdef LMP_USE_MKL_RNG
 #include "mkl_vsl.h"
+#else
+#include "random_mars.h"
 #endif
 
 namespace LAMMPS_NS {
@@ -46,15 +46,15 @@ class PairDPDIntel : public PairDPD {
   void settings(int, char **);
   void init_style();
   void read_restart_settings(FILE *);
- 
+
  private:
   FixIntel *fix;
   int _cop, _onetype, _nrandom_thread;
 
-  #ifdef LMP_NO_MKL_RNG
-  RanMars **random_thread;
-  #else
+  #ifdef LMP_USE_MKL_RNG
   VSLStreamStatePtr *random_thread;
+  #else
+  RanMars **random_thread;
   #endif
 
   template <class flt_t> class ForceConst;
@@ -86,7 +86,7 @@ class PairDPDIntel : public PairDPD {
     ~ForceConst() { set_ntypes(0, 0, 0, NULL, _cop); }
 
     void set_ntypes(const int ntypes, const int nthreads, const int max_nbors, 
-		    Memory *memory, const int cop);
+                    Memory *memory, const int cop);
 
    private:
     int _ntypes, _cop;

From 466fde6443bf2c7c7b96502cc3ceecb0a24c979f Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 2 Oct 2017 21:20:26 -0400
Subject: [PATCH 33/53] update documentation for the reversal in the
 INTEL_MKL_RNG define

---
 doc/src/accelerate_intel.txt | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index c858ca0940..e585209cf5 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -82,10 +82,11 @@ this order :l
 The {newton} setting applies to all atoms, not just atoms shared
 between MPI tasks :l
 Vectorization can change the order for adding pairwise forces :l
-Unless specified otherwise at build time, the random number 
-generator for dissipative particle dynamics uses the Mersenne 
-Twister generator (that should be more robust than the standard
-generator) :l
+When using the -DLMP_USE_MKL_RNG define (all included Intel-optimized
+makefiles do) at build time, the random number generator for
+dissipative particle dynamics (pair style dpd/intel) uses the Mersenne
+Twister generator included in the Intel MKL library (that should be
+more robust than the default Marsaglia random number generator) :l
 :ule
 
 The precision mode (described below) used with the USER-INTEL

From d2aa05cb3661497c70204ae8ea0822689123ebff Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 2 Oct 2017 21:24:51 -0400
Subject: [PATCH 34/53] update README in USER-INTEL for recent LRT logic
 reversal

---
 src/USER-INTEL/README | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index 35cde38f15..edfc69120c 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -42,11 +42,11 @@ available. This allow for running most styles in LAMMPS with threading.
 
 -----------------------------------------------------------------------------
 
-The Long-Range Thread mode (LRT) in the Intel package currently uses
-pthreads by default. If pthreads are not supported in the build environment,
-the compile flag "-DLMP_INTEL_NOLRT" will disable the feature to allow for 
-builds without pthreads. Alternatively, "-DLMP_INTEL_LRT11" can be used to
-build with compilers that support threads using the C++11 standard. When using
+The Long-Range Thread mode (LRT) in the Intel package is enabled through the
+-DLMP_INTEL_USELRT define at compile time. All Intel-optimized makefiles
+include this define. This feature will use pthreads by default.
+Alternatively, "-DLMP_INTEL_LRT11" can be used to build with compilers that
+support threads intrinsically using the C++11 standard. When using
 LRT mode, you might need to disable OpenMP affinity settings (e.g.
 export KMP_AFFINITY=none). LAMMPS will generate a warning if the settings
 need to be changed.

From 5e89269631263f7b800e6db09546f580d93b03a9 Mon Sep 17 00:00:00 2001
From: Michael Brown <michael.w.brown@intel.com>
Date: Mon, 2 Oct 2017 23:41:14 -0700
Subject: [PATCH 35/53] Minor adjustments to intel makefiles and documentation
 based on the reversed preprocessor logic and default memory align. Removing
 knl_coprocessor makefile.

---
 doc/src/accelerate_intel.txt                  |  42 +++---
 src/MAKE/MACHINES/Makefile.cori2              |   7 +-
 src/MAKE/OPTIONS/Makefile.intel_cpu           |   9 +-
 src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi  |   5 +-
 src/MAKE/OPTIONS/Makefile.intel_cpu_mpich     |   6 +-
 src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi   |   8 +-
 .../OPTIONS/Makefile.intel_knl_coprocessor    | 125 ------------------
 src/MAKE/OPTIONS/Makefile.knl                 |   6 +-
 src/USER-INTEL/README                         |  11 +-
 src/USER-INTEL/verlet_lrt_intel.cpp           |   2 +-
 src/USER-INTEL/verlet_lrt_intel.h             |   5 +-
 11 files changed, 50 insertions(+), 176 deletions(-)
 delete mode 100644 src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor

diff --git a/doc/src/accelerate_intel.txt b/doc/src/accelerate_intel.txt
index e585209cf5..aaa38d7de2 100644
--- a/doc/src/accelerate_intel.txt
+++ b/doc/src/accelerate_intel.txt
@@ -27,12 +27,12 @@ LAMMPS to run on the CPU cores and coprocessor cores simultaneously.
 Angle Styles: charmm, harmonic :ulb,l
 Bond Styles: fene, fourier, harmonic :l
 Dihedral Styles: charmm, harmonic, opls :l
-Fixes: nve, npt, nvt, nvt/sllod :l
+Fixes: nve, npt, nvt, nvt/sllod, nve/asphere :l
 Improper Styles: cvff, harmonic :l
 Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, 
 buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, 
-lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, rebo,
-sw, tersoff :l
+lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, 
+rebo, sw, tersoff :l
 K-Space Styles: pppm, pppm/disp :l
 :ule
 
@@ -54,11 +54,12 @@ warmup run (for use with offload benchmarks).
 :c,image(JPG/user_intel.png)
 
 Results are speedups obtained on Intel Xeon E5-2697v4 processors
-(code-named Broadwell) and Intel Xeon Phi 7250 processors
-(code-named Knights Landing) with "June 2017" LAMMPS built with
-Intel Parallel Studio 2017 update 2. Results are with 1 MPI task
-per physical core. See {src/USER-INTEL/TEST/README} for the raw
-simulation rates and instructions to reproduce.
+(code-named Broadwell), Intel Xeon Phi 7250 processors (code-named
+Knights Landing), and Intel Xeon Gold 6148 processors (code-named
+Skylake) with "June 2017" LAMMPS built with Intel Parallel Studio
+2017 update 2. Results are with 1 MPI task per physical core. See
+{src/USER-INTEL/TEST/README} for the raw simulation rates and
+instructions to reproduce.
 
 :line
 
@@ -113,7 +114,7 @@ $t should be 2 for Intel Xeon CPUs and 2 or 4 for Intel Xeon Phi :l
 For some of the simple 2-body potentials without long-range
 electrostatics, performance and scalability can be better with
 the "newton off" setting added to the input script :l
-For simulations on higher node counts, add "processors * * * grid 
+For simulations on higher node counts, add "processors * * * grid
 numa" to the beginning of the input script for better scalability :l
 If using {kspace_style pppm} in the input script, add
 "kspace_modify diff ad" for better performance :l
@@ -124,8 +125,8 @@ For Intel Xeon Phi CPUs:
 Runs should be performed using MCDRAM. :ulb,l
 :ule
 
-For simulations using {kspace_style pppm} on Intel CPUs
-supporting AVX-512:
+For simulations using {kspace_style pppm} on Intel CPUs supporting
+AVX-512:
 
 Add "kspace_modify diff ad" to the input script :ulb,l
 The command-line option should be changed to
@@ -242,14 +243,17 @@ However, if you do not have coprocessors on your system, building
 without offload support will produce a smaller binary.
 
 The general requirements for Makefiles with the USER-INTEL package
-are as follows. "-DLAMMPS_MEMALIGN=64" is required for CCFLAGS. When
-using Intel compilers, "-restrict" is required and "-qopenmp" is
-highly recommended for CCFLAGS and LINKFLAGS. LIB should include
-"-ltbbmalloc". For builds supporting offload, "-DLMP_INTEL_OFFLOAD"
-is required for CCFLAGS and "-qoffload" is required for LINKFLAGS.
-Other recommended CCFLAG options for best performance are
-"-O2 -fno-alias -ansi-alias -qoverride-limits fp-model fast=2
--no-prec-div".
+are as follows. When using Intel compilers, "-restrict" is required 
+and "-qopenmp" is highly recommended for CCFLAGS and LINKFLAGS. 
+CCFLAGS should include "-DLMP_INTEL_USELRT" (unless POSIX Threads
+are not supported in the build environment) and "-DLMP_USE_MKL_RNG"
+(unless Intel Math Kernel Library (MKL) is not available in the build
+environment). For Intel compilers, LIB should include "-ltbbmalloc" 
+or if the library is not available, "-DLMP_INTEL_NO_TBB" can be added
+to CCFLAGS. For builds supporting offload, "-DLMP_INTEL_OFFLOAD" is
+required for CCFLAGS and "-qoffload" is required for LINKFLAGS. Other
+recommended CCFLAG options for best performance are "-O2 -fno-alias
+-ansi-alias -qoverride-limits -fp-model fast=2 -no-prec-div".
 
 NOTE: The vectorization and math capabilities can differ depending on
 the CPU. For Intel compilers, the "-x" flag specifies the type of
diff --git a/src/MAKE/MACHINES/Makefile.cori2 b/src/MAKE/MACHINES/Makefile.cori2
index a367d54080..45e1ab1f8a 100755
--- a/src/MAKE/MACHINES/Makefile.cori2
+++ b/src/MAKE/MACHINES/Makefile.cori2
@@ -15,13 +15,14 @@ SHELL = /bin/sh
 
 CC =		CC
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_NO_TBB
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG -DLMP_INTEL_NO_TBB \
+                $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		CC
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu b/src/MAKE/OPTIONS/Makefile.intel_cpu
index 2c3cc51249..41d0f959fe 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu
@@ -8,15 +8,14 @@ SHELL = /bin/sh
 
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) \
-		-DLMP_USE_MKL_RNG
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           -ltbbmalloc -ltbbmalloc_proxy
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
index ff2d0cc5c2..ef514f43c6 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_intelmpi
@@ -8,9 +8,8 @@ SHELL = /bin/sh
 
 CC =		mpiicpc 
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) \
-		-DLMP_USE_MKL_RNG -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
index 40d517bce4..68f879860a 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_mpich
@@ -8,13 +8,13 @@ SHELL = /bin/sh
 
 CC =		mpicxx -cxx=icc
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx -cxx=icc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           
 SIZE =		size
 
diff --git a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
index fe1be99e58..457a64b223 100644
--- a/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
+++ b/src/MAKE/OPTIONS/Makefile.intel_cpu_openmpi
@@ -9,14 +9,14 @@ SHELL = /bin/sh
 export OMPI_CXX = icc
 CC =		mpicxx
 OPTFLAGS =      -xHost -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-g -qopenmp -DLAMMPS_MEMALIGN=64 -no-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS) -DLMP_INTEL_USELRT
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpicxx
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
-LIB =           -ltbbmalloc -ltbbmalloc_proxy
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
+LIB =           -ltbbmalloc
 SIZE =		size
 
 ARCHIVE =	ar
diff --git a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor b/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
deleted file mode 100644
index 769c166105..0000000000
--- a/src/MAKE/OPTIONS/Makefile.intel_knl_coprocessor
+++ /dev/null
@@ -1,125 +0,0 @@
-# intel_phi = USER-INTEL with Phi x200 (KNL) offload support,Intel MPI,MKL FFT
-
-SHELL = /bin/sh
-
-# ---------------------------------------------------------------------
-# compiler/linker settings
-# specify flags and libraries needed for your compiler
-
-CC =		mpiicpc 
-MIC_OPT =       -qoffload-arch=mic-avx512 -fp-model fast=2
-CCFLAGS =	-O3 -qopenmp -DLMP_INTEL_OFFLOAD -DLAMMPS_MEMALIGN=64 \
-		-xHost -fno-alias -ansi-alias -restrict \
-		-qoverride-limits $(MIC_OPT) -DLMP_INTEL_USELRT \
-		-DLMP_USE_MKL_RNG
-
-SHFLAGS =	-fPIC
-DEPFLAGS =	-M
-
-LINK =		mpiicpc
-LINKFLAGS =	-g -O3 -xHost -qopenmp -qoffload $(MIC_OPT)
-LIB =           -ltbbmalloc
-SIZE =		size
-
-ARCHIVE =	ar
-ARFLAGS =	-rc
-SHLIBFLAGS =	-shared
-
-# ---------------------------------------------------------------------
-# LAMMPS-specific settings, all OPTIONAL
-# specify settings for LAMMPS features you will use
-# if you change any -D setting, do full re-compile after "make clean"
-
-# LAMMPS ifdef settings
-# see possible settings in Section 2.2 (step 4) of manual
-
-LMP_INC =	-DLAMMPS_GZIP -DLAMMPS_JPEG
-
-# MPI library
-# see discussion in Section 2.2 (step 5) of manual
-# MPI wrapper compiler/linker can provide this info
-# can point to dummy MPI library in src/STUBS as in Makefile.serial
-# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
-# INC = path for mpi.h, MPI compiler settings
-# PATH = path for MPI library
-# LIB = name of MPI library
-
-MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
-MPI_PATH = 
-MPI_LIB =
-
-# FFT library
-# see discussion in Section 2.2 (step 6) of manaul
-# can be left blank to use provided KISS FFT library
-# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
-# PATH = path for FFT library
-# LIB = name of FFT library
-
-FFT_INC =      -DFFT_MKL -DFFT_SINGLE
-FFT_PATH = 
-FFT_LIB =	-L$(MKLROOT)/lib/intel64/ -lmkl_intel_ilp64 -lmkl_sequential -lmkl_core
-
-# JPEG and/or PNG library
-# see discussion in Section 2.2 (step 7) of manual
-# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
-# INC = path(s) for jpeglib.h and/or png.h
-# PATH = path(s) for JPEG library and/or PNG library
-# LIB = name(s) of JPEG library and/or PNG library
-
-JPG_INC =       
-JPG_PATH = 	
-JPG_LIB =	-ljpeg
-
-# ---------------------------------------------------------------------
-# build rules and dependencies
-# do not edit this section
-
-include	Makefile.package.settings
-include	Makefile.package
-
-EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
-EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
-EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
-EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
-EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
-
-# Path to src files
-
-vpath %.cpp ..
-vpath %.h ..
-
-# Link target
-
-$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
-	$(SIZE) $(EXE)
-
-# Library targets
-
-lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
-
-shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
-        $(OBJ) $(EXTRA_LIB) $(LIB)
-
-# Compilation rules
-
-%.o:%.cpp $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-%.d:%.cpp $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(EXTRA_INC) $(DEPFLAGS) $< > $@
-
-%.o:%.cu $(EXTRA_CPP_DEPENDS)
-	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
-
-# Individual dependencies
-
-depend : fastdep.exe $(SRC)
-	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
-
-fastdep.exe: ../DEPEND/fastdep.c
-	cc -O -o $@ $<
-
-sinclude .depend
diff --git a/src/MAKE/OPTIONS/Makefile.knl b/src/MAKE/OPTIONS/Makefile.knl
index 881c51f0e4..8e266a4fce 100644
--- a/src/MAKE/OPTIONS/Makefile.knl
+++ b/src/MAKE/OPTIONS/Makefile.knl
@@ -8,13 +8,13 @@ SHELL = /bin/sh
 
 CC =		mpiicpc
 OPTFLAGS =      -xMIC-AVX512 -O2 -fp-model fast=2 -no-prec-div -qoverride-limits
-CCFLAGS =	-qopenmp -DLAMMPS_MEMALIGN=64 -qno-offload \
-                -fno-alias -ansi-alias -restrict $(OPTFLAGS)
+CCFLAGS =	-qopenmp -qno-offload -fno-alias -ansi-alias -restrict \
+                -DLMP_INTEL_USELRT -DLMP_USE_MKL_RNG $(OPTFLAGS)
 SHFLAGS =	-fPIC
 DEPFLAGS =	-M
 
 LINK =		mpiicpc
-LINKFLAGS =	-g -qopenmp $(OPTFLAGS)
+LINKFLAGS =	-qopenmp $(OPTFLAGS)
 LIB =           -ltbbmalloc
 SIZE =		size
 
diff --git a/src/USER-INTEL/README b/src/USER-INTEL/README
index edfc69120c..871d881f39 100644
--- a/src/USER-INTEL/README
+++ b/src/USER-INTEL/README
@@ -53,12 +53,11 @@ need to be changed.
 
 -----------------------------------------------------------------------------
 
-The random number generator for Dissipative Particle Dynamics (DPD) in the 
-Intel package uses the Mersenne Twister pseudorandom number generator as 
-implemented in the Intel Math Kernel Library (MKL). This generator is faster
-and more robust with a significantly longer period than the default DPD
-generator. However, if MKL is not installed, the standard random number
-generator can be used by adding the compile flag "-DLMP_NO_MKL_RNG".
+Unless Intel Math Kernel Library (MKL) is unavailable, -DLMP_USE_MKL_RNG
+should be added to the compile flags. This will enable using the MKL Mersenne
+Twister random number generator (RNG) for Dissipative Particle Dynamics 
+(DPD). This RNG can allow significantly faster performance and it also has a 
+significantly longer period than the standard RNG for DPD.
 
 -----------------------------------------------------------------------------
 
diff --git a/src/USER-INTEL/verlet_lrt_intel.cpp b/src/USER-INTEL/verlet_lrt_intel.cpp
index 81f4586143..9ff5f85176 100644
--- a/src/USER-INTEL/verlet_lrt_intel.cpp
+++ b/src/USER-INTEL/verlet_lrt_intel.cpp
@@ -68,7 +68,7 @@ void VerletLRTIntel::init()
 
   _intel_kspace = (PPPMIntel*)(force->kspace_match("pppm/intel", 0));
 
-  #ifdef LMP_INTEL_NOLRT
+  #ifndef LMP_INTEL_USELRT
   error->all(FLERR,
              "LRT otion for Intel package disabled at compile time");
   #endif
diff --git a/src/USER-INTEL/verlet_lrt_intel.h b/src/USER-INTEL/verlet_lrt_intel.h
index 813cd53605..0d7154ff64 100644
--- a/src/USER-INTEL/verlet_lrt_intel.h
+++ b/src/USER-INTEL/verlet_lrt_intel.h
@@ -23,10 +23,7 @@ IntegrateStyle(verlet/lrt/intel,VerletLRTIntel)
 #include "verlet.h"
 #include "pppm_intel.h"
 
-#ifndef LMP_INTEL_USELRT
-#define LMP_INTEL_NOLRT
-#else
-
+#ifdef LMP_INTEL_USELRT
 #ifdef LMP_INTEL_LRT11
 #define _LMP_INTEL_LRT_11
 #include <thread>

From 9dc42fd4db713cb74d52697d0e1af2f6404867e3 Mon Sep 17 00:00:00 2001
From: Michael Brown <michael.w.brown@intel.com>
Date: Mon, 2 Oct 2017 23:53:05 -0700
Subject: [PATCH 36/53] intel_simd.h is currently also needed by
 dihedral/charmm, not just sw.

---
 src/USER-INTEL/Install.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-INTEL/Install.sh b/src/USER-INTEL/Install.sh
index f7163e6791..da553d158a 100644
--- a/src/USER-INTEL/Install.sh
+++ b/src/USER-INTEL/Install.sh
@@ -46,7 +46,7 @@ action nbin_intel.h
 action nbin_intel.cpp
 action npair_intel.h
 action npair_intel.cpp
-action intel_simd.h pair_sw_intel.cpp
+action intel_simd.h
 action intel_intrinsics.h pair_tersoff_intel.cpp
 action intel_intrinsics_airebo.h pair_airebo_intel.cpp
 

From 197f08278442df28c96eaaf2a3b9f350d7432dae Mon Sep 17 00:00:00 2001
From: James Barnett <jwb2162@columbia.edu>
Date: Tue, 3 Oct 2017 11:15:44 -0400
Subject: [PATCH 37/53] cmake: Add -restrict for Intel compilers for some
 packages

Some packages (USER-OMP, OPT, and USER-INTEL) require the -restrict
flag when using the Intel compiler.
---
 cmake/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index bc33da60de..9a74a788d0 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -150,6 +150,11 @@ if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 endif()
 
+if((ENABLE_USER-OMP OR ENABLE_OPT OR ENABLE_USER-INTEL) AND
+        (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel"))
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
+endif()
+
 if(ENABLE_KSPACE)
   set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
   set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)

From ca032f21fbfa9f5c3de41e09b7c94be220ebfc07 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 3 Oct 2017 10:14:24 -0600
Subject: [PATCH 38/53] Add Kokkos threaded reverse comm option

---
 doc/src/package.txt                   |  12 +-
 src/KOKKOS/atom_vec_atomic_kokkos.cpp | 444 -------------------
 src/KOKKOS/atom_vec_atomic_kokkos.h   |  18 -
 src/KOKKOS/atom_vec_bond_kokkos.cpp   | 442 -------------------
 src/KOKKOS/atom_vec_bond_kokkos.h     |  18 -
 src/KOKKOS/atom_vec_charge_kokkos.cpp | 391 -----------------
 src/KOKKOS/atom_vec_charge_kokkos.h   |  18 -
 src/KOKKOS/atom_vec_dpd_kokkos.h      |   3 -
 src/KOKKOS/atom_vec_full_kokkos.cpp   | 446 --------------------
 src/KOKKOS/atom_vec_full_kokkos.h     |  18 -
 src/KOKKOS/atom_vec_kokkos.cpp        | 586 ++++++++++++++++++++++++++
 src/KOKKOS/atom_vec_kokkos.h          |  41 +-
 src/KOKKOS/comm_kokkos.cpp            | 119 ++++--
 src/KOKKOS/comm_kokkos.h              |   7 +-
 src/KOKKOS/kokkos.cpp                 |  27 +-
 src/KOKKOS/kokkos.h                   |   2 +
 src/comm_brick.cpp                    |   6 +-
 17 files changed, 748 insertions(+), 1850 deletions(-)

diff --git a/doc/src/package.txt b/doc/src/package.txt
index 58f6a5e34d..5c698934e8 100644
--- a/doc/src/package.txt
+++ b/doc/src/package.txt
@@ -62,7 +62,7 @@ args = arguments specific to the style :l
       {no_affinity} values = none
   {kokkos} args = keyword value ...
     zero or more keyword/value pairs may be appended
-    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward}
+    keywords = {neigh} or {neigh/qeq} or {newton} or {binsize} or {comm} or {comm/exchange} or {comm/forward} or {comm/reverse}
       {neigh} value = {full} or {half}
         full = full neighbor list
         half = half neighbor list built in thread-safe manner
@@ -75,9 +75,10 @@ args = arguments specific to the style :l
       {binsize} value = size
         size = bin size for neighbor list construction (distance units)
       {comm} value = {no} or {host} or {device}
-        use value for both comm/exchange and comm/forward
+        use value for comm/exchange and comm/forward and comm/reverse
       {comm/exchange} value = {no} or {host} or {device}
       {comm/forward} value = {no} or {host} or {device}
+      {comm/reverse} value = {no} or {host} or {device}
         no = perform communication pack/unpack in non-KOKKOS mode
         host = perform pack/unpack on host (e.g. with OpenMP threading)
         device = perform pack/unpack on device (e.g. on GPU)
@@ -429,17 +430,18 @@ Coulombic solver"_kspace_style.html because the GPU is faster at
 performing pairwise interactions, then this rule of thumb may give too
 large a binsize.
 
-The {comm} and {comm/exchange} and {comm/forward} keywords determine
+The {comm} and {comm/exchange} and {comm/forward} and {comm/reverse} keywords determine
 whether the host or device performs the packing and unpacking of data
 when communicating per-atom data between processors.  "Exchange"
 communication happens only on timesteps that neighbor lists are
 rebuilt.  The data is only for atoms that migrate to new processors.
-"Forward" communication happens every timestep.  The data is for atom
+"Forward" communication happens every timestep. "Reverse" communication
+happens every timestep if the {newton} option is on.  The data is for atom
 coordinates and any other atom properties that needs to be updated for
 ghost atoms owned by each processor.
 
 The {comm} keyword is simply a short-cut to set the same value
-for both the {comm/exchange} and {comm/forward} keywords.
+for all of the {comm/exchange} and {comm/forward} and {comm/reverse} keywords.
 
 The value options for all 3 keywords are {no} or {host} or {device}.
 A value of {no} means to use the standard non-KOKKOS method of
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index b63dc5fb8c..6c610c8c11 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -136,450 +136,6 @@ void AtomVecAtomicKokkos::copy(int i, int j, int delflag)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecAtomicKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecAtomicKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecAtomicKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecAtomicKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecAtomicKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecAtomicKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecAtomicKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecAtomicKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecAtomicKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecAtomicKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0) {
-    sync(Host,F_MASK);
-    modified(Host,F_MASK);
-  }
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecAtomicKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.h b/src/KOKKOS/atom_vec_atomic_kokkos.h
index 5e9a72c2e3..e4d2654e2c 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.h
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.h
@@ -33,12 +33,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   virtual ~AtomVecAtomicKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   void unpack_border(int, int, double *);
@@ -55,15 +49,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -99,9 +84,6 @@ class AtomVecAtomicKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::tdual_int_1d k_count;
 };
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index e0f29a27bb..076144420c 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -178,448 +178,6 @@ void AtomVecBondKokkos::copy(int i, int j, int delflag)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecBondKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecBondKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
-                                        const DAT::tdual_int_2d &list,
-                                        const int & iswap,
-                                        const DAT::tdual_xfloat_2d &buf,
-                                        const int &pbc_flag,
-                                        const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecBondKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecBondKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecBondKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-										const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecBondKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecBondKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecBondKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                     int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecBondKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecBondKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecBondKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.h b/src/KOKKOS/atom_vec_bond_kokkos.h
index 3dcc99fa78..7ec15450ef 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.h
+++ b/src/KOKKOS/atom_vec_bond_kokkos.h
@@ -32,12 +32,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   virtual ~AtomVecBondKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
@@ -59,15 +53,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -112,9 +97,6 @@ class AtomVecBondKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::t_tagint_1d d_molecule;
   DAT::t_int_2d d_nspecial;
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index 89f7e91c2b..7b8b74b405 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -199,397 +199,6 @@ struct AtomVecChargeKokkos_PackComm {
 
 /* ---------------------------------------------------------------------- */
 
-int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
-                                          const DAT::tdual_int_2d &list,
-                                          const int & iswap,
-                                          const DAT::tdual_xfloat_2d &buf,
-                                          const int &pbc_flag,
-                                          const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecChargeKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecChargeKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecChargeKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
-                                        const int nfirst, const int &pbc_flag, const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecChargeKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
-          domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecChargeKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecChargeKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm(int n, int *list, double *buf,
-                             int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                 int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecChargeKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecChargeKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecChargeKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.h b/src/KOKKOS/atom_vec_charge_kokkos.h
index f9b385e7ed..e9ff70bbe1 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.h
+++ b/src/KOKKOS/atom_vec_charge_kokkos.h
@@ -33,12 +33,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   virtual ~AtomVecChargeKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
@@ -60,15 +54,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -108,9 +93,6 @@ class AtomVecChargeKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::t_float_1d d_q;
 
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h
index 372404cc7d..cec1b82357 100644
--- a/src/KOKKOS/atom_vec_dpd_kokkos.h
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@@ -111,9 +111,6 @@ class AtomVecDPDKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::tdual_int_1d k_count;
 };
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index fd7eaf7c81..8e9abe4067 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -307,452 +307,6 @@ void AtomVecFullKokkos::copy(int i, int j, int delflag)
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecFullKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecFullKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().dimension_0()
-				*buf.view<DeviceType>().dimension_1())/3;
-        const size_t elements = 3;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
-                                             const DAT::tdual_int_2d &list,
-                                             const int & iswap,
-                                             const DAT::tdual_xfloat_2d &buf,
-                                             const int &pbc_flag,
-                                             const int* const pbc)
-{
-  // Check whether to always run forward communication on the host
-  // Choose correct forward PackComm kernel
-
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,1,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,1,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,0,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPHostType,0,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,1,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,1>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      } else {
-        struct AtomVecFullKokkos_PackComm<LMPDeviceType,0,0>
-          f(atomKK->k_x,buf,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-            domain->xy,domain->xz,domain->yz,pbc);
-        Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-
-	return n*size_forward;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecFullKokkos_PackCommSelf {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_x_array _xw;
-  int _nfirst;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-
-  AtomVecFullKokkos_PackCommSelf(
-      const typename DAT::tdual_x_array &x,
-      const int &nfirst,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-    _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),
-    _list(list.view<DeviceType>()),_iswap(iswap),
-    _xprd(xprd),_yprd(yprd),_zprd(zprd),
-    _xy(xy),_xz(xz),_yz(yz) {
-    _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-    _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _xw(i+_nfirst,0) = _x(j,0);
-          _xw(i+_nfirst,1) = _x(j,1);
-          _xw(i+_nfirst,2) = _x(j,2);
-      } else {
-        if (TRICLINIC == 0) {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        } else {
-          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
-        }
-      }
-
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                                           const int & iswap,
-                                           const int nfirst, const int &pbc_flag,
-                                           const int* const pbc) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,1,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPHostType,0,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    if(pbc_flag) {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,1,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    } else {
-      if(domain->triclinic) {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,1>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      } else {
-      struct AtomVecFullKokkos_PackCommSelf<LMPDeviceType,0,0>
-        f(atomKK->k_x,nfirst,list,iswap,domain->xprd,domain->yprd,domain->zprd,
-          domain->xy,domain->xz,domain->yz,pbc);
-      Kokkos::parallel_for(n,f);
-      }
-    }
-  }
-	return n*3;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecFullKokkos_UnpackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  int _first;
-
-  AtomVecFullKokkos_UnpackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
-                        _first(first) {};
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
-    const DAT::tdual_xfloat_2d &buf ) {
-  if(commKK->forward_comm_on_host) {
-    sync(Host,X_MASK);
-    modified(Host,X_MASK);
-    struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    sync(Device,X_MASK);
-    modified(Device,X_MASK);
-    struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
-    Kokkos::parallel_for(n,f);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm(int n, int *list, double *buf,
-                                      int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_comm_vel(int n, int *list, double *buf,
-                                          int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
-      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
-      dz = pbc[2]*domain->zprd;
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_comm_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecFullKokkos::pack_reverse(int n, int first, double *buf)
-{
-  if(n > 0)
-    sync(Host,F_MASK);
-
-  int m = 0;
-  const int last = first + n;
-  for (int i = first; i < last; i++) {
-    buf[m++] = h_f(i,0);
-    buf[m++] = h_f(i,1);
-    buf[m++] = h_f(i,2);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecFullKokkos::unpack_reverse(int n, int *list, double *buf)
-{
-  if(n > 0)
-    modified(Host,F_MASK);
-
-  int m = 0;
-  for (int i = 0; i < n; i++) {
-    const int j = list[i];
-    h_f(j,0) += buf[m++];
-    h_f(j,1) += buf[m++];
-    h_f(j,2) += buf[m++];
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType,int PBC_FLAG>
 struct AtomVecFullKokkos_PackBorder {
   typedef DeviceType device_type;
diff --git a/src/KOKKOS/atom_vec_full_kokkos.h b/src/KOKKOS/atom_vec_full_kokkos.h
index 760df087e1..33760a8b5f 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.h
+++ b/src/KOKKOS/atom_vec_full_kokkos.h
@@ -32,12 +32,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   virtual ~AtomVecFullKokkos() {}
   void grow(int);
   void copy(int, int, int);
-  int pack_comm(int, int *, double *, int, int *);
-  int pack_comm_vel(int, int *, double *, int, int *);
-  void unpack_comm(int, int, double *);
-  void unpack_comm_vel(int, int, double *);
-  int pack_reverse(int, int, double *);
-  void unpack_reverse(int, int *, double *);
   int pack_border(int, int *, double *, int, int *);
   int pack_border_vel(int, int *, double *, int, int *);
   int pack_border_hybrid(int, int *, double *);
@@ -59,15 +53,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   bigint memory_usage();
 
   void grow_reset();
-  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
-                       const int & iswap,
-                       const DAT::tdual_xfloat_2d &buf,
-                       const int &pbc_flag, const int pbc[]);
-  void unpack_comm_kokkos(const int &n, const int &nfirst,
-                          const DAT::tdual_xfloat_2d &buf);
-  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
-                     const int & iswap, const int nfirst,
-                     const int &pbc_flag, const int pbc[]);
   int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                          DAT::tdual_xfloat_2d buf,int iswap,
                          int pbc_flag, int *pbc, ExecutionSpace space);
@@ -125,9 +110,6 @@ class AtomVecFullKokkos : public AtomVecKokkos {
   DAT::t_x_array d_x;
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
-  HAT::t_x_array h_x;
-  HAT::t_v_array h_v;
-  HAT::t_f_array h_f;
 
   DAT::t_float_1d d_q;
   HAT::t_float_1d h_q;
diff --git a/src/KOKKOS/atom_vec_kokkos.cpp b/src/KOKKOS/atom_vec_kokkos.cpp
index 5542991395..03fb2a4ead 100644
--- a/src/KOKKOS/atom_vec_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_kokkos.cpp
@@ -12,6 +12,10 @@
 ------------------------------------------------------------------------- */
 
 #include "atom_vec_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "atom_masks.h"
 
 using namespace LAMMPS_NS;
 
@@ -24,3 +28,585 @@ AtomVecKokkos::AtomVecKokkos(LAMMPS *lmp) : AtomVec(lmp)
   buffer_size = 0;
 }
 
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecKokkos_PackComm { // kernel functor: copies x of listed atoms into a buffer
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x; // source coordinates
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf; // destination, written as (i,0..2)
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list; // atom indices, read as (swap,i)
+  const int _iswap; // row of _list this kernel reads
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; // box extents and tilt factors supplied by the caller
+  X_FLOAT _pbc[6]; // private copy of the caller's six pbc multipliers
+
+  AtomVecKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3; // total entries of buf divided into rows of 3
+        const size_t elements = 3; // x,y,z per atom
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements); // helper defined elsewhere; presumably points _buf at buf as maxsend x elements - confirm
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const { // packs the i-th atom of the swap into row i of _buf
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) { // no periodic shift: plain copy
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) { // shift each axis by its pbc multiplier times the box extent
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else { // tilt factors xy,xz,yz also contribute to the x and y shifts
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_xfloat_2d &buf,
+                                          const int &pbc_flag,
+                                          const int* const pbc)
+{
+  // Pack x of the n atoms in row iswap of list into buf with a PackComm kernel.
+  // commKK->forward_comm_on_host picks the host or device kernel; pbc_flag and
+  // domain->triclinic pick its <PBC_FLAG,TRICLINIC> template arguments.
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK); // sync x to the host before the kernel reads it
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {
+    sync(Device,X_MASK); // sync x to the device before the kernel reads it
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+
+	return n*size_forward; // NOTE(review): the kernel writes 3 values per atom; confirm size_forward == 3 for every style that reaches this path
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecKokkos_PackCommSelf { // kernel functor: copies x of listed atoms to other rows of the same x array
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x; // read handle on x
+  typename ArrayTypes<DeviceType>::t_x_array _xw; // write handle on the same x (both come from x.view<DeviceType>())
+  int _nfirst; // first destination row
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list; // atom indices, read as (swap,i)
+  const int _iswap; // row of _list this kernel reads
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz; // box extents and tilt factors supplied by the caller
+  X_FLOAT _pbc[6]; // private copy of the caller's six pbc multipliers
+
+  AtomVecKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const { // source row j = _list(_iswap,i), destination row i+_nfirst
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) { // no periodic shift: plain copy
+          _xw(i+_nfirst,0) = _x(j,0);
+          _xw(i+_nfirst,1) = _x(j,1);
+          _xw(i+_nfirst,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) { // shift each axis by its pbc multiplier times the box extent
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else { // tilt factors xy,xz,yz also contribute to the x and y shifts
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                        const int nfirst, const int &pbc_flag, const int* const pbc) { // copies x of n listed atoms to rows nfirst.. of x; returns n*3
+  if(commKK->forward_comm_on_host) { // host kernels
+    sync(Host,X_MASK);
+    modified(Host,X_MASK); // x is written in place by the kernel below
+    if(pbc_flag) { // template args are <space, PBC_FLAG, TRICLINIC>
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+  } else { // device kernels, same four variants
+    sync(Device,X_MASK);
+    modified(Device,X_MASK); // x is written in place by the kernel below
+    if(pbc_flag) {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+	return n*3; // three coordinates per atom
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnpackComm { // kernel functor: copies buffer rows into consecutive rows of x
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array _x; // destination coordinates
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf; // source, read as (i,0..2)
+  int _first; // destination row for buffer row 0
+
+  AtomVecKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):_x(x.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const { // buffer row i goes to x row i+_first
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm_kokkos(const int &n, const int &first,
+    const DAT::tdual_xfloat_2d &buf ) { // copy n position triples from buf into x rows first..first+n-1
+  if (!commKK->forward_comm_on_host) {
+    sync(Device,X_MASK);
+    modified(Device,X_MASK);
+    AtomVecKokkos_UnpackComm<LMPDeviceType> unpacker(atomKK->k_x,buf,first);
+    Kokkos::parallel_for(n,unpacker);
+  } else {
+    sync(Host,X_MASK);
+    modified(Host,X_MASK);
+    AtomVecKokkos_UnpackComm<LMPHostType> unpacker(atomKK->k_x,buf,first);
+    Kokkos::parallel_for(n,unpacker);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  // pack h_x of the n atoms in list into buf, 3 values per atom; returns values packed
+  int m = 0;
+
+  if (pbc_flag != 0) {
+    double dx,dy,dz;
+    if (domain->triclinic != 0) {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (int k = 0; k < n; k++) {
+      const int j = list[k];
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+    }
+  } else {
+    for (int k = 0; k < n; k++) {
+      const int j = list[k];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  // pack h_x and h_v of the n atoms in list into buf, 6 values per atom; returns values packed
+  m = 0;
+  if (pbc_flag == 0) { // no periodic shift: copy x and v unchanged
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+  } else {
+    if (domain->triclinic == 0) { // position shift from pbc multipliers and box extents
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else { // triclinic: tilt factors also contribute to dx and dy
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) { // shift positions only
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+      }
+    } else { // also shift velocities, using h_rate, for atoms whose mask matches deform_groupbit
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (atom->mask[j] & deform_groupbit) { // test the atom being packed (j), not the loop counter
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+      }
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm(int n, int first, double *buf)
+{
+  // copy n consecutive xyz triples from buf into h_x rows first..first+n-1
+
+  const double *src = buf;
+  const int last = first + n;
+  for (int row = first; row < last; row++) {
+    h_x(row,0) = *src++;
+    h_x(row,1) = *src++;
+    h_x(row,2) = *src++;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  // each atom takes 6 consecutive values of buf: x,y,z into h_x, then 3 more into h_v
+
+  const double *src = buf;
+  const int last = first + n;
+  for (int row = first; row < last; row++) {
+    h_x(row,0) = *src++;
+    h_x(row,1) = *src++;
+    h_x(row,2) = *src++;
+    h_v(row,0) = *src++;
+    h_v(row,1) = *src++;
+    h_v(row,2) = *src++;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_PackReverse { // kernel functor: copies consecutive rows of f into a buffer
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_f_array_randomread _f; // source forces
+  typename ArrayTypes<DeviceType>::t_ffloat_2d _buf; // destination, written as (i,0..2)
+  int _first; // source row for buffer row 0
+
+  AtomVecKokkos_PackReverse(
+      const typename DAT::tdual_f_array &f,
+      const typename DAT::tdual_ffloat_2d &buf,
+      const int& first):_f(f.view<DeviceType>()),_buf(buf.view<DeviceType>()),
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const { // f row i+_first goes to buffer row i
+    _buf(i,0) = _f(i+_first,0);
+    _buf(i,1) = _f(i+_first,1);
+    _buf(i,2) = _f(i+_first,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_reverse_kokkos(const int &n, const int &first,
+    const DAT::tdual_ffloat_2d &buf ) {
+  // copy f rows first..first+n-1 into buf, on the host or device as selected by commKK
+  if (!commKK->reverse_comm_on_host) {
+    sync(Device,F_MASK);
+    AtomVecKokkos_PackReverse<LMPDeviceType> packer(atomKK->k_f,buf,first);
+    Kokkos::parallel_for(n,packer);
+  } else {
+    sync(Host,F_MASK);
+    AtomVecKokkos_PackReverse<LMPHostType> packer(atomKK->k_f,buf,first);
+    Kokkos::parallel_for(n,packer);
+  }
+  return n*size_reverse;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnPackReverseSelf { // kernel functor: adds f rows onto listed atoms of the same f array
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_f_array_randomread _f; // read handle on f
+  typename ArrayTypes<DeviceType>::t_f_array _fw; // write handle on the same f (both come from f.view<DeviceType>())
+  int _nfirst; // first source row
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list; // atom indices, read as (swap,i)
+  const int _iswap; // row of _list this kernel reads
+
+  AtomVecKokkos_UnPackReverseSelf(
+      const typename DAT::tdual_f_array &f,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap):
+      _f(f.view<DeviceType>()),_fw(f.view<DeviceType>()),_nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap) {
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const { // f row i+_nfirst is accumulated onto row j = _list(_iswap,i)
+    const int j = _list(_iswap,i);
+    _fw(j,0) += _f(i+_nfirst,0);
+    _fw(j,1) += _f(i+_nfirst,1);
+    _fw(j,2) += _f(i+_nfirst,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+                                        const int nfirst) { // add f rows nfirst..nfirst+n-1 onto the atoms in row iswap of list
+  if (!commKK->reverse_comm_on_host) {
+    sync(Device,F_MASK);
+    AtomVecKokkos_UnPackReverseSelf<LMPDeviceType> adder(atomKK->k_f,nfirst,list,iswap);
+    Kokkos::parallel_for(n,adder);
+    modified(Device,F_MASK);
+  } else {
+    sync(Host,F_MASK);
+    AtomVecKokkos_UnPackReverseSelf<LMPHostType> adder(atomKK->k_f,nfirst,list,iswap);
+    Kokkos::parallel_for(n,adder);
+    modified(Host,F_MASK);
+  }
+  return n*3; // three force components per atom
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecKokkos_UnPackReverse { // kernel functor: adds buffer rows onto f of listed atoms
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_f_array _f; // forces accumulated into
+  typename ArrayTypes<DeviceType>::t_ffloat_2d_const _buf; // source, read as (i,0..2)
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list; // atom indices, read as (swap,i)
+  const int _iswap; // row of _list this kernel reads
+
+  AtomVecKokkos_UnPackReverse(
+      const typename DAT::tdual_f_array &f,
+      const typename DAT::tdual_ffloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap):
+      _f(f.view<DeviceType>()),_list(list.view<DeviceType>()),_iswap(iswap) {
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/3; // total entries of buf divided into rows of 3
+        const size_t elements = 3; // fx,fy,fz per atom
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements); // helper defined elsewhere; presumably points _buf at buf as maxsend x elements - confirm
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const { // buffer row i is accumulated onto f row j = _list(_iswap,i)
+    const int j = _list(_iswap,i);
+    _f(j,0) += _buf(i,0);
+    _f(j,1) += _buf(i,1);
+    _f(j,2) += _buf(i,2);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_reverse_kokkos(const int &n,
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_ffloat_2d &buf)
+{
+  // add the n force triples in buf onto the atoms listed in row iswap of list
+  // kernel runs on the host when commKK->reverse_comm_on_host is set, else on the device
+
+  if (!commKK->reverse_comm_on_host) {
+    AtomVecKokkos_UnPackReverse<LMPDeviceType> adder(atomKK->k_f,buf,list,iswap);
+    Kokkos::parallel_for(n,adder);
+    modified(Device,F_MASK);
+  } else {
+    AtomVecKokkos_UnPackReverse<LMPHostType> adder(atomKK->k_f,buf,list,iswap);
+    Kokkos::parallel_for(n,adder);
+    modified(Host,F_MASK);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecKokkos::pack_reverse(int n, int first, double *buf)
+{
+  // pack h_f rows first..first+n-1 into buf, 3 values per atom; returns values packed
+  if (n > 0) sync(Host,F_MASK);
+
+  double *out = buf;
+  const int last = first + n;
+  for (int row = first; row < last; row++) {
+    *out++ = h_f(row,0);
+    *out++ = h_f(row,1);
+    *out++ = h_f(row,2);
+  }
+
+  return static_cast<int>(out - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  // accumulate the n force triples in buf onto h_f of the atoms named by list
+  const double *src = buf;
+  for (int k = 0; k < n; k++) {
+    const int j = list[k];
+    h_f(j,0) += *src++;
+    h_f(j,1) += *src++;
+    h_f(j,2) += *src++;
+  }
+
+  if (n > 0) modified(Host,F_MASK);
+}
diff --git a/src/KOKKOS/atom_vec_kokkos.h b/src/KOKKOS/atom_vec_kokkos.h
index 7f593f235f..20a07ec443 100644
--- a/src/KOKKOS/atom_vec_kokkos.h
+++ b/src/KOKKOS/atom_vec_kokkos.h
@@ -35,29 +35,48 @@ class AtomVecKokkos : public AtomVec {
  public:
   AtomVecKokkos(class LAMMPS *);
   virtual ~AtomVecKokkos() {}
+  virtual int pack_comm(int, int *, double *, int, int *);
+  virtual int pack_comm_vel(int, int *, double *, int, int *);
+  virtual void unpack_comm(int, int, double *);
+  virtual void unpack_comm_vel(int, int, double *);
+  virtual int pack_reverse(int, int, double *);
+  virtual void unpack_reverse(int, int *, double *);
 
   virtual void sync(ExecutionSpace space, unsigned int mask) = 0;
   virtual void modified(ExecutionSpace space, unsigned int mask) = 0;
-  virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) {};
+  virtual void sync_overlapping_device(ExecutionSpace space, unsigned int mask) = 0;
 
   virtual int
     pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
                    const int & iswap, const int nfirst,
-                   const int &pbc_flag, const int pbc[]) = 0;
-  //{return 0;}
+                   const int &pbc_flag, const int pbc[]);
+
   virtual int
     pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &list,
                      const int & iswap, const DAT::tdual_xfloat_2d &buf,
-                     const int &pbc_flag, const int pbc[]) = 0;
-  //{return 0;}
+                     const int &pbc_flag, const int pbc[]);
+
   virtual void
     unpack_comm_kokkos(const int &n, const int &nfirst,
-                       const DAT::tdual_xfloat_2d &buf) = 0;
+                       const DAT::tdual_xfloat_2d &buf);
+
+  virtual int
+    unpack_reverse_self(const int &n, const DAT::tdual_int_2d &list,
+                      const int & iswap, const int nfirst);
+
+  virtual int
+    pack_reverse_kokkos(const int &n, const int &nfirst,
+                        const DAT::tdual_ffloat_2d &buf);
+
+  virtual void
+    unpack_reverse_kokkos(const int &n, const DAT::tdual_int_2d &list,
+                          const int & iswap, const DAT::tdual_ffloat_2d &buf);
+
   virtual int
     pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                        DAT::tdual_xfloat_2d buf,int iswap,
                        int pbc_flag, int *pbc, ExecutionSpace space) = 0;
-  //{return 0;};
+
   virtual void
     unpack_border_kokkos(const int &n, const int &nfirst,
                          const DAT::tdual_xfloat_2d &buf,
@@ -68,15 +87,19 @@ class AtomVecKokkos : public AtomVec {
                          DAT::tdual_int_1d k_sendlist,
                          DAT::tdual_int_1d k_copylist,
                          ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi) = 0;
-  //{return 0;};
+
   virtual int
     unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
                            int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
                            ExecutionSpace space) = 0;
-  //{return 0;};
+
 
  protected:
 
+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
   class CommKokkos *commKK;
   size_t buffer_size;
   void* buffer;
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index a8b591e210..d4d348d7e2 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -62,7 +62,7 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp)
     tdual_int_1d("comm:k_exchange_sendlist",100);
   k_exchange_copylist = DAT::
     tdual_int_1d("comm:k_exchange_copylist",100);
-  k_count = DAT::tdual_int_1d("comm:k_count",1);
+  k_count = DAT::tdual_int_scalar("comm:k_count");
   k_sendflag = DAT::tdual_int_1d("comm:k_sendflag",100);
 
   memory->destroy(maxsendlist);
@@ -103,8 +103,10 @@ void CommKokkos::init()
   atomKK = (AtomKokkos *) atom;
   exchange_comm_classic = lmp->kokkos->exchange_comm_classic;
   forward_comm_classic = lmp->kokkos->forward_comm_classic;
+  reverse_comm_classic = lmp->kokkos->reverse_comm_classic;
   exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host;
   forward_comm_on_host = lmp->kokkos->forward_comm_on_host;
+  reverse_comm_on_host = lmp->kokkos->reverse_comm_on_host;
 
   CommBrick::init();
 
@@ -133,8 +135,11 @@ void CommKokkos::init()
   if (force->newton == 0) check_reverse = 0;
   if (force->pair) check_reverse += force->pair->comm_reverse_off;
 
-  if(check_reverse || check_forward)
+  if (check_reverse || check_forward)
     forward_comm_classic = true;
+
+  if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
+    reverse_comm_classic = true;
 }
 
 /* ----------------------------------------------------------------------
@@ -174,7 +179,6 @@ void CommKokkos::forward_comm_device(int dummy)
   int n;
   MPI_Request request;
   AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
-  double **x = atom->x;
   double *buf;
 
   // exchange data with another proc
@@ -184,22 +188,17 @@ void CommKokkos::forward_comm_device(int dummy)
   k_sendlist.sync<DeviceType>();
 
   for (int iswap = 0; iswap < nswap; iswap++) {
-
     if (sendproc[iswap] != me) {
       if (comm_x_only) {
-        atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
-        if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
-        else buf = NULL;
-
         if (size_forward_recv[iswap]) {
+            atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
             buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
               firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
             MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
-                    recvproc[iswap],0,world,&request);
+                      recvproc[iswap],0,world,&request);
         }
         n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,
                                    iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]);
-
         if (n) {
           MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),
                    n,MPI_DOUBLE,sendproc[iswap],0,world);
@@ -249,21 +248,91 @@ void CommKokkos::forward_comm_device(int dummy)
     }
   }
 }
+
+/* ----------------------------------------------------------------------
+   reverse communication of forces on atoms every timestep
+   other per-atom attributes may also be sent via pack/unpack routines
+------------------------------------------------------------------------- */
+
 void CommKokkos::reverse_comm()
 {
+  if (!reverse_comm_classic) {
+    if (reverse_comm_on_host) reverse_comm_device<LMPHostType>();
+    else reverse_comm_device<LMPDeviceType>();
+    return;
+  }
+
   k_sendlist.sync<LMPHostType>();
+
   if (comm_f_only)
     atomKK->sync(Host,F_MASK);
   else
     atomKK->sync(Host,ALL_MASK);
+
   CommBrick::reverse_comm();
+
   if (comm_f_only)
     atomKK->modified(Host,F_MASK);
   else
     atomKK->modified(Host,ALL_MASK);
-  atomKK->sync(Device,ALL_MASK);
+
+  atomKK->sync(Device,ALL_MASK); // is this needed?
 }
 
+template<class DeviceType>
+void CommKokkos::reverse_comm_device()
+{
+  int n;
+  MPI_Request request;
+  AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
+  double *buf;
+  
+  // exchange data with another proc
+  // if other proc is self, just copy
+  // if comm_f_only set, exchange or copy directly from f, don't pack
+  
+  k_sendlist.sync<DeviceType>();
+  
+  for (int iswap = nswap-1; iswap >= 0; iswap--) { // swaps are processed from last to first
+    if (sendproc[iswap] != me) { // partner is another rank: receive from sendproc, send to recvproc
+      if (comm_f_only) { // send straight out of the f array, no pack step
+        if (size_reverse_recv[iswap])
+            MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE,
+                    sendproc[iswap],0,world,&request);
+        if (size_reverse_send[iswap]) {
+          atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
+          buf = atomKK->k_f.view<DeviceType>().ptr_on_device() +
+            firstrecv[iswap]*atomKK->k_f.view<DeviceType>().dimension_1(); // address of f row firstrecv[iswap]
+  
+          MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
+                   recvproc[iswap],0,world);
+        }
+        if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+        atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                         space,F_MASK);
+      } else { // pack through the atom style into k_buf_send, then send that buffer
+        if (size_reverse_recv[iswap])
+          MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
+                    size_reverse_recv[iswap],MPI_DOUBLE,
+                    sendproc[iswap],0,world,&request);
+        n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send);
+        if (n)
+          MPI_Send(k_buf_send.view<DeviceType>().ptr_on_device(),n,
+                   MPI_DOUBLE,recvproc[iswap],0,world);
+        if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
+      }
+      avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap,
+                                k_buf_recv); // both branches unpack the received k_buf_recv via this swap's send list
+    } else { // partner is this rank: no MPI, handled by unpack_reverse_self
+      if (sendnum[iswap])
+        n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap,
+                                 firstrecv[iswap]);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 void CommKokkos::forward_comm_fix(Fix *fix, int size)
 {
   k_sendlist.sync<LMPHostType>();
@@ -409,7 +478,7 @@ struct BuildExchangeListFunctor {
   typename AT::t_x_array _x;
 
   int _nlocal,_dim;
-  typename AT::t_int_1d _nsend;
+  typename AT::t_int_scalar _nsend;
   typename AT::t_int_1d _sendlist;
   typename AT::t_int_1d _sendflag;
 
@@ -417,7 +486,7 @@ struct BuildExchangeListFunctor {
   BuildExchangeListFunctor(
       const typename AT::tdual_x_array x,
       const typename AT::tdual_int_1d sendlist,
-      typename AT::tdual_int_1d nsend,
+      typename AT::tdual_int_scalar nsend,
       typename AT::tdual_int_1d sendflag,int nlocal, int dim,
                 X_FLOAT lo, X_FLOAT hi):
                 _x(x.template view<DeviceType>()),
@@ -431,7 +500,7 @@ struct BuildExchangeListFunctor {
   KOKKOS_INLINE_FUNCTION
   void operator() (int i) const {
     if (_x(i,_dim) < _lo || _x(i,_dim) >= _hi) {
-      const int mysend=Kokkos::atomic_fetch_add(&_nsend(0),1);
+      const int mysend=Kokkos::atomic_fetch_add(&_nsend(),1);
       if(mysend<_sendlist.dimension_0()) {
         _sendlist(mysend) = i;
         _sendflag(i) = 1;
@@ -490,9 +559,9 @@ void CommKokkos::exchange_device()
     if (true) {
       if (k_sendflag.h_view.dimension_0()<nlocal) k_sendflag.resize(nlocal);
       k_sendflag.sync<DeviceType>();
-      k_count.h_view(0) = k_exchange_sendlist.h_view.dimension_0();
-      while (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
-        k_count.h_view(0) = 0;
+      k_count.h_view() = k_exchange_sendlist.h_view.dimension_0();
+      while (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
+        k_count.h_view() = 0;
         k_count.modify<LMPHostType>();
         k_count.sync<DeviceType>();
 
@@ -505,10 +574,10 @@ void CommKokkos::exchange_device()
         k_count.modify<DeviceType>();
 
         k_count.sync<LMPHostType>();
-        if (k_count.h_view(0)>=k_exchange_sendlist.h_view.dimension_0()) {
-          k_exchange_sendlist.resize(k_count.h_view(0)*1.1);
-          k_exchange_copylist.resize(k_count.h_view(0)*1.1);
-          k_count.h_view(0)=k_exchange_sendlist.h_view.dimension_0();
+        if (k_count.h_view()>=k_exchange_sendlist.h_view.dimension_0()) {
+          k_exchange_sendlist.resize(k_count.h_view()*1.1);
+          k_exchange_copylist.resize(k_count.h_view()*1.1);
+          k_count.h_view()=k_exchange_sendlist.h_view.dimension_0();
         }
       }
       k_exchange_copylist.sync<LMPHostType>();
@@ -516,8 +585,8 @@ void CommKokkos::exchange_device()
       k_sendflag.sync<LMPHostType>();
 
       int sendpos = nlocal-1;
-      nlocal -= k_count.h_view(0);
-      for(int i = 0; i < k_count.h_view(0); i++) {
+      nlocal -= k_count.h_view();
+      for(int i = 0; i < k_count.h_view(); i++) {
         if (k_exchange_sendlist.h_view(i)<nlocal) {
           while (k_sendflag.h_view(sendpos)) sendpos--;
           k_exchange_copylist.h_view(i) = sendpos;
@@ -528,10 +597,10 @@ void CommKokkos::exchange_device()
 
       k_exchange_copylist.modify<LMPHostType>();
       k_exchange_copylist.sync<DeviceType>();
-      nsend = k_count.h_view(0);
+      nsend = k_count.h_view();
       if (nsend > maxsend) grow_send_kokkos(nsend,1);
       nsend =
-        avec->pack_exchange_kokkos(k_count.h_view(0),k_buf_send,
+        avec->pack_exchange_kokkos(k_count.h_view(),k_buf_send,
                                    k_exchange_sendlist,k_exchange_copylist,
                                    ExecutionSpaceFromDevice<DeviceType>::
                                    space,dim,lo,hi);
diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h
index 4065efd000..f137655cb8 100644
--- a/src/KOKKOS/comm_kokkos.h
+++ b/src/KOKKOS/comm_kokkos.h
@@ -25,15 +25,17 @@ class CommKokkos : public CommBrick {
 
   bool exchange_comm_classic;
   bool forward_comm_classic;
+  bool reverse_comm_classic;
   bool exchange_comm_on_host;
   bool forward_comm_on_host;
+  bool reverse_comm_on_host;
 
   CommKokkos(class LAMMPS *);
   ~CommKokkos();
   void init();
 
   void forward_comm(int dummy = 0);    // forward comm of atom coords
-  void reverse_comm();              // reverse comm of atom coords
+  void reverse_comm();                 // reverse comm of atom coords
   void exchange();                     // move atoms to new procs
   void borders();                      // setup list of atoms to comm
 
@@ -47,6 +49,7 @@ class CommKokkos : public CommBrick {
   void reverse_comm_dump(class Dump *);    // reverse comm from a Dump
 
   template<class DeviceType> void forward_comm_device(int dummy);
+  template<class DeviceType> void reverse_comm_device();
   template<class DeviceType> void forward_comm_pair_device(Pair *pair);
   template<class DeviceType> void exchange_device();
   template<class DeviceType> void borders_device();
@@ -56,7 +59,7 @@ class CommKokkos : public CommBrick {
   DAT::tdual_int_scalar k_total_send;
   DAT::tdual_xfloat_2d k_buf_send,k_buf_recv;
   DAT::tdual_int_1d k_exchange_sendlist,k_exchange_copylist,k_sendflag;
-  DAT::tdual_int_1d k_count;
+  DAT::tdual_int_scalar k_count;
   //double *buf_send;                 // send buffer for all comm
   //double *buf_recv;                 // recv buffer for all comm
 
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index 072a802b54..2b02624dce 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -123,8 +123,10 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp)
   neighflag_qeq_set = 0;
   exchange_comm_classic = 0;
   forward_comm_classic = 0;
+  reverse_comm_classic = 0;
   exchange_comm_on_host = 0;
   forward_comm_on_host = 0;
+  reverse_comm_on_host = 0;
 
 #ifdef KILL_KOKKOS_ON_SIGSEGV
   signal(SIGSEGV, my_signal_handler);
@@ -158,8 +160,8 @@ void KokkosLMP::accelerator(int narg, char **arg)
   neighflag_qeq_set = 0;
   int newtonflag = 0;
   double binsize = 0.0;
-  exchange_comm_classic = forward_comm_classic = 0;
-  exchange_comm_on_host = forward_comm_on_host = 0;
+  exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+  exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
 
   int iarg = 0;
   while (iarg < narg) {
@@ -200,13 +202,13 @@ void KokkosLMP::accelerator(int narg, char **arg)
     } else if (strcmp(arg[iarg],"comm") == 0) {
       if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
       if (strcmp(arg[iarg+1],"no") == 0) {
-        exchange_comm_classic = forward_comm_classic = 1;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1;
       } else if (strcmp(arg[iarg+1],"host") == 0) {
-        exchange_comm_classic = forward_comm_classic = 0;
-        exchange_comm_on_host = forward_comm_on_host = 1;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1;
       } else if (strcmp(arg[iarg+1],"device") == 0) {
-        exchange_comm_classic = forward_comm_classic = 0;
-        exchange_comm_on_host = forward_comm_on_host = 0;
+        exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0;
+        exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
     } else if (strcmp(arg[iarg],"comm/exchange") == 0) {
@@ -231,6 +233,17 @@ void KokkosLMP::accelerator(int narg, char **arg)
         forward_comm_on_host = 0;
       } else error->all(FLERR,"Illegal package kokkos command");
       iarg += 2;
+    } else if (strcmp(arg[iarg],"comm/reverse") == 0) {
+      if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command");
+      if (strcmp(arg[iarg+1],"no") == 0) reverse_comm_classic = 1;
+      else if (strcmp(arg[iarg+1],"host") == 0) {
+        reverse_comm_classic = 0;
+        reverse_comm_on_host = 1;
+      } else if (strcmp(arg[iarg+1],"device") == 0) {
+        reverse_comm_classic = 0;
+        reverse_comm_on_host = 0;
+      } else error->all(FLERR,"Illegal package kokkos command");
+      iarg += 2;
     } else error->all(FLERR,"Illegal package kokkos command");
   }
 
diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h
index 8e28b38cbf..7b7848f1f0 100644
--- a/src/KOKKOS/kokkos.h
+++ b/src/KOKKOS/kokkos.h
@@ -27,8 +27,10 @@ class KokkosLMP : protected Pointers {
   int neighflag_qeq_set;
   int exchange_comm_classic;
   int forward_comm_classic;
+  int reverse_comm_classic;
   int exchange_comm_on_host;
   int forward_comm_on_host;
+  int reverse_comm_on_host;
   int num_threads,ngpu;
   int numa;
   int auto_sync;
diff --git a/src/comm_brick.cpp b/src/comm_brick.cpp
index 3c972b8244..06227b7a84 100644
--- a/src/comm_brick.cpp
+++ b/src/comm_brick.cpp
@@ -476,8 +476,7 @@ void CommBrick::forward_comm(int dummy)
     if (sendproc[iswap] != me) {
       if (comm_x_only) {
         if (size_forward_recv[iswap]) {
-          if (size_forward_recv[iswap]) buf = x[firstrecv[iswap]];
-          else buf = NULL;
+          buf = x[firstrecv[iswap]];
           MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
                     recvproc[iswap],0,world,&request);
         }
@@ -547,8 +546,7 @@ void CommBrick::reverse_comm()
           MPI_Irecv(buf_recv,size_reverse_recv[iswap],MPI_DOUBLE,
                     sendproc[iswap],0,world,&request);
         if (size_reverse_send[iswap]) {
-          if (size_reverse_send[iswap]) buf = f[firstrecv[iswap]];
-          else buf = NULL;
+          buf = f[firstrecv[iswap]];
           MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world);
         }

From 2876baafd07d31c3a3ab30cc38b087ebcfc07eab Mon Sep 17 00:00:00 2001
From: James Barnett <jwb2162@columbia.edu>
Date: Tue, 3 Oct 2017 13:08:56 -0400
Subject: [PATCH 39/53] Use -restrict whenever Intel is used, no matter the
 package

---
 cmake/CMakeLists.txt | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index 9a74a788d0..ca71c41ddb 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -37,6 +37,10 @@ enable_language(CXX)
 #####################################################################
 include(CheckCCompilerFlag)
 
+if (CMAKE_CXX_COMPILER_ID STREQUAL "Intel")  # name the variable (no ${}) so an empty compiler ID cannot produce a malformed if()
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")  # append -restrict for every Intel C++ build, whatever packages are enabled
+endif()
+
 ########################################################################
 # User input options                                                   #
 ########################################################################
@@ -150,11 +154,6 @@ if(ENABLE_USER-OMP OR ENABLE_KOKKOS OR ENABLE_USER-INTEL)
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
 endif()
 
-if((ENABLE_USER-OMP OR ENABLE_OPT OR ENABLE_USER-INTEL) AND
-        (${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel"))
-  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict")
-endif()
-
 if(ENABLE_KSPACE)
   set(FFT "KISSFFT" CACHE STRING "FFT library for KSPACE package")
   set_property(CACHE FFT PROPERTY STRINGS KISSFFT FFTW3 MKL FFTW2)

From a55adf4a6848a734a492e2f5dc993041927db08a Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Tue, 3 Oct 2017 11:30:00 -0600
Subject: [PATCH 40/53] Update to Kokkos r2.04.04 and add workaround for
 performance regression

---
 lib/kokkos/CHANGELOG.md                       |  19 +
 lib/kokkos/Makefile.kokkos                    |  32 +-
 lib/kokkos/algorithms/src/Kokkos_Random.hpp   | 237 ++++++++++++
 lib/kokkos/algorithms/unit_tests/Makefile     |  12 +
 lib/kokkos/algorithms/unit_tests/TestROCm.cpp | 112 ++++++
 lib/kokkos/bin/hpcbind                        | 239 ++++++++----
 lib/kokkos/bin/kokkos-bind                    | 221 -----------
 lib/kokkos/bin/nvcc_wrapper                   |  15 +-
 lib/kokkos/config/master_history.txt          |   1 +
 .../config/trilinos-integration/checkin-test  |   2 +-
 .../containers/src/Kokkos_StaticCrsGraph.hpp  | 149 ++++++++
 .../KokkosExp_Cuda_IterateTile_Refactor.hpp   | 160 ++++----
 lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp  |   4 +-
 lib/kokkos/core/src/Kokkos_Complex.hpp        | 357 +++++++++++++-----
 lib/kokkos/core/src/Kokkos_Crs.hpp            |   9 +-
 lib/kokkos/core/src/Kokkos_HBWSpace.hpp       |   3 +-
 lib/kokkos/core/src/Kokkos_NumericTraits.hpp  |   6 +-
 lib/kokkos/core/src/Kokkos_ROCm.hpp           |  18 +
 lib/kokkos/core/src/Makefile                  |   1 +
 .../core/src/OpenMP/Kokkos_OpenMP_Exec.hpp    |   1 +
 .../core/src/ROCm/Kokkos_ROCm_Reduce.hpp      |  12 +-
 lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp |   4 +-
 lib/kokkos/core/src/impl/Kokkos_BitOps.hpp    |  30 +-
 lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp  |   4 -
 lib/kokkos/core/unit_test/TestComplex.hpp     |  15 +-
 lib/kokkos/core/unit_test/TestMDRange.hpp     | 121 ++++--
 26 files changed, 1222 insertions(+), 562 deletions(-)
 create mode 100644 lib/kokkos/algorithms/unit_tests/TestROCm.cpp
 delete mode 100755 lib/kokkos/bin/kokkos-bind

diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index 43d3f17d63..d414056187 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,24 @@
 # Change Log
 
+## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04)
+
+**Implemented enhancements:**
+
+- OpenMP partition: set number of threads on nested level [\#1082](https://github.com/kokkos/kokkos/issues/1082)
+- Add StaticCrsGraph row\(\) method [\#1071](https://github.com/kokkos/kokkos/issues/1071)
+- Enhance Kokkos complex operator overloading [\#1052](https://github.com/kokkos/kokkos/issues/1052)
+- Tell Trilinos packages about host+device lambda [\#1019](https://github.com/kokkos/kokkos/issues/1019)
+- Function markup for defaulted class members [\#952](https://github.com/kokkos/kokkos/issues/952)
+- Add deterministic random number generator [\#857](https://github.com/kokkos/kokkos/issues/857)
+
+**Fixed bugs:**
+
+- Fix reduction\_identity\<T\>::max for floating point numbers [\#1048](https://github.com/kokkos/kokkos/issues/1048)
+- Fix MD iteration policy ignores lower bound on GPUs [\#1041](https://github.com/kokkos/kokkos/issues/1041)
+- (Experimental) HBWSpace  Linking issues in KokkosKernels [\#1094](https://github.com/kokkos/kokkos/issues/1094)
+- (Experimental) ROCm:  algorithms/unit\_tests test\_sort failing with segfault [\#1070](https://github.com/kokkos/kokkos/issues/1070)
+
 ## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
 [Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
 
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index b8236e8fd1..4641232a1f 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -443,7 +443,7 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1)
   KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include
   KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib
-  KOKKOS_LIBS += -lmemkind
+  KOKKOS_LIBS += -lmemkind -lnuma
   tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp )
 endif
 
@@ -614,9 +614,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
   else
-    # Assume that this is a really a GNU compiler or it could be XL on P8.
-    KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
-    KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this really is a GNU compiler on P8.
+        KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+        KOKKOS_LDFLAGS  += -mcpu=power8 -mtune=power8
+      endif
+    endif
   endif
 endif
 
@@ -626,9 +635,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
   ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
 
   else
-    # Assume that this is a really a GNU compiler or it could be XL on P9.
-    KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
-    KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1) 
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+    else
+      ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
+
+      else 
+        # Assume that this really is a GNU compiler on P9.
+        KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
+        KOKKOS_LDFLAGS  += -mcpu=power9 -mtune=power9
+      endif
+    endif
   endif
 endif
 
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index 9082e47052..3db9a145d7 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -1265,6 +1265,243 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
 }
 
 
+#endif
+
+#if defined(KOKKOS_ENABLE_ROCM) 
+
+  template<>
+  class Random_XorShift1024<Kokkos::Experimental::ROCm> {  // ROCm specialization of the XorShift1024 generator
+  private:
+    int p_;                 // index of the current 64-bit word; urand()/urand64() advance it modulo 16
+    const int state_idx_;   // slot of the pool's state view this generator was constructed from
+    uint64_t* state_;       // address of state(state_idx,0) inside the pool's state view
+    const int stride_;      // state.stride_1(): multiplier applied to p_ when indexing state_
+    friend class Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>;
+  public:
+
+    typedef Kokkos::Experimental::ROCm device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {  // one generator step; returns bits 16..47 of the scrambled 64-bit word
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];  // & 15 keeps p_ inside the 16 words
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {  // one generator step; returns the scrambled 64-bit word minus 1
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {  // draw in [0,range); range must be non-zero
+      const uint32_t max_val = (MAX_URAND/range)*range;  // largest multiple of range not above MAX_URAND
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();  // keep the redraw: without the assignment tmp never changes and the loop cannot end
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {  // draw in [start,end)
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {  // draw in [0,range); range must be non-zero
+      const uint64_t max_val = (MAX_URAND64/range)*range;  // largest multiple of range not above MAX_URAND64
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();  // keep the redraw so the rejection loop can terminate
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {  // draw in [start,end)
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {  // non-negative int: urand() halved
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {  // draw in [0,range); range must be non-zero
+      const int max_val = (MAX_RAND/range)*range;  // largest multiple of range not above MAX_RAND
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();  // keep the redraw so the rejection loop can terminate
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {  // draw in [start,end)
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {  // non-negative int64_t: urand64() halved
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {  // draw in [0,range); range must be non-zero
+      const int64_t max_val = (MAX_RAND64/range)*range;  // largest multiple of range not above MAX_RAND64
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();  // keep the redraw so the rejection loop can terminate
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {  // draw in [start,end)
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {  // urand64() scaled by 1/MAX_URAND64, as float
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {  // urand64() scaled by range/MAX_URAND64, as float
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {  // frand(end-start) shifted by start
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {  // urand64() scaled by 1/MAX_URAND64, as double
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {  // urand64() scaled by range/MAX_URAND64, as double
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {  // drand(end-start) shifted by start
+      return drand(end-start)+start;  // stay in double; routing through frand() would round the draw to float
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;  // starts at 2.0 so the loop body runs at least once
+      double U;
+      while(S>=1.0) {  // redraw until (U,V) lies inside the unit circle
+        U = 2.0*drand() - 1.0;
+        const double V = 2.0*drand() - 1.0;
+        S = U*U+V*V;
+      }
+      return U*std::sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {  // normal() scaled by std_dev and shifted by mean
+      return mean + normal()*std_dev;
+    }
+  };
+
+template<>  // ROCm specialization of the XorShift64 pool constructor
+inline
+Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::Random_XorShift64_Pool(uint64_t seed) {
+  num_states_ = 0;  // cleared before init() is called
+  init(seed,4*32768);  // second argument is presumably the number of states to create -- confirm against init()
+}
+
+template<>  // ROCm specialization: pick a pool slot, lock it, and return a generator built from it
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Experimental::ROCm> Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::get_state() const {
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;  // linearizes the three threadIdx_* values using blockDim_y/blockDim_z
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;  // first slot to try: linearized block index * block volume + i_offset, modulo num_states_
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {  // attempt 0 -> 1 on locks_(i); a non-zero result (presumably the old value, i.e. slot busy) means keep probing
+      i+=blockDim_x*blockDim_y*blockDim_z;  // step forward by one block volume
+      if(i>=num_states_) {i = i_offset;}  // past the last slot: restart from i_offset
+  }
+
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(i),i);
+#else
+  return Random_XorShift64<Kokkos::Experimental::ROCm>(state_(0),0);  // without __HCC_ACCELERATOR__: slot 0, no lock taken
+#endif
+}
+
+template<>  // ROCm specialization: store a generator's state back into its pool slot and unlock the slot
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift64<Kokkos::Experimental::ROCm> &state) const {
+#ifdef __HCC_ACCELERATOR__
+  state_(state.state_idx_) = state.state_;  // write the generator's current state into the slot it came from
+  locks_(state.state_idx_) = 0;  // clear the lock word that get_state() set to 1
+  return;
+#endif
+}
+
+
+template<>  // ROCm specialization of the XorShift1024 pool constructor
+inline
+Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::Random_XorShift1024_Pool(uint64_t seed) {
+  num_states_ = 0;  // cleared before init() is called
+  init(seed,4*32768);  // second argument is presumably the number of states to create -- confirm against init()
+}
+
+template<>  // ROCm specialization: pick a pool slot, lock it, and return a generator bound to it
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Experimental::ROCm> Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::get_state() const {
+#ifdef __HCC_ACCELERATOR__
+  const int i_offset = (threadIdx_x*blockDim_y + threadIdx_y)*blockDim_z+threadIdx_z;  // linearizes the three threadIdx_* values using blockDim_y/blockDim_z
+  int i = (((blockIdx_x*gridDim_y+blockIdx_y)*gridDim_z + blockIdx_z) *
+           blockDim_x*blockDim_y*blockDim_z + i_offset)%num_states_;  // first slot to try: linearized block index * block volume + i_offset, modulo num_states_
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {  // attempt 0 -> 1 on locks_(i); a non-zero result (presumably the old value, i.e. slot busy) means keep probing
+      i+=blockDim_x*blockDim_y*blockDim_z;  // step forward by one block volume
+      if(i>=num_states_) {i = i_offset;}  // past the last slot: restart from i_offset
+  }
+
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(i), i);  // generator receives the whole state view, the slot's p_ value, and the slot index
+#else
+  return Random_XorShift1024<Kokkos::Experimental::ROCm>(state_, p_(0), 0);  // without __HCC_ACCELERATOR__: slot 0, no lock taken
+#endif
+}
+
+template<>  // ROCm specialization: copy a generator's 16 state words into its pool slot and unlock the slot
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Experimental::ROCm>::free_state(const Random_XorShift1024<Kokkos::Experimental::ROCm> &state) const {
+#ifdef __HCC_ACCELERATOR__
+  for(int i=0; i<16; i++)
+    state_(state.state_idx_,i) = state.state_[i];  // NOTE(review): reads state_[i] unscaled, while the generator indexes state_[p_*stride_] -- confirm this is intended
+  locks_(state.state_idx_) = 0;  // clear the lock word that get_state() set to 1; state.p_ is not written back here
+  return;
+#endif
+}
+
+
 #endif
 
 
diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile
index b74192ef18..a5a10c82ee 100644
--- a/lib/kokkos/algorithms/unit_tests/Makefile
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@@ -30,6 +30,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
 	TEST_TARGETS += test-cuda
 endif
 
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+	OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_ROCm
+	TEST_TARGETS += test-rocm
+endif
+
 ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
 	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
 	TARGETS += KokkosAlgorithms_UnitTest_Threads
@@ -51,6 +57,9 @@ endif
 KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Cuda
 
+KokkosAlgorithms_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_ROCm
+
 KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
 	$(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosAlgorithms_UnitTest_Threads
 
@@ -63,6 +72,9 @@ KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
 test-cuda: KokkosAlgorithms_UnitTest_Cuda
 	./KokkosAlgorithms_UnitTest_Cuda
 
+test-rocm: KokkosAlgorithms_UnitTest_ROCm
+	./KokkosAlgorithms_UnitTest_ROCm
+
 test-threads: KokkosAlgorithms_UnitTest_Threads
 	./KokkosAlgorithms_UnitTest_Threads
 
diff --git a/lib/kokkos/algorithms/unit_tests/TestROCm.cpp b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp
new file mode 100644
index 0000000000..720b377ed2
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestROCm.cpp
@@ -0,0 +1,112 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#ifdef KOKKOS_ENABLE_ROCM
+
+#include <cstdint>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+class rocm : public ::testing::Test {  // gtest fixture that brings the Kokkos host and ROCm execution spaces up and down
+protected:
+  static void SetUpTestCase()  // gtest hook invoked once before the fixture's tests
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) );  // device selector built with argument 0
+  }
+  static void TearDownTestCase()  // gtest hook invoked once after the fixture's tests
+  {
+    Kokkos::Experimental::ROCm::finalize();  // finalized in the reverse of the initialization order
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+void rocm_test_random_xorshift64( int num_draws  )  // forwards num_draws to Impl::test_random for the ROCm XorShift64 pool
+{
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+void rocm_test_random_xorshift1024( int num_draws  )  // forwards num_draws to Impl::test_random for the ROCm XorShift1024 pool
+{
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Experimental::ROCm> >(num_draws);
+}
+
+// Each macro below expands to a single TEST_F case on the rocm fixture.
+#define ROCM_RANDOM_XORSHIFT64( num_draws )  \
+  TEST_F( rocm, Random_XorShift64 ) {        \
+  rocm_test_random_xorshift64(num_draws);    \
+  }
+
+#define ROCM_RANDOM_XORSHIFT1024( num_draws )  \
+  TEST_F( rocm, Random_XorShift1024 ) {        \
+  rocm_test_random_xorshift1024(num_draws);    \
+  }
+
+#define ROCM_SORT_UNSIGNED( size )                                    \
+  TEST_F( rocm, SortUnsigned ) {                                      \
+      Impl::test_sort< Kokkos::Experimental::ROCm, unsigned >(size);  \
+  }
+
+ROCM_RANDOM_XORSHIFT64(  132141141 )  // argument is passed on as num_draws
+ROCM_RANDOM_XORSHIFT1024( 52428813 )  // argument is passed on as num_draws
+ROCM_SORT_UNSIGNED(171)  // argument is passed on to Impl::test_sort
+
+#undef ROCM_RANDOM_XORSHIFT64
+#undef ROCM_RANDOM_XORSHIFT1024
+#undef ROCM_SORT_UNSIGNED
+}
+#else
+void KOKKOS_ALGORITHMS_UNITTESTS_TESTROCM_PREVENT_LINK_ERROR() {}
+#endif  /* #ifdef KOKKOS_ENABLE_ROCM */
+
diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind
index ca34648780..b88b334f8b 100755
--- a/lib/kokkos/bin/hpcbind
+++ b/lib/kokkos/bin/hpcbind
@@ -27,7 +27,7 @@ fi
 HPCBIND_HWLOC_PARENT_CPUSET=""
 if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
   MY_PID="$BASHPID"
-  HPCBIND_HWLOC_PARENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
+  HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)"
 fi
 
 ################################################################################
@@ -58,23 +58,34 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0))
 ################################################################################
 HPCBIND_QUEUE_NAME=""
 declare -i HPCBIND_QUEUE_INDEX=0
-declare -i HPCBIND_QUEUE_GPU_MAPPING=0
+declare -i HPCBIND_QUEUE_MAPPING=0
 
-if [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
-  HPCBIND_QUEUE_NAME="sbatch"
+if [[ ! -z "${PMI_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mpich"
+  HPCBIND_QUEUE_INDEX=${PMI_RANK}
+elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="openmpi"
+  HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK}
+elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="mvapich2"
+  HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK}
+elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then
+  HPCBIND_QUEUE_MAPPING=1
+  HPCBIND_QUEUE_NAME="slurm"
   HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID}
 elif [[ ! -z "${LBS_JOBINDEX}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
+  HPCBIND_QUEUE_MAPPING=1
   HPCBIND_QUEUE_NAME="bsub"
   HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX}
 elif [[ ! -z "${ALPS_APP_PE}" ]]; then
-  HPCBIND_QUEUE_GPU_MAPPING=1
+  HPCBIND_QUEUE_MAPPING=1
   HPCBIND_QUEUE_NAME="aprun"
   HPCBIND_QUEUE_INDEX=${ALPS_APP_PE}
 fi
 
-
 ################################################################################
 # Show help
 ################################################################################
@@ -91,13 +102,14 @@ function show_help {
   echo "  --proc-bind=<LOC>     Set the initial process mask for the script"
   echo "                        LOC can be any valid location argument for"
   echo "                        hwloc-calc  Default: all"
+  echo "  --whole-system        ${cmd} will ignore the its parent process binding"
   echo "  --distribute=N        Distribute the current cpuset into N partitions"
   echo "  --distribute-partition=I"
   echo "                        Use the i'th partition (zero based)"
   echo "  --visible-gpus=<L>    Comma separated list of gpu ids"
   echo "                        Default: CUDA_VISIBLE_DEVICES or all gpus in"
   echo "                        sequential order"
-  echo "  --gpu-ignore-queue    Ignore queue job id when choosing visible GPU"
+  echo "  --ignore-queue        Ignore queue job id when choosing visible GPU and partition"
   echo "  --no-gpu-mapping      Do not set CUDA_VISIBLE_DEVICES"
   echo "  --openmp=M.m          Set env variables for the given OpenMP version"
   echo "                        Default: 4.0"
@@ -110,22 +122,30 @@ function show_help {
   echo "  --force-openmp-proc-bind=<OP>"
   echo "                        Override logic for selecting OMP_PROC_BIND"
   echo "  --no-openmp-nested    Set OMP_NESTED to false"
-  echo "  --show-bindings       Show the bindings"
-  echo "  --lstopo              Show bindings in lstopo without executing a command"
-  echo "  -v|--verbose          Show options and relevant environment variables"
+  echo "  --output-prefix=<P>   Save the output to files of the form"
+  echo "                        P-N.hpc.log, P-N.out and P-N.err where P is the prefix"
+  echo "                        and N is the queue index or mpi rank (no spaces)"
+  echo "  --output-mode=<Op>    How console output should be handled."
+  echo "                        Options are all, rank0, and none.  Default: rank0"
+  echo "  --lstopo              Show bindings in lstopo"
+  echo "  -v|--verbose          Print bindings and relevant environment variables"
   echo "  -h|--help             Show this message"
   echo ""
   echo "Sample Usage:"
   echo "  Split the current process cpuset into 4 and use the 3rd partition"
   echo "    ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..."
-  echo "  Bing the process to all even cores"
+  echo "  Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus"
+  echo "  and save the output to rank specific files"
+  echo "    mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\"
+  echo "      --distribute=4 -v --output-prefix=output  -- command ..."
+  echo "  Bind the process to all even cores"
   echo "    ${cmd} --proc-bind=core:even -v -- command ..."
-  echo "  Bind to the first 64 cores and split the current process cpuset into 4"
-  echo "    ${cmd} --proc-bind=core:0-63 --distribute=4 --distribute-partition=0 -- command ..."
-  echo "  skip GPU 0 when mapping visible devices"
+  echo "  Bind to the even cores of socket 0 and the odd cores of socket 1"
+  echo "    ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..."
+  echo "  Skip GPU 0 when mapping visible devices"
   echo "    ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..."
   echo "  Display the current bindings"
-  echo "    ${cmd} --proc-bind=numa:0 --show-bindings -- command"
+  echo "    ${cmd} --proc-bind=numa:0 -v -- command"
   echo "  Display the current bindings using lstopo"
   echo "    ${cmd} --proc-bind=numa:0.core:odd --lstopo"
   echo ""
@@ -144,7 +164,7 @@ fi
 declare -a UNKNOWN_ARGS=()
 declare -i HPCBIND_ENABLE_HWLOC_BIND=${HPCBIND_HAS_HWLOC}
 declare -i HPCBIND_DISTRIBUTE=1
-declare -i HPCBIND_PARTITION=0
+declare -i HPCBIND_PARTITION=-1
 HPCBIND_PROC_BIND="all"
 HPCBIND_OPENMP_VERSION=4.0
 declare -i HPCBIND_OPENMP_PERCENT=100
@@ -155,11 +175,15 @@ HPCBIND_OPENMP_FORCE_PROC_BIND=""
 HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true}
 declare -i HPCBIND_VERBOSE=0
 
-declare -i HPCBIND_SHOW_BINDINGS=0
 declare -i HPCBIND_LSTOPO=0
 
-for i in $@; do
-  case $i in
+HPCBIND_OUTPUT_PREFIX=""
+HPCBIND_OUTPUT_MODE="rank0"
+
+declare -i HPCBIND_HAS_COMMAND=0
+
+for i in "$@"; do
+  case "$i" in
     # number of partitions to create
     --no-hwloc-bind)
       HPCBIND_ENABLE_HWLOC_BIND=0
@@ -169,6 +193,10 @@ for i in $@; do
       HPCBIND_PROC_BIND="${i#*=}"
       shift
       ;;
+    --whole-system)
+      HPCBIND_HWLOC_PARENT_CPUSET=""
+      shift
+      ;;
     --distribute=*)
       HPCBIND_DISTRIBUTE="${i#*=}"
       shift
@@ -182,8 +210,8 @@ for i in $@; do
       HPCBIND_VISIBLE_GPUS=$(echo "${i#*=}" | tr ',' ' ')
       shift
       ;;
-    --gpu-ignore-queue)
-      HPCBIND_QUEUE_GPU_MAPPING=0
+    --ignore-queue)
+      HPCBIND_QUEUE_MAPPING=0
       shift
       ;;
     --no-gpu-mapping)
@@ -218,14 +246,18 @@ for i in $@; do
       HPCBIND_OPENMP_NESTED="false"
       shift
       ;;
-    --show-bindings)
-      HPCBIND_VERBOSE=1
-      HPCBIND_SHOW_BINDINGS=1
+    --output-prefix=*)
+      HPCBIND_OUTPUT_PREFIX="${i#*=}"
+      shift
+      ;;
+    --output-mode=*)
+      HPCBIND_OUTPUT_MODE="${i#*=}"
+      #convert to lower case
+      HPCBIND_OUTPUT_MODE="${HPCBIND_OUTPUT_MODE,,}"
       shift
       ;;
     --lstopo)
       HPCBIND_VERBOSE=1
-      HPCBIND_SHOW_BINDINGS=0
       HPCBIND_LSTOPO=1
       shift
       ;;
@@ -239,6 +271,7 @@ for i in $@; do
       ;;
     # ignore remaining arguments
     --)
+      HPCBIND_HAS_COMMAND=1
       shift
       break
       ;;
@@ -250,16 +283,41 @@ for i in $@; do
   esac
 done
 
+################################################################################
+# Check output mode
+################################################################################
+declare -i HPCBIND_TEE=0
+
+if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then
+  HPCBIND_TEE=0
+elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then
+  HPCBIND_TEE=1
+elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; then
+  #default to rank0 printing to screen
+  HPCBIND_TEE=1
+fi
+
+
+if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then
+  HPCBIND_LOG=/dev/null
+  HPCBIND_ERR=/dev/null
+  HPCBIND_OUT=/dev/null
+else
+  HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log"
+  HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err"
+  HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out"
+  > ${HPCBIND_LOG}
+fi
+
 
 ################################################################################
 # Check unknown arguments
 ################################################################################
 if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
-  echo "Uknown options: ${UNKNOWN_ARGS[*]}"
+  echo "HPCBIND Unknown options: ${UNKNOWN_ARGS[*]}" > >(tee -a ${HPCBIND_LOG})
   exit 1
 fi
 
-
 ################################################################################
 # Check that visible gpus are valid
 ################################################################################
@@ -268,22 +326,19 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
   for ((i=0; i < ${#HPCBIND_VISIBLE_GPUS[*]}; i++)); do
     if [[ ${HPCBIND_VISIBLE_GPUS[$i]} -ge ${NUM_GPUS} ||
       ${HPCBIND_VISIBLE_GPUS[$i]} -lt 0 ]]; then
-      echo "Invaild GPU ID ${HPCBIND_VISIBLE_GPUS[$i]}, setting to 0"
+      echo "HPCBIND Invalid GPU ID ${HPCBIND_VISIBLE_GPUS[$i]} (setting to 0)" > >(tee -a ${HPCBIND_LOG})
       HPCBIND_VISIBLE_GPUS[$i]=0;
     fi
   done
   NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]}
 fi
 
-
 ################################################################################
 # Check OpenMP percent
 ################################################################################
 if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then
-  echo "OpenMP percent < 1, setting to 1"
   HPCBIND_OPENMP_PERCENT=1
 elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then
-  echo "OpenMP percent > 100, setting to 100"
   HPCBIND_OPENMP_PERCENT=100
 fi
 
@@ -291,15 +346,21 @@ fi
 # Check distribute
 ################################################################################
 if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then
-  echo "Invalid input for distribute, changing distribute to 1"
   HPCBIND_DISTRIBUTE=1
 fi
 
-if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
-  echo "Invalid input for distribute-partition, changing to 0"
+################################################################################
+#choose the correct partition
+################################################################################
+if [[ ${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then
+  HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX}
+elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then
   HPCBIND_PARTITION=0
 fi
 
+if [[ ${HPCBIND_PARTITION} -ge ${HPCBIND_DISTRIBUTE} ]]; then
+  HPCBIND_PARTITION=$((HPCBIND_PARTITION % HPCBIND_DISTRIBUTE))
+fi
 
 ################################################################################
 # Find cpuset and num threads
@@ -309,13 +370,17 @@ declare -i HPCBIND_NUM_PUS=0
 
 if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
   if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
-    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND})
+    BINDING=$(hwloc-calc ${HPCBIND_PROC_BIND[*]})
   else
-    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND})
+    BINDING=$(hwloc-calc --restrict ${HPCBIND_HWLOC_PARENT_CPUSET} ${HPCBIND_PROC_BIND[*]})
   fi
 
-  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
-  HPCBIND_HWLOC_CPUSET=${CPUSETS[${HPCBIND_PARTITION}]}
+  if [[ ${HPCBIND_DISTRIBUTE} -gt 1 ]]; then
+    CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${HPCBIND_DISTRIBUTE}))
+    HPCBIND_HWLOC_CPUSET="${CPUSETS[${HPCBIND_PARTITION}]}"
+  else
+    HPCBIND_HWLOC_CPUSET="${BINDING}"
+  fi
   HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l)
 else
   HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor)
@@ -373,13 +438,13 @@ export OMP_NESTED=${HPCBIND_OPENMP_NESTED}
 ################################################################################
 
 if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then
-  if [[ ${HPCBIND_QUEUE_GPU_MAPPING} -eq 0 ]]; then
+  if [[ ${HPCBIND_QUEUE_MAPPING} -eq 0 ]]; then
     declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
   else
     declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION))
     declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS))
-    export CUDA_VISIBLE_DEVICES=${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}
+    export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}"
   fi
 fi
 
@@ -389,22 +454,22 @@ fi
 export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC}
 export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA}
 export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS}
-export HPCBIND_HWLOC_CPUSET=${HPCBIND_HWLOC_CPUSET}
+export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}"
 export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE}
 export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION}
 if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then
   export HPCBIND_HWLOC_PARENT_CPUSET="all"
 else
-  export HPCBIND_HWLOC_PARENT_CPUSET=${HPCBIND_HWLOC_PARENT_CPUSET}
+  export HPCBIND_HWLOC_PARENT_CPUSET="${HPCBIND_HWLOC_PARENT_CPUSET}"
 fi
-export HPCBIND_HWLOC_PROC_BIND=${HPCBIND_PROC_BIND}
+export HPCBIND_HWLOC_PROC_BIND="${HPCBIND_PROC_BIND}"
 export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING}
 export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',')
-export HPCBIND_OPENMP_VERSION=${HPCBIND_OPENMP_VERSION}
+export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}"
 if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then
   export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX}
-  export HPCBIND_QUEUE_NAME=${HPCBIND_QUEUE_NAME}
-  export HPCBIND_QUEUE_GPU_MAPPING=${HPCBIND_QUEUE_GPU_MAPPING}
+  export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}"
+  export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING}
 fi
 
 
@@ -412,43 +477,63 @@ fi
 # Print verbose
 ################################################################################
 
-if [[ ${HPCBIND_VERBOSE} -eq 1 ]]; then
-  MY_ENV=$(env | sort)
-  echo "[HPCBIND]"
-  echo "${MY_ENV}" | grep -E "^HPCBIND_"
-  echo "[CUDA]"
-  echo "${MY_ENV}" | grep -E "^CUDA_"
-  echo "[OPENMP]"
-  echo "${MY_ENV}" | grep -E "^OMP_"
-fi
+TMP_ENV=$(env | sort)
+if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then
+  echo "[HOST]" >> ${HPCBIND_LOG}
+  hostname -s >> ${HPCBIND_LOG}
+  echo "[HPCBIND]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" >> ${HPCBIND_LOG}
+  echo "[CUDA]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG}
+  echo "[OPENMP]" >> ${HPCBIND_LOG}
+  echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG}
 
-if [[ ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
-  echo "[BINDINGS]"
-  hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
-elif [[ ${HPCBIND_SHOW_BINDINGS} -eq 1 ]]; then
-  echo "Unable to show bindings, hwloc not available."
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" >> ${HPCBIND_LOG}
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG}
+  else
+    echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG}
+  fi
+else
+  echo "[HOST]" > >(tee -a ${HPCBIND_LOG})
+  hostname -s > >(tee -a ${HPCBIND_LOG})
+  echo "[HPCBIND]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^HPCBIND_" > >(tee -a ${HPCBIND_LOG})
+  echo "[CUDA]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG})
+  echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG})
+  echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG})
+
+  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
+    echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG})
+    hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG})
+  else
+    echo "Unable to show bindings, hwloc not available." > >(tee -a ${HPCBIND_LOG})
+  fi
 fi
 
 ################################################################################
 # Run command
 ################################################################################
 
-if [[ ${HPCBIND_LSTOPO} -eq 0 ]]; then
-  if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
-    hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- $@
-  else
-    eval $@
-  fi
-else
-  if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then
-    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
-      echo "[BINDINGS]"
-      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu
-      hwloc-bind ${HPCBIND_HWLOC_CPUSET} -- lstopo --pid 0
+# must be the last executed command so that the return value is correct
+if [[ ${HPCBIND_LSTOPO} -eq 1 && ${HPCBIND_HAS_HWLOC} -eq 1 && ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 && ! -z ${DISPLAY} ]]; then
+  hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- lstopo --pid 0
+elif [[ ${HPCBIND_HAS_COMMAND} -eq 1 ]]; then
+  # clear output files
+  > ${HPCBIND_ERR}
+  > ${HPCBIND_OUT}
+  if [[ ${HPCBIND_TEE} -eq 0 ]]; then
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     else
-      hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET}
+      eval $@ > ${HPCBIND_OUT} 2> ${HPCBIND_ERR}
     fi
   else
-    echo "Unable to show bindings, hwloc not available."
+    if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then
+      hwloc-bind "${HPCBIND_HWLOC_CPUSET}" -- $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    else
+      eval $@ > >(tee ${HPCBIND_OUT}) 2> >(tee ${HPCBIND_ERR} >&2)
+    fi
   fi
 fi
diff --git a/lib/kokkos/bin/kokkos-bind b/lib/kokkos/bin/kokkos-bind
deleted file mode 100755
index b6fe07a1bd..0000000000
--- a/lib/kokkos/bin/kokkos-bind
+++ /dev/null
@@ -1,221 +0,0 @@
-#!/usr/bin/env bash
-
-# check if hwloc commands exist
-declare -i HAS_HWLOC=0
-type hwloc-bind >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-distrib >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-ls >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-calc >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-type hwloc-ps >/dev/null 2>&1
-HAS_HWLOC="${HAS_HWLOC} + $?"
-
-
-#parse args
-declare -a UNKNOWN_ARGS=()
-declare -i DISTRIBUTE=1
-declare -i INDEX=0
-PROC_BIND="all"
-CURRENT_CPUSET=""
-OPENMP_VERSION=4.0
-OPENMP_PROC_BIND=True
-OPENMP_NESTED=True
-VERBOSE=False
-
-#get the current process cpuset
-if [[ ${HAS_HWLOC} -eq 0 ]]; then
-  MY_PID="$BASHPID"
-  CURRENT_CPUSET=$(hwloc-ps --cpuset | grep "${MY_PID}" | cut -f 2)
-  echo "$CURRENT_CPUSET"
-fi
-
-function show_help {
-  local cmd=$(basename "$0")
-  echo "Usage: ${cmd} <options> -- command ..." 
-  echo "  Uses hwloc to divide the node into the given number of groups,"
-  echo "  set the appropriate OMP_NUM_THREADS and execute the command on the"
-  echo "  selected group."
-  echo ""
-  echo "  NOTE: This command assumes it has exclusive use of the node"
-  echo ""
-  echo "Options:"
-  echo "  --proc-bind=<LOC>     Set the initial process mask for the script.  "
-  echo "                        LOC can be any valid location argumnet for"
-  echo "                        hwloc-calc.  Defaults to the entire machine"
-  echo "  --distribute=N        Distribute the current proc-bind into N groups" 
-  echo "  --index=I             Use the i'th group (zero based)" 
-  echo "  --openmp=M.m          Set env variables for the given OpenMP version"
-  echo "                        (default 4.0)"
-  echo "  --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES"    
-  echo "  --no-openmp-nested    Set OMP_NESTED to false"
-  echo "  -v|--verbose" 
-  echo "  -h|--help" 
-  echo ""
-  echo "Sample Usage:"
-  echo "  ${cmd} --distribute=4 --index=2 -v -- command ..."
-  echo ""
-}
-
-if [[ "$#" -eq 0 ]]; then
-  show_help 
-  exit 0
-fi
-
-
-for i in $@; do
-  case $i in
-    # number of partitions to create
-    --proc-bind=*)
-      PROC_BIND="${i#*=}"
-      shift
-      ;;
-    --distribute=*)
-      DISTRIBUTE="${i#*=}"
-      shift
-      ;;
-    # which group to use
-    --index=*)
-      INDEX="${i#*=}"
-      shift
-      ;;
-    --openmp=*)
-      OPENMP_VERSION="${i#*=}"
-      shift
-      ;;
-    --no-openmp-proc-bind)
-      OPENMP_PROC_BIND=False
-      shift
-      ;;
-    --no-openmp-nested)
-      OPENMP_NESTED=False
-      shift
-      ;;
-    -v|--verbose)
-      VERBOSE=True
-      shift
-      ;;
-    -h|--help)
-      show_help
-      exit 0
-      ;;
-    # ignore remaining arguments
-    --)
-      shift
-      break
-      ;;
-    # unknown option
-    *)
-      UNKNOWN_ARGS+=("$i")
-      shift
-      ;;
-  esac
-done
-
-if [[ ${#UNKNOWN_ARGS[*]} > 0 ]]; then
-  echo "Uknown options: ${UNKNOWN_ARGS[*]}"
-  exit 1
-fi
-
-if [[ ${DISTRIBUTE} -le 0 ]]; then
-  echo "Invalid input for distribute, changing distribute to 1"
-  DISTRIBUTE=1
-fi
-
-if [[ ${INDEX} -ge ${DISTRIBUTE} ]]; then
-  echo "Invalid input for index, changing index to 0"
-  INDEX=0
-fi
-
-if [[ ${HAS_HWLOC} -ne 0 ]]; then
-  echo "hwloc not found, no process binding will occur"
-  DISTRIBUTE=1
-  INDEX=0
-fi
-
-if [[ ${HAS_HWLOC} -eq 0 ]]; then
-
-  if [[ "${CURRENT_CPUSET}" == "" ]]; then
-    BINDING=$(hwloc-calc ${PROC_BIND})
-  else 
-    BINDING=$(hwloc-calc --restrict ${CURRENT_CPUSET} ${PROC_BIND})
-  fi
-
-  CPUSETS=($(hwloc-distrib --restrict ${BINDING} --at core ${DISTRIBUTE}))
-  CPUSET=${CPUSETS[${INDEX}]}
-  NUM_THREADS=$(hwloc-ls --restrict ${CPUSET} --only pu | wc -l)
-
-  if [[ "${VERBOSE}" == "True" ]]; then
-    echo "hwloc:         true"
-    echo "  proc_bind:     ${PROC_BIND}"
-    echo "  distribute:    ${DISTRIBUTE}"
-    echo "  index:         ${INDEX}"
-    echo "  parent_cpuset: ${CURRENT_CPUSET}"
-    echo "  cpuset:        ${CPUSET}"
-    echo "omp_num_threads: ${NUM_THREADS}"
-    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
-    echo "omp_nested:      ${OPENMP_NESTED}"
-    echo "OpenMP:          ${OPENMP_VERSION}"
-  fi
-
-  # set OMP env
-  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
-    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
-      export OMP_PLACES="threads"
-      export OMP_PROC_BIND="spread"
-    else
-      export OMP_PROC_BIND="true"
-      unset OMP_PLACES
-    fi
-  else
-    unset OMP_PLACES
-    unset OMP_PROC_BIND
-  fi
-  if [[ "${OPENMP_NESTED}" == "True" ]]; then
-    export OMP_NESTED="true"
-  else
-    export OMP_NESTED="false"
-  fi
-  export OMP_NUM_THREADS="${NUM_THREADS}"
-
-  hwloc-bind ${CPUSET} -- $@
-else
-  NUM_THREADS=$(cat /proc/cpuinfo | grep -c processor)
-
-  if [[ "${VERBOSE}" == "True" ]]; then
-    echo "hwloc:           false"
-    echo "omp_num_threads: ${NUM_THREADS}"
-    echo "omp_proc_bind:   ${OPENMP_PROC_BIND}"
-    echo "omp_nested:      ${OPENMP_NESTED}"
-    echo "OpenMP:          ${OPENMP_VERSION}"
-  fi
-    
-  # set OMP env
-  if [[ "${OPENMP_PROC_BIND}" == "True" ]]; then
-    if [[ "${OPENMP_VERSION}" == "4.0" || "${OPENMP_VERSION}" > "4.0" ]]; then
-      export OMP_PLACES="threads"
-      export OMP_PROC_BIND="spread"
-    else
-      export OMP_PROC_BIND="true"
-      unset OMP_PLACES
-    fi
-  else
-    unset OMP_PLACES
-    unset OMP_PROC_BIND
-  fi
-  if [[ "${OPENMP_NESTED}" == "True" ]]; then
-    export OMP_NESTED="true"
-  else
-    export OMP_NESTED="false"
-  fi
-  export OMP_NUM_THREADS="${NUM_THREADS}"
-
-  eval $@
-fi
-
diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper
index 09fa5d500a..76e33f3c66 100755
--- a/lib/kokkos/bin/nvcc_wrapper
+++ b/lib/kokkos/bin/nvcc_wrapper
@@ -78,6 +78,9 @@ temp_dir=${TMPDIR:-/tmp}
 # Check if we have an optimization argument already
 optimization_applied=0
 
+# Check if we have -std=c++X  or --std=c++X already
+stdcxx_applied=0
+
 #echo "Arguments: $# $@"
 
 while [ $# -gt 0 ]
@@ -130,10 +133,16 @@ do
     cuda_args="$cuda_args $1 $2"
     shift
     ;;
-  #Handle c++11 setting
-  --std=c++11|-std=c++11)
-    shared_args="$shared_args $1"
+  #Handle c++11, c++14 and c++1z: forward only the first -std flag seen, warn on any later one
+  --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z)
+    if [ $stdcxx_applied -eq 1 ]; then
+       echo "nvcc_wrapper - *warning* you have set multiple standard flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting"
+    else
+       shared_args="$shared_args $1"
+       stdcxx_applied=1
+    fi
     ;;
+
   #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98
   -std=c++98|--std=c++98)
     ;;
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index 96b05c02e1..6f9ca897d9 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -9,3 +9,4 @@ tag:  2.03.00    date: 04:25:2017    master: 120d9ce7    develop: 015ba641
 tag:  2.03.05    date: 05:27:2017    master: 36b92f43    develop: 79073186
 tag:  2.03.13    date: 07:27:2017    master: da314444    develop: 29ccb58a
 tag:  2.04.00    date: 08:16:2017    master: 54eb75c0    develop: 32fb8ee1
+tag:  2.04.04    date: 09:11:2017    master: 2b7e9c20    develop: 51e7b25a
diff --git a/lib/kokkos/config/trilinos-integration/checkin-test b/lib/kokkos/config/trilinos-integration/checkin-test
index 92a1b1c068..ffb565fcbb 100644
--- a/lib/kokkos/config/trilinos-integration/checkin-test
+++ b/lib/kokkos/config/trilinos-integration/checkin-test
@@ -1,4 +1,4 @@
 module purge
-module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.59.0/base
+module load sems-env sems-gcc/4.9.3 sems-openmpi/1.10.1 sems-hdf5/1.8.12/parallel sems-netcdf/4.3.2/parallel sems-python/2.7.9 sems-zlib/1.2.8/base sems-cmake/3.5.2 sems-parmetis/4.0.3/64bit_parallel sems-scotch/6.0.3/nopthread_64bit_parallel sems-boost/1.63.0/base sems-yaml_cpp sems-superlu
 
 #Run Trilinos CheckinTest
diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
index 0408472c68..996b6b5610 100644
--- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -125,6 +125,123 @@ namespace Impl {
   };
 }
 
+/// \class GraphRowViewConst
+/// \brief View of a row of a sparse graph.
+/// \tparam GraphType Sparse graph type, such as (but not limited to) StaticCrsGraph.
+///
+/// This class provides a generic view of a row of a sparse graph.
+/// We intended this class to view a row of a StaticCrsGraph, but
+/// GraphType need not necessarily be StaticCrsGraph.
+///
+/// The row view is suited for computational kernels that only read the
+/// column indices of a row (e.g., sparse matrix-vector multiply).  The
+/// view is always const: it does not allow modification of the graph.
+///
+/// Here is an example loop over the entries in the row:
+/// \code
+/// typedef typename GraphType::data_type index_type; // non-const counter type
+///
+/// GraphRowViewConst<GraphType> G_i = ...;
+/// const index_type numEntries = G_i.length;
+/// for (index_type k = 0; k < numEntries; ++k) {
+///   const index_type j = G_i.colidx (k);
+///   // ... do something with column index j ...
+/// }
+/// \endcode
+///
+/// GraphType must provide the \c data_type and \c entries_type
+/// typedefs. In addition, it must make sense to use GraphRowViewConst to
+/// view a row of GraphType. In particular, column
+/// indices of a row must be accessible using the <tt>entries</tt>
+/// resp. <tt>colidx</tt> arrays given to the constructor of this
+/// class, with a constant <tt>stride</tt> between successive entries.
+/// The stride is one for the compressed sparse row storage format (as
+/// is used by StaticCrsGraph), but may be greater than one for other
+/// sparse matrix storage formats (e.g., ELLPACK or jagged diagonal).
+template<class GraphType>
+struct GraphRowViewConst {
+  //! The (const-qualified) type of the column indices in the row.
+  typedef const typename GraphType::data_type ordinal_type;
+
+private:
+  //! Pointer to the first (local) column index in the row.
+  ordinal_type* colidx_;
+  /// \brief Stride between successive entries in the row.
+  ///
+  /// For compressed sparse row (CSR) storage, this is always one.
+  /// This might be greater than one for storage formats like ELLPACK
+  /// or Jagged Diagonal.  Nevertheless, the stride can never be
+  /// greater than the number of rows or columns in the graph.  Thus,
+  /// \c ordinal_type is the correct type.
+  const ordinal_type stride_;
+
+public:
+  /// \brief Constructor
+  ///
+  /// \param colidx_in [in] Pointer to the row's first column index
+  ///   (StaticCrsGraph::rowConst passes NULL for an empty row).
+  /// \param stride [in] (Constant) stride between successive entries
+  ///   of \c colidx_in.
+  /// \param count [in] Number of entries in the row.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( ordinal_type* const colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count) :
+    colidx_ (colidx_in), stride_ (stride), length (count)
+  {}
+
+  /// \brief Constructor with offset into \c colidx_in array
+  ///
+  /// \param colidx_in [in] Array of column indices containing the row.
+  /// \param stride [in] (Constant) stride between successive entries
+  ///   of the row within \c colidx_in.
+  /// \param count [in] Number of entries in the row.
+  /// \param idx [in] Start offset of the row in \c colidx_in
+  ///
+  /// \tparam OffsetType The type of \c idx (see above).  Must be an
+  ///   integral type (enforced by the trailing enable_if parameter);
+  ///   it may differ from ordinal_type.  For example, the graph may have
+  ///   dimensions that fit in int but more entries than fit in int.
+  template<class OffsetType>
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst ( const typename GraphType::entries_type& colidx_in,
+                      const ordinal_type& stride,
+                      const ordinal_type& count,
+                      const OffsetType& idx,
+                      const typename std::enable_if<std::is_integral<OffsetType>::value, int>::type& = 0) :
+    colidx_ (&colidx_in(idx)), stride_ (stride), length (count)
+  {}
+
+  /// \brief Number of entries in the row.
+  ///
+  /// This is a public const field rather than a public const method,
+  /// in order to avoid possible overhead of a method call if the
+  /// compiler is unable to inline that method call.
+  ///
+  /// We assume that rows contain no duplicate entries (i.e., entries
+  /// with the same column index).  Thus, a row may have at most as
+  /// many entries as the graph has columns.  This means that the
+  /// correct type of 'length' is ordinal_type.
+  const ordinal_type length;
+
+  /// \brief (Const) reference to the column index of entry i in this
+  ///   row of the sparse graph.
+  ///
+  /// "Entry i" is not necessarily the entry with column index i, nor
+  /// does i necessarily correspond to the (local) row index.
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& colidx (const ordinal_type& i) const {
+    return colidx_[i*stride_];
+  }
+
+  /// \brief An alias for colidx
+  KOKKOS_INLINE_FUNCTION
+  ordinal_type& operator()(const ordinal_type& i) const {
+    return colidx(i);
+  }
+};
+
+
 /// \class StaticCrsGraph
 /// \brief Compressed row storage array.
 ///
@@ -218,6 +335,38 @@ public:
       static_cast<size_type> (0);
   }
 
+  /// \brief Return a const view of row i of the graph.
+  ///
+  /// If row i has no entries, return an empty view; i is not range-checked here.
+  ///
+  /// The returned object \c view implements the following interface:
+  /// <ul>
+  /// <li> \c view.length is the number of entries in the row </li>
+  /// <li> \c view.colidx(k) returns a const reference to the
+  ///      column index of the k-th entry in the row </li>
+  /// </ul>
+  /// k is not a column index; it just counts from 0 to
+  /// <tt>view.length - 1</tt>.
+  ///
+  /// Users should not rely on the return type of this method.  They
+  /// should instead assign to 'auto'.  That allows compile-time
+  /// polymorphism for different kinds of sparse matrix formats (e.g.,
+  /// ELLPACK or Jagged Diagonal) that we may wish to support in the
+  /// future.
+  KOKKOS_INLINE_FUNCTION
+  GraphRowViewConst<StaticCrsGraph> rowConst (const data_type i) const {
+    const size_type start = row_map(i);
+    // count is guaranteed to fit in data_type, as long as no row
+    // has duplicate entries.
+    const data_type count = static_cast<data_type> (row_map(i+1) - start);
+
+    if (count == 0) {
+      return GraphRowViewConst<StaticCrsGraph> (NULL, 1, 0);
+    } else {
+      return GraphRowViewConst<StaticCrsGraph> (entries, 1, count, start);
+    }
+  }
+
   /**  \brief  Create a row partitioning into a given number of blocks
    *           balancing non-zeros + a fixed cost per row.
    */
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
index 46321378d9..c184c14d07 100644
--- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp
@@ -91,11 +91,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
     // LL
     if (RP::inner_direction == RP::Left) {
       for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
         if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
           for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
             if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
               m_func(offset_0 , offset_1);
             }
@@ -106,11 +106,11 @@ struct DeviceIterateTile<2,RP,Functor,void >
     // LR
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
               m_func(offset_0 , offset_1);
             }
@@ -143,11 +143,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
     if (RP::inner_direction == RP::Left) {
       // Loop over size maxnumblocks until full range covered
       for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+        const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
         if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
           for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+            const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
             if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
               m_func(Tag(), offset_0 , offset_1);
             }
@@ -157,11 +157,11 @@ struct DeviceIterateTile<2,RP,Functor,Tag>
     }
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
               m_func(Tag(), offset_0 , offset_1);
             }
@@ -196,15 +196,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
     // LL
     if (RP::inner_direction == RP::Left) {
       for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
         if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
                 if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
                   m_func(offset_0 , offset_1 , offset_2);
                 }
@@ -217,15 +217,15 @@ struct DeviceIterateTile<3,RP,Functor,void >
     // LR
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
                   m_func(offset_0 , offset_1 , offset_2);
                 }
@@ -259,15 +259,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
   {
     if (RP::inner_direction == RP::Left) {
       for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+        const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
         if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+                const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
                 if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
                   m_func(Tag(), offset_0 , offset_1 , offset_2);
                 }
@@ -279,15 +279,15 @@ struct DeviceIterateTile<3,RP,Functor,Tag>
     }
     else {
       for ( index_type tile_id0 = (index_type)blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) {
-        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x;
+        const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && (index_type)threadIdx.x < m_rp.m_tile[0] ) {
 
           for ( index_type tile_id1 = (index_type)blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y;
+            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && (index_type)threadIdx.y < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.z < m_rp.m_tile[2] ) {
                   m_func(Tag(), offset_0 , offset_1 , offset_2);
                 }
@@ -340,19 +340,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
       const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
 
       for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
         if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
 
           for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
             if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
               for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                 if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                   for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                     if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                       m_func(offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -378,19 +378,19 @@ struct DeviceIterateTile<4,RP,Functor,void >
       const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
                   for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                       m_func(offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -442,19 +442,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
       const index_type thr_id1 = (index_type)threadIdx.x / m_rp.m_tile[0];
 
       for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+        const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
         if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
 
           for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+            const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
             if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
               for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                 if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                   for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                    const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                     if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                       m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -479,19 +479,19 @@ struct DeviceIterateTile<4,RP,Functor,Tag>
       const index_type thr_id1 = (index_type)threadIdx.x % m_rp.m_tile[1];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type tile_id2 = (index_type)blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) {
-                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y;
+                const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && (index_type)threadIdx.y < m_rp.m_tile[2] ) {
 
                   for ( index_type tile_id3 = (index_type)blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) {
-                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z;
+                    const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && (index_type)threadIdx.z < m_rp.m_tile[3] ) {
                       m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3);
                     }
@@ -558,23 +558,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
       const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
 
       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
 
           for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
             if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                     if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                       for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                           m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                         }
@@ -613,23 +613,23 @@ struct DeviceIterateTile<5,RP,Functor,void >
       const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                           m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                         }
@@ -695,23 +695,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
       const index_type thr_id3 = (index_type)threadIdx.y / m_rp.m_tile[2];
 
       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
 
           for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+            const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
             if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                    const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                     if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                       for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                           m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4);
                         }
@@ -750,23 +750,23 @@ struct DeviceIterateTile<5,RP,Functor,Tag>
       const index_type thr_id3 = (index_type)threadIdx.y % m_rp.m_tile[3];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type tile_id4 = (index_type)blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) {
-                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z;
+                        const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && (index_type)threadIdx.z < m_rp.m_tile[4] ) {
                           m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4);
                         }
@@ -845,27 +845,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
       const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
 
       for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
         if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
 
           for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
             if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
               for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                 if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                   for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                     if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                       for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                         if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                           for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                             if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                               m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                             }
@@ -917,27 +917,27 @@ struct DeviceIterateTile<6,RP,Functor,void >
       const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
                           for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                             if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                               m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                             }
@@ -1016,27 +1016,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
       const index_type thr_id5 = (index_type)threadIdx.z / m_rp.m_tile[4];
 
       for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+        const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
         if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
 
           for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+            const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
             if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
               for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                 if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                   for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                    const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                     if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                       for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+                        const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
                         if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
                           for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+                            const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
                             if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
                               m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5);
                             }
@@ -1088,27 +1088,27 @@ struct DeviceIterateTile<6,RP,Functor,Tag>
       const index_type thr_id5 = (index_type)threadIdx.z % m_rp.m_tile[5];
 
       for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) {
-        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0;
+        const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0];
         if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) {
 
           for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) {
-            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1;
+            const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1];
             if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) {
 
               for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) {
-                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2;
+                const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2];
                 if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) {
 
                   for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) {
-                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3;
+                    const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3];
                     if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) {
 
                       for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) {
-                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4;
+                        const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4];
                         if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) {
 
                           for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) {
-                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5;
+                            const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5];
                             if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) {
                               m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5);
                             }
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
index cae8ecd489..079d9f0889 100644
--- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory()
 
 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM)
 static void cuda_parallel_launch_constant_memory()
 {
   const DriverType & driver =
@@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver )
 
 template< class DriverType, unsigned int maxTperB, unsigned int minBperSM >
 __global__
-__launch_bounds__(maxTperB, minBperSM)
+//__launch_bounds__(maxTperB, minBperSM)
 static void cuda_parallel_launch_local_memory( const DriverType driver )
 {
   driver();
diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp
index 26b47a8b74..f8355f0d06 100644
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@@ -242,45 +242,89 @@ public:
     re_ = v;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const complex<RealType>& src) {
+  complex<RealType>&
+  operator += (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ += src.re_;
     im_ += src.im_;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile complex<RealType>& src) volatile {
+  void
+  operator += (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ += src.re_;
     im_ += src.im_;
   }
 
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator += (const RealType& src) {
+  complex<RealType>&
+  operator += (const std::complex<RealType>& src) {
+    re_ += src.real();
+    im_ += src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator += (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ += src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator += (const volatile RealType& src) volatile {
+  void
+  operator += (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ += src;
   }
-
+  
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator -= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ -= src.re_;
     im_ -= src.im_;
     return *this;
   }
 
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator -= (const RealType& src) {
+  complex<RealType>&
+  operator -= (const std::complex<RealType>& src) {
+    re_ -= src.real();
+    im_ -= src.imag();
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator -= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ -= src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const complex<RealType>& src) {
+  complex<RealType>&
+  operator *= (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
     re_ = realPart;
@@ -288,8 +332,12 @@ public:
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile complex<RealType>& src) volatile {
+  void
+  operator *= (const volatile complex<InputRealType>& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     const RealType realPart = re_ * src.re_ - im_ * src.im_;
     const RealType imagPart = re_ * src.im_ + im_ * src.re_;
     re_ = realPart;
@@ -297,20 +345,70 @@ public:
   }
 
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator *= (const RealType& src) {
+  complex<RealType>&
+  operator *= (const std::complex<RealType>& src) {
+    const RealType realPart = re_ * src.real() - im_ * src.imag();
+    const RealType imagPart = re_ * src.imag() + im_ * src.real();
+    re_ = realPart;
+    im_ = imagPart;
+    return *this;
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator *= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ *= src;
     im_ *= src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  void operator *= (const volatile RealType& src) volatile {
+  void
+  operator *= (const volatile InputRealType& src) volatile {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
     re_ *= src;
     im_ *= src;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const complex<RealType>& y) {
+  complex<RealType>&
+  operator /= (const complex<InputRealType>& y) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+    // Read y only via real()/imag(): y is a different instantiation when InputRealType != RealType.
+    // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
+    // If the real part is +/-Inf and the imaginary part is -/+Inf,
+    // this won't change the result.
+    const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
+
+    // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
+    // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
+    // because y/s is NaN.
+    if (s == 0.0) {
+      this->re_ /= s;
+      this->im_ /= s;
+    }
+    else {
+      const complex<RealType> x_scaled (this->re_ / s, this->im_ / s);
+      const complex<RealType> y_conj_scaled (y.real () / s, -(y.imag ()) / s);
+      const RealType y_scaled_abs = y_conj_scaled.re_ * y_conj_scaled.re_ +
+        y_conj_scaled.im_ * y_conj_scaled.im_; // abs(y) == abs(conj(y))
+      *this = x_scaled * y_conj_scaled;
+      *this /= y_scaled_abs;
+    }
+    return *this;
+  }
+  
+  KOKKOS_INLINE_FUNCTION
+  complex<RealType>&
+  operator /= (const std::complex<RealType>& y) {
+
     // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
     // If the real part is +/-Inf and the imaginary part is -/+Inf,
     // this won't change the result.
@@ -334,57 +432,95 @@ public:
     return *this;
   }
 
+
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  complex<RealType>& operator /= (const RealType& src) {
+  complex<RealType>&
+  operator /= (const InputRealType& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
     re_ /= src;
     im_ /= src;
     return *this;
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const complex<RealType>& src) {
-    return (re_ == src.re_) && (im_ == src.im_);
+  bool
+  operator == (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src.re_)) && (im_ == static_cast<RealType>(src.im_));
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator == (const RealType src) {
-    return (re_ == src) && (im_ == RealType(0));
+  bool
+  operator == (const std::complex<RealType>& src) {
+    return (re_ == src.real()) && (im_ == src.imag());
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator == (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ == static_cast<RealType>(src)) && (im_ == RealType(0));
+  }
+
+  template<typename InputRealType>
+  KOKKOS_INLINE_FUNCTION
+  bool
+  operator != (const complex<InputRealType>& src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
+
+    return (re_ != static_cast<RealType>(src.re_)) || (im_ != static_cast<RealType>(src.im_));
   }
 
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const complex<RealType>& src) {
-    return (re_ != src.re_) || (im_ != src.im_);
+  bool
+  operator != (const std::complex<RealType>& src) {
+    return (re_ != src.real()) || (im_ != src.imag());
   }
 
+  template<typename InputRealType>
   KOKKOS_INLINE_FUNCTION
-  bool operator != (const RealType src) {
-    return (re_ != src) || (im_ != RealType(0));
-  }
+  bool
+  operator != (const InputRealType src) {
+    static_assert(std::is_convertible<InputRealType,RealType>::value, 
+                  "InputRealType must be convertible to RealType");
 
+    return (re_ != static_cast<RealType>(src)) || (im_ != RealType(0));
+  }
+  
 };
 
 //! Binary + operator for complex complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () + y.real (), x.imag () + y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type > (x.real () + y.real (), x.imag () + y.imag ());
 }
 
 //! Binary + operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () + y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () + y , x.imag ());
 }
 
 //! Binary + operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator + (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x + y.real (), y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator + (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x + y.real (), y.imag ());
 }
 
 //! Unary + operator for complex.
@@ -396,27 +532,27 @@ operator + (const complex<RealType>& x) {
 }
 
 //! Binary - operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () - y.real (), x.imag () - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y.real (), x.imag () - y.imag ());
 }
 
 //! Binary - operator for complex scalar.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const complex<RealType>& x, const RealType& y) {
-  return complex<RealType> (x.real () - y , x.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const complex<RealType1>& x, const RealType2& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () - y , x.imag ());
 }
 
 //! Binary - operator for scalar complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator - (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x - y.real (), - y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator - (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x - y.real (), - y.imag ());
 }
 
 //! Unary - operator for complex.
@@ -428,12 +564,12 @@ operator - (const complex<RealType>& x) {
 }
 
 //! Binary * operator for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }
 
 /// \brief Binary * operator for std::complex and complex.
@@ -446,33 +582,34 @@ operator * (const complex<RealType>& x, const complex<RealType>& y) {
 /// This function cannot be called in a CUDA device function, because
 /// std::complex's methods and nonmember functions are not marked as
 /// CUDA device functions.
-template<class RealType>
-complex<RealType>
-operator * (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return complex<RealType> (x.real () * y.real () - x.imag () * y.imag (),
-                            x.real () * y.imag () + x.imag () * y.real ());
+template<class RealType1, class RealType2>
+inline
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x.real () * y.real () - x.imag () * y.imag (),
+                                                                        x.real () * y.imag () + x.imag () * y.real ());
 }
 
 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const RealType& x, const complex<RealType>& y) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const RealType1& x, const complex<RealType2>& y) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }
 
 /// \brief Binary * operator for RealType times complex.
 ///
 /// This function exists because the compiler doesn't know that
 /// RealType and complex<RealType> commute with respect to operator*.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator * (const complex<RealType>& y, const RealType& x) {
-  return complex<RealType> (x * y.real (), x * y.imag ());
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator * (const complex<RealType1>& y, const RealType2& x) {
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x * y.real (), x * y.imag ());
 }
 
 //! Imaginary part of a complex number.
@@ -539,33 +676,34 @@ complex<RealType> pow (const complex<RealType>& x) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const complex<RealType1>& x, const RealType2& y) {
-  return complex<RealType1> (real (x) / y, imag (x) / y);
+  return complex<typename std::common_type<RealType1,RealType2>::type> (real (x) / y, imag (x) / y);
 }
 
 //! Binary operator / for complex.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType>
-operator / (const complex<RealType>& x, const complex<RealType>& y) {
+complex<typename std::common_type<RealType1,RealType2>::type>
+operator / (const complex<RealType1>& x, const complex<RealType2>& y) {
   // Scale (by the "1-norm" of y) to avoid unwarranted overflow.
   // If the real part is +/-Inf and the imaginary part is -/+Inf,
   // this won't change the result.
-  const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type; // every intermediate below uses this type
+  const common_real_type s = std::fabs (real (y)) + std::fabs (imag (y));
 
   // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
   // In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
   // because y/s is NaN.
   if (s == 0.0) {
-    return complex<RealType> (real (x) / s, imag (x) / s);
+    return complex<common_real_type> (real (x) / s, imag (x) / s);
   }
   else {
-    const complex<RealType> x_scaled (real (x) / s, imag (x) / s);
-    const complex<RealType> y_conj_scaled (real (y) / s, -imag (y) / s);
-    const RealType y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
+    const complex<common_real_type> x_scaled (real (x) / s, imag (x) / s);
+    const complex<common_real_type> y_conj_scaled (real (y) / s, -imag (y) / s);
+    const common_real_type y_scaled_abs = real (y_conj_scaled) * real (y_conj_scaled) +
       imag (y_conj_scaled) * imag (y_conj_scaled); // abs(y) == abs(conj(y))
-    complex<RealType> result = x_scaled * y_conj_scaled;
+    complex<common_real_type> result = x_scaled * y_conj_scaled;
     result /= y_scaled_abs;
     return result;
   }
@@ -574,16 +712,19 @@ operator / (const complex<RealType>& x, const complex<RealType>& y) {
 //! Binary operator / for complex and real numbers
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-complex<RealType1>
+complex<typename std::common_type<RealType1,RealType2>::type>
 operator / (const RealType1& x, const complex<RealType2>& y) {
-  return complex<RealType1> (x)/y;
+  return complex<typename std::common_type<RealType1,RealType2>::type> (x)/y;
 }
 
 //! Equality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) == real (y) && imag (x) == imag (y);
+bool
+operator == (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(imag (y)) );
 }
 
 /// \brief Equality operator for std::complex and Kokkos::complex.
@@ -592,50 +733,68 @@ bool operator == (const complex<RealType>& x, const complex<RealType>& y) {
 /// Otherwise, CUDA builds will give compiler warnings ("warning:
 /// calling a constexpr __host__ function("real") from a __host__
 /// __device__ function("operator==") is not allowed").
-template<class RealType>
-bool operator == (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) == real (y) && std::imag (x) == imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator == (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) == static_cast<common_real_type>(real (y)) && 
+           static_cast<common_real_type>(std::imag (x)) == static_cast<common_real_type>(imag (y)) );
 }
-
+  
 //! Equality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) == y && imag (x) == static_cast<RealType1> (0.0);
+bool
+operator == (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) == static_cast<common_real_type>(y) && 
+           static_cast<common_real_type>(imag (x)) == static_cast<common_real_type>(0.0) );
 }
 
 //! Equality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator == (const RealType& x, const complex<RealType>& y) {
+bool
+operator == (const RealType1& x, const complex<RealType2>& y) {
   return y == x;
 }
 
 //! Inequality operator for two complex numbers.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType>& x, const complex<RealType>& y) {
-  return real (x) != real (y) || imag (x) != imag (y);
+bool
+operator != (const complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(imag (y)) );
 }
 
 //! Inequality operator for std::complex and Kokkos::complex.
-template<class RealType>
-KOKKOS_INLINE_FUNCTION
-bool operator != (const std::complex<RealType>& x, const complex<RealType>& y) {
-  return std::real (x) != real (y) || std::imag (x) != imag (y);
+template<class RealType1, class RealType2>
+inline
+bool
+operator != (const std::complex<RealType1>& x, const complex<RealType2>& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(std::real (x)) != static_cast<common_real_type>(real (y)) || 
+           static_cast<common_real_type>(std::imag (x)) != static_cast<common_real_type>(imag (y)) );
 }
 
 //! Inequality operator for complex and real number.
 template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const complex<RealType1>& x, const RealType2& y) {
-  return real (x) != y || imag (x) != static_cast<RealType1> (0.0);
+bool
+operator != (const complex<RealType1>& x, const RealType2& y) {
+  typedef typename std::common_type<RealType1,RealType2>::type common_real_type;
+  return ( static_cast<common_real_type>(real (x)) != static_cast<common_real_type>(y) || 
+           static_cast<common_real_type>(imag (x)) != static_cast<common_real_type>(0.0) );
 }
 
 //! Inequality operator for real and complex number.
-template<class RealType>
+template<class RealType1, class RealType2>
 KOKKOS_INLINE_FUNCTION
-bool operator != (const RealType& x, const complex<RealType>& y) {
+bool
+operator != (const RealType1& x, const complex<RealType2>& y) {
   return y != x;
 }
 
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
index f089c16ad2..b9c131cd7a 100644
--- a/lib/kokkos/core/src/Kokkos_Crs.hpp
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -353,7 +353,14 @@ struct CountAndFill {
   struct Fill {};
   KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
     auto j = m_crs.row_map(i);
-    data_type* fill = &(m_crs.entries(j));
+    /* we don't want to access entries(entries.size()), even if it's just to get its
+       address and never use it.
+       this can happen when row (i) is empty and all rows after it are also empty.
+       we could compare to row_map(i + 1), but that is a read from global memory,
+       whereas dimension_0() should be part of the View in registers (or constant memory) */
+    data_type* fill =
+      (j == static_cast<decltype(j)>(m_crs.entries.dimension_0())) ?
+      nullptr : (&(m_crs.entries(j)));
     m_functor(i, fill);
   }
   using self_type = CountAndFill<CrsType, Functor>;
diff --git a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
index 9c9af0dd8b..b811751a2c 100644
--- a/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
+++ b/lib/kokkos/core/src/Kokkos_HBWSpace.hpp
@@ -147,12 +147,11 @@ public:
                  , const size_t arg_alloc_size ) const;
 
   /**\brief Return Name of the MemorySpace */
-  static constexpr const char* name();
+  static constexpr const char* name() { return "HBW"; }
 
 private:
 
   AllocationMechanism  m_alloc_mech;
-  static constexpr const char* m_name = "HBW";
   friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
index 339571941d..a825fd54d3 100644
--- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
+++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp
@@ -192,7 +192,7 @@ template<>
 struct reduction_identity<float> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float sum()  {return static_cast<float>(0.0f);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float prod() {return static_cast<float>(1.0f);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return FLT_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static float max()  {return -FLT_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static float min()  {return FLT_MAX;}
 };
 
@@ -200,7 +200,7 @@ template<>
 struct reduction_identity<double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double sum()  {return static_cast<double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double prod() {return static_cast<double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return DBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static double max()  {return -DBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static double min()  {return DBL_MAX;}
 };
 
@@ -208,7 +208,7 @@ template<>
 struct reduction_identity<long double> {
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum()  {return static_cast<long double>(0.0);}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double prod() {return static_cast<long double>(1.0);}
-  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return LDBL_MIN;}
+  KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max()  {return -LDBL_MAX;}
   KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min()  {return LDBL_MAX;}
 };
 
diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp
index b13b0b01de..0118d4667e 100644
--- a/lib/kokkos/core/src/Kokkos_ROCm.hpp
+++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp
@@ -211,6 +211,24 @@ struct VerifyExecutionCanAccessMemorySpace
 } // namespace Kokkos
 
 
+
+#define threadIdx_x (hc_get_workitem_id(0))
+#define threadIdx_y (hc_get_workitem_id(1))
+#define threadIdx_z (hc_get_workitem_id(2))
+
+#define blockIdx_x  (hc_get_group_id(0))
+#define blockIdx_y  (hc_get_group_id(1))
+#define blockIdx_z  (hc_get_group_id(2))
+
+#define blockDim_x  (hc_get_group_size(0))
+#define blockDim_y  (hc_get_group_size(1))
+#define blockDim_z  (hc_get_group_size(2))
+
+#define gridDim_x   (hc_get_num_groups(0))
+#define gridDim_y   (hc_get_num_groups(1))
+#define gridDim_z   (hc_get_num_groups(2))
+
+
 #include <ROCm/Kokkos_ROCm_Parallel.hpp>
 #include <ROCm/Kokkos_ROCm_Task.hpp>
 
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
index 8fb13b8954..a917cf1656 100644
--- a/lib/kokkos/core/src/Makefile
+++ b/lib/kokkos/core/src/Makefile
@@ -88,6 +88,7 @@ build-makefile-kokkos:
 	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
 	echo "" >> Makefile.kokkos
 	echo "#Variables used in application Makefiles" >> Makefile.kokkos
+	echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos
 	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
 	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
 	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
index 37d2ac8318..de84f6e59f 100644
--- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.hpp
@@ -211,6 +211,7 @@ void OpenMP::partition_master( F const& f
                                                  , thread_local_bytes
                                                  );
 
+      omp_set_num_threads(partition_size);
       f( omp_get_thread_num(), omp_get_num_threads() );
 
       Impl::t_openmp_instance->~Exec();
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
index 0b7a1e2583..f2674e5929 100644
--- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp
@@ -113,7 +113,6 @@ void reduce_enqueue(
 
   if (output_length < 1) return;
 
-  assert(output_result != nullptr);
   const auto td = get_tile_desc<T>(szElements,output_length,team_size,vector_size, shared_size);
 
   // allocate host and device memory for the results from each team
@@ -176,14 +175,17 @@ void reduce_enqueue(
       }
       
   });
-  ValueInit::init(ReducerConditional::select(f, reducer), output_result);
+  if (output_result != nullptr)
+     ValueInit::init(ReducerConditional::select(f, reducer), output_result);
   fut.wait();
 
   copy(result,result_cpu.data());
-  for(std::size_t i=0;i<td.num_tiles;i++)
-    ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
+  if (output_result != nullptr) {
+    for(std::size_t i=0;i<td.num_tiles;i++)
+       ValueJoin::join(ReducerConditional::select(f, reducer), output_result, result_cpu.data()+i*output_length);
 
-  ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+    ValueFinal::final( ReducerConditional::select(f, reducer) , output_result );
+  }
 
 }
 
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
index acf75f6f13..c2e85ad112 100644
--- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Scan.hpp
@@ -67,7 +67,7 @@ void scan_enqueue(
     hc::array<value_type> result(td.num_tiles);
     hc::array<value_type> scratch(len);
 
-    tile_for<value_type>(td, [&,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
+    tile_for<value_type>(td, [&,f,len,td](hc::tiled_index<1> t_idx, tile_buffer<value_type> buffer) [[hc]] 
     {
         const auto local = t_idx.local[0];
         const auto global = t_idx.global[0];
@@ -135,7 +135,7 @@ void scan_enqueue(
       ValueJoin::join(f, &result_cpu[i], &result_cpu[i-1]);
 
     copy(result_cpu.data(),result);
-    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,len,td](hc::tiled_index<1> t_idx) [[hc]] 
+    hc::parallel_for_each(hc::extent<1>(len).tile(td.tile_size), [&,f,len,td](hc::tiled_index<1> t_idx) [[hc]] 
     {
 //        const auto local = t_idx.local[0];
         const auto global = t_idx.global[0];
diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
index 3d3029535e..c5e73c8b26 100644
--- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
+++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp
@@ -68,6 +68,8 @@ int bit_first_zero( unsigned i ) noexcept
   return full != i ? _bit_scan_forward( ~i ) : -1 ;
 #elif defined( KOKKOS_COMPILER_IBM )
   return full != i ? __cnttz4( ~i ) : -1 ;
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return full != i ? _popcnt( i ^ (i+1) ) - 1 : -1 ;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return full != i ? __builtin_ffs( ~i ) - 1 : -1 ;
 #else
@@ -90,17 +92,16 @@ int bit_scan_forward( unsigned i )
   return _bit_scan_forward(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __cnttz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? _popcnt(~i & (i-1)) : -1;
 #elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_ffs(i) - 1;
 #else
-  unsigned t = 1u;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t << 1;
-    ++r;
+  int offset = -1;
+  if ( i ) {
+    for ( offset = 0 ; (i & ( 1 << offset ) ) == 0 ; ++offset );
   }
-  return r;
+  return offset;
 #endif
 }
 
@@ -116,17 +117,16 @@ int bit_scan_reverse( unsigned i )
   return _bit_scan_reverse(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return shift - __cntlz4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return i ? shift - _leadz32(i) : 0 ;
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return shift - __builtin_clz(i);
 #else
-  unsigned t = 1u << shift;
-  int r = 0;
-  while ( i && ( i & t == 0 ) )
-  {
-    t = t >> 1;
-    ++r;
+  int offset = 0;
+  if ( i ) {
+    for ( offset = shift ; (i & ( 1 << offset ) ) == 0 ; --offset );
   }
-  return r;
+  return offset;
 #endif
 }
 
@@ -142,6 +142,8 @@ int bit_count( unsigned i )
   return _popcnt32(i);
 #elif defined( KOKKOS_COMPILER_IBM )
   return __popcnt4(i);
+#elif defined( KOKKOS_COMPILER_CRAYC )
+  return _popcnt(i);
 #elif defined( __GNUC__ ) || defined( __GNUG__ )
   return __builtin_popcount(i);
 #else
diff --git a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
index e11f8b6d34..cd0553218d 100644
--- a/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
+++ b/lib/kokkos/core/src/impl/Kokkos_HBWSpace.cpp
@@ -166,10 +166,6 @@ void HBWSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_s
   }
 }
 
-constexpr const char* HBWSpace::name() {
-  return m_name;
-}
-
 } // namespace Experimental
 } // namespace Kokkos
 
diff --git a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp
index ce5537fed3..c7f681699e 100644
--- a/lib/kokkos/core/unit_test/TestComplex.hpp
+++ b/lib/kokkos/core/unit_test/TestComplex.hpp
@@ -114,7 +114,7 @@ struct TestComplexBasicMath {
   typename Kokkos::View<Kokkos::complex<double>*,ExecSpace>::HostMirror h_results;
 
   void testit () {
-    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",20);
+    d_results = Kokkos::View<Kokkos::complex<double>*,ExecSpace>("TestComplexBasicMath",24);
     h_results = Kokkos::create_mirror_view(d_results);
 
     Kokkos::parallel_for(Kokkos::RangePolicy<ExecSpace>(0,1), *this);
@@ -125,6 +125,7 @@ struct TestComplexBasicMath {
     std::complex<double> b(3.25,5.75);
     std::complex<double> d(1.0,2.0);
     double c = 9.3;
+    int e = 2;
 
     std::complex<double> r;
     r = a+b; ASSERT_FLOAT_EQ(h_results(0).real(),  r.real()); ASSERT_FLOAT_EQ(h_results(0).imag(),  r.imag());
@@ -147,6 +148,12 @@ struct TestComplexBasicMath {
     r = c-a; ASSERT_FLOAT_EQ(h_results(17).real(), r.real()); ASSERT_FLOAT_EQ(h_results(17).imag(), r.imag());
     r = c*a; ASSERT_FLOAT_EQ(h_results(18).real(), r.real()); ASSERT_FLOAT_EQ(h_results(18).imag(), r.imag());
     r = c/a; ASSERT_FLOAT_EQ(h_results(19).real(), r.real()); ASSERT_FLOAT_EQ(h_results(19).imag(), r.imag());
+
+    r = a; 
+    /* r = a+e; */ ASSERT_FLOAT_EQ(h_results(20).real(),  r.real()+e); ASSERT_FLOAT_EQ(h_results(20).imag(),  r.imag());
+    /* r = a-e; */ ASSERT_FLOAT_EQ(h_results(21).real(),  r.real()-e); ASSERT_FLOAT_EQ(h_results(21).imag(),  r.imag());
+    /* r = a*e; */ ASSERT_FLOAT_EQ(h_results(22).real(),  r.real()*e); ASSERT_FLOAT_EQ(h_results(22).imag(),  r.imag()*e);
+    /* r = a/e; */ ASSERT_FLOAT_EQ(h_results(23).real(),  r.real()/2); ASSERT_FLOAT_EQ(h_results(23).imag(),  r.imag()/e);
   }
 
   KOKKOS_INLINE_FUNCTION
@@ -190,6 +197,12 @@ struct TestComplexBasicMath {
     d_results(17) = c-a;
     d_results(18) = c*a;
     d_results(19) = c/a;
+
+    int e = 2;
+    d_results(20) = a+e;
+    d_results(21) = a-e;
+    d_results(22) = a*e;
+    d_results(23) = a/e;
   }
 };
 
diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp
index f579ddf02c..fbc3a65c2f 100644
--- a/lib/kokkos/core/unit_test/TestMDRange.hpp
+++ b/lib/kokkos/core/unit_test/TestMDRange.hpp
@@ -286,7 +286,9 @@ struct TestMDRange_2D {
     // Test with reducers - scalar
     {
       typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType<int> > range_type;
-      range_type range( {{ 0, 0 }}, {{ N0, N1 }}, {{ 3, 3 }} );
+      int s0 = 1;
+      int s1 = 1;
+      range_type range( {{ s0, s1 }}, {{ N0, N1 }}, {{ 3, 3 }} );
 
       TestMDRange_2D functor( N0, N1 );
 
@@ -297,7 +299,7 @@ struct TestMDRange_2D {
 
       parallel_reduce( range, functor, reducer_scalar );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) );
     }
     // Test with reducers - scalar view
     {
@@ -445,7 +447,9 @@ struct TestMDRange_2D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
+      const int s0 = 1;
+      const int s1 = 1;
+      range_type range( point_type{ { s0, s1 } }, point_type{ { N0, N1 } }, tile_type{ { 3, 3 } } );
       TestMDRange_2D functor( N0, N1 );
 
       parallel_for( range, functor );
@@ -454,8 +458,8 @@ struct TestMDRange_2D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
       {
         if ( h_view( i, j ) != 3 ) {
           ++counter;
@@ -463,7 +467,7 @@ struct TestMDRange_2D {
       }
 
       if ( counter != 0 ) {
-        printf( "Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Default Layouts + InitTag op(): Errors in test_for2; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
@@ -699,6 +703,7 @@ struct TestMDRange_2D {
 
       ASSERT_EQ( counter, 0 );
     }
+
   } // end test_for2
 }; // MDRange_2D
 
@@ -749,7 +754,10 @@ struct TestMDRange_3D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
 
       TestMDRange_3D functor( N0, N1, N2 );
 
@@ -757,7 +765,7 @@ struct TestMDRange_3D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) );
     }
 
     // Test with reducers - scalar
@@ -952,7 +960,10 @@ struct TestMDRange_3D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      range_type range( point_type{ { s0, s1, s2 } }, point_type{ { N0, N1, N2 } }, tile_type{ { 3, 3, 3 } } );
       TestMDRange_3D functor( N0, N1, N2 );
 
       parallel_for( range, functor );
@@ -961,9 +972,9 @@ struct TestMDRange_3D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
       {
         if ( h_view( i, j, k ) != 3 ) {
           ++counter;
@@ -971,7 +982,7 @@ struct TestMDRange_3D {
       }
 
       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for3; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
@@ -1207,7 +1218,11 @@ struct TestMDRange_4D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 3, 3, 3 } } );
 
       TestMDRange_4D functor( N0, N1, N2, N3 );
 
@@ -1215,7 +1230,7 @@ struct TestMDRange_4D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) );
     }
 
     // Test with reducers - scalar
@@ -1415,7 +1430,11 @@ struct TestMDRange_4D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3 } }, point_type{ { N0, N1, N2, N3 } }, tile_type{ { 3, 11, 3, 3 } } );
       TestMDRange_4D functor( N0, N1, N2, N3 );
 
       parallel_for( range, functor );
@@ -1424,10 +1443,10 @@ struct TestMDRange_4D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
       {
         if ( h_view( i, j, k, l ) != 3 ) {
           ++counter;
@@ -1435,7 +1454,7 @@ struct TestMDRange_4D {
       }
 
       if ( counter != 0 ) {
-        printf("Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
+        printf("Offset Start + Defaults +m_tile > m_upper dim2 InitTag op(): Errors in test_for4; mismatches = %d\n\n",counter);
       }
 
       ASSERT_EQ( counter, 0 );
@@ -1682,7 +1701,12 @@ struct TestMDRange_5D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 3 } } );
 
       TestMDRange_5D functor( N0, N1, N2, N3, N4 );
 
@@ -1690,7 +1714,7 @@ struct TestMDRange_5D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) );
     }
 
     // Test with reducers - scalar
@@ -1810,7 +1834,12 @@ struct TestMDRange_5D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4 } }, point_type{ { N0, N1, N2, N3, N4 } }, tile_type{ { 3, 3, 3, 3, 5 } } );
       TestMDRange_5D functor( N0, N1, N2, N3, N4 );
 
       parallel_for( range, functor );
@@ -1819,11 +1848,11 @@ struct TestMDRange_5D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
       {
         if ( h_view( i, j, k, l, m ) != 3 ) {
           ++counter;
@@ -1831,7 +1860,7 @@ struct TestMDRange_5D {
       }
 
       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for5; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );
@@ -2084,7 +2113,13 @@ struct TestMDRange_6D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 3, 2 } } );
 
       TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
 
@@ -2092,7 +2127,7 @@ struct TestMDRange_6D {
       double sum = 0.0;
       parallel_reduce( range, functor, sum );
 
-      ASSERT_EQ( sum, 2 * N0 * N1 * N2 * N3 * N4 * N5 );
+      ASSERT_EQ( sum, 2 * (N0 - s0) * (N1 - s1) * (N2 - s2) * (N3 - s3) * (N4 - s4) * (N5 - s5) );
     }
 
     // Test with reducers - scalar
@@ -2214,7 +2249,13 @@ struct TestMDRange_6D {
       typedef typename range_type::tile_type tile_type;
       typedef typename range_type::point_type point_type;
 
-      range_type range( point_type{ { 0, 0, 0, 0, 0, 0 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
+      int s0 = 1;
+      int s1 = 1;
+      int s2 = 1;
+      int s3 = 1;
+      int s4 = 1;
+      int s5 = 1;
+      range_type range( point_type{ { s0, s1, s2, s3, s4, s5 } }, point_type{ { N0, N1, N2, N3, N4, N5 } }, tile_type{ { 3, 3, 3, 3, 2, 3 } } ); //tile dims 3,3,3,3,3,3 more than cuda can handle with debugging
       TestMDRange_6D functor( N0, N1, N2, N3, N4, N5 );
 
       parallel_for( range, functor );
@@ -2223,12 +2264,12 @@ struct TestMDRange_6D {
       Kokkos::deep_copy( h_view, functor.input_view );
 
       int counter = 0;
-      for ( int i = 0; i < N0; ++i )
-      for ( int j = 0; j < N1; ++j )
-      for ( int k = 0; k < N2; ++k )
-      for ( int l = 0; l < N3; ++l )
-      for ( int m = 0; m < N4; ++m )
-      for ( int n = 0; n < N5; ++n )
+      for ( int i = s0; i < N0; ++i )
+      for ( int j = s1; j < N1; ++j )
+      for ( int k = s2; k < N2; ++k )
+      for ( int l = s3; l < N3; ++l )
+      for ( int m = s4; m < N4; ++m )
+      for ( int n = s5; n < N5; ++n )
       {
         if ( h_view( i, j, k, l, m, n ) != 3 ) {
           ++counter;
@@ -2236,7 +2277,7 @@ struct TestMDRange_6D {
       }
 
       if ( counter != 0 ) {
-        printf( "Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
+        printf( "Offset Start + Defaults + InitTag op(): Errors in test_for6; mismatches = %d\n\n", counter );
       }
 
       ASSERT_EQ( counter, 0 );

From c522b1b7a974792e0a6a4285b7171a73040774c9 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 4 Oct 2017 00:22:56 -0400
Subject: [PATCH 41/53] add call to fftw_cleanup() before exiting to avoid
 bogus leak reports when compiling with FFTW v3.x

---
 src/main.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/main.cpp b/src/main.cpp
index 7401183fea..82dac5af6d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,6 +22,10 @@
 #include <fenv.h>
 #endif
 
+#ifdef FFT_FFTW3
+#include <fftw3.h>
+#endif
+
 using namespace LAMMPS_NS;
 
 /* ----------------------------------------------------------------------
@@ -62,4 +66,10 @@ int main(int argc, char **argv)
 #endif
   MPI_Barrier(MPI_COMM_WORLD);
   MPI_Finalize();
+
+#ifdef FFT_FFTW3
+  // tell fftw3 to delete its global memory pool
+  // and thus avoid bogus valgrind memory leak reports
+  fftw_cleanup();
+#endif
 }

From bda0ee3aa1ebc3c652bfd83da5f9281394a8a650 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 4 Oct 2017 12:06:03 -0600
Subject: [PATCH 42/53] Destroy unneeded fix in pair_reaxc_kokkos

---
 src/KOKKOS/pair_reaxc_kokkos.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp
index d95cd8f8ae..cd0ebcde05 100644
--- a/src/KOKKOS/pair_reaxc_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxc_kokkos.cpp
@@ -131,6 +131,8 @@ template<class DeviceType>
 void PairReaxCKokkos<DeviceType>::init_style()
 {
   PairReaxC::init_style();
+  if (fix_reax) modify->delete_fix("REAXC"); // not needed in the Kokkos version
+  fix_reax = NULL;
 
   // irequest = neigh request made by parent class
 

From 3653f4012013ac1f04840395ff3c1b821a30b4da Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Wed, 4 Oct 2017 12:10:13 -0600
Subject: [PATCH 43/53] Reduce unnecessary communication in fix_qeq_reax

---
 src/KOKKOS/fix_qeq_reax_kokkos.cpp | 15 ++++++++-------
 src/USER-REAXC/fix_qeq_reax.cpp    |  2 +-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index e54b53ae89..5a1d3c7f1c 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -63,6 +63,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) :
 
   nmax = nmax = m_cap = 0;
   allocated_flag = 0;
+  nprev = 4;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -158,15 +159,15 @@ void FixQEqReaxKokkos<DeviceType>::init_hist()
 {
   int i,j;
 
-  k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,5);
+  k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev);
   d_s_hist = k_s_hist.template view<DeviceType>();
   h_s_hist = k_s_hist.h_view;
-  k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,5);
+  k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev);
   d_t_hist = k_t_hist.template view<DeviceType>();
   h_t_hist = k_t_hist.h_view;
 
   for( i = 0; i < atom->nmax; i++ )
-    for( j = 0; j < 5; j++ )
+    for( j = 0; j < nprev; j++ )
       k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0;
 
   k_s_hist.template modify<LMPHostType>();
@@ -334,11 +335,11 @@ void FixQEqReaxKokkos<DeviceType>::allocate_array()
     d_d = k_d.template view<DeviceType>();
     h_d = k_d.h_view;
 
-    k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,5);
+    k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev);
     d_s_hist = k_s_hist.template view<DeviceType>();
     h_s_hist = k_s_hist.h_view;
 
-    k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,5);
+    k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev);
     d_t_hist = k_t_hist.template view<DeviceType>();
     h_t_hist = k_t_hist.h_view;
   }
@@ -368,7 +369,7 @@ void FixQEqReaxKokkos<DeviceType>::zero_item(int ii) const
     d_o[i] = 0.0;
     d_r[i] = 0.0;
     d_d[i] = 0.0;
-    //for( int j = 0; j < 5; j++ )
+    //for( int j = 0; j < nprev; j++ )
       //d_s_hist(i,j) = d_t_hist(i,j) = 0.0;
   }
 
@@ -1173,7 +1174,7 @@ double FixQEqReaxKokkos<DeviceType>::memory_usage()
 {
   double bytes;
 
-  bytes = atom->nmax*5*2 * sizeof(F_FLOAT); // s_hist & t_hist
+  bytes = atom->nmax*nprev*2 * sizeof(F_FLOAT); // s_hist & t_hist
   bytes += atom->nmax*8 * sizeof(F_FLOAT); // storage
   bytes += n_cap*2 * sizeof(int); // matrix...
   bytes += m_cap * sizeof(int);
diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp
index 9d165f3fd3..33b70c972d 100644
--- a/src/USER-REAXC/fix_qeq_reax.cpp
+++ b/src/USER-REAXC/fix_qeq_reax.cpp
@@ -95,7 +95,7 @@ FixQEqReax::FixQEqReax(LAMMPS *lmp, int narg, char **arg) :
   pack_flag = 0;
   s = NULL;
   t = NULL;
-  nprev = 5;
+  nprev = 4;
 
   Hdia_inv = NULL;
   b_s = NULL;

From 2b0bfcb10f906873f3771193f7cb1374dfd9a39e Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 5 Oct 2017 10:35:09 -0600
Subject: [PATCH 44/53] Fix memory leak in pair_reaxc_kokkos

---
 src/KOKKOS/pair_reaxc_kokkos.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/KOKKOS/pair_reaxc_kokkos.cpp b/src/KOKKOS/pair_reaxc_kokkos.cpp
index cd0ebcde05..d5f83f4537 100644
--- a/src/KOKKOS/pair_reaxc_kokkos.cpp
+++ b/src/KOKKOS/pair_reaxc_kokkos.cpp
@@ -557,8 +557,8 @@ void PairReaxCKokkos<DeviceType>::Deallocate_Lookup_Tables()
 
   ntypes = atom->ntypes;
 
-  for( i = 0; i < ntypes; ++i ) {
-    for( j = i; j < ntypes; ++j )
+  for( i = 0; i <= ntypes; ++i ) {
+    for( j = i; j <= ntypes; ++j )
       if( LR[i][j].n ) {
         sfree( LR[i][j].y, "LR[i,j].y" );
         sfree( LR[i][j].H, "LR[i,j].H" );

From eecd2fbaee77536910031b3f40b455b242c301b1 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 5 Oct 2017 11:23:31 -0600
Subject: [PATCH 45/53] Remove hardcoded value in fix_qeq_reax

---
 src/KOKKOS/fix_qeq_reax_kokkos.cpp | 2 +-
 src/USER-OMP/fix_qeq_reax_omp.cpp  | 2 +-
 src/USER-REAXC/fix_qeq_reax.cpp    | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
index 5a1d3c7f1c..5d2f6a0438 100644
--- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp
+++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp
@@ -1088,7 +1088,7 @@ void FixQEqReaxKokkos<DeviceType>::calculate_q_item(int ii) const
   if (mask[i] & groupbit) {
     q(i) = d_s[i] - delta * d_t[i];
 
-    for (int k = 4; k > 0; --k) {
+    for (int k = nprev-1; k > 0; --k) {
       d_s_hist(i,k) = d_s_hist(i,k-1);
       d_t_hist(i,k) = d_t_hist(i,k-1);
     }
diff --git a/src/USER-OMP/fix_qeq_reax_omp.cpp b/src/USER-OMP/fix_qeq_reax_omp.cpp
index 4457ab6592..d89c9627fe 100644
--- a/src/USER-OMP/fix_qeq_reax_omp.cpp
+++ b/src/USER-OMP/fix_qeq_reax_omp.cpp
@@ -703,7 +703,7 @@ void FixQEqReaxOMP::calculate_Q()
       q[i] = s[i] - u * t[i];
 
       // backup s & t
-      for (int k = 4; k > 0; --k) {
+      for (int k = nprev-1; k > 0; --k) {
         s_hist[i][k] = s_hist[i][k-1];
         t_hist[i][k] = t_hist[i][k-1];
       }
diff --git a/src/USER-REAXC/fix_qeq_reax.cpp b/src/USER-REAXC/fix_qeq_reax.cpp
index 33b70c972d..d1c4f90771 100644
--- a/src/USER-REAXC/fix_qeq_reax.cpp
+++ b/src/USER-REAXC/fix_qeq_reax.cpp
@@ -817,7 +817,7 @@ void FixQEqReax::calculate_Q()
       q[i] = s[i] - u * t[i];
 
       /* backup s & t */
-      for (k = 4; k > 0; --k) {
+      for (k = nprev-1; k > 0; --k) {
         s_hist[i][k] = s_hist[i][k-1];
         t_hist[i][k] = t_hist[i][k-1];
       }

From 6bf2c60c07edefc7e1843b289454ce7ecb645e0a Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 5 Oct 2017 14:58:05 -0600
Subject: [PATCH 46/53] Fix issues in Kokkos comm

---
 src/KOKKOS/comm_kokkos.cpp   | 60 +++++++++++++++++++++++-------------
 src/KOKKOS/verlet_kokkos.cpp |  7 +++--
 2 files changed, 43 insertions(+), 24 deletions(-)

diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index d4d348d7e2..3276d0cdb0 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -135,9 +135,10 @@ void CommKokkos::init()
   if (force->newton == 0) check_reverse = 0;
   if (force->pair) check_reverse += force->pair->comm_reverse_off;
 
-  if (check_reverse || check_forward)
-    forward_comm_classic = true;
+  //if (check_forward)
+  //  forward_comm_classic = true;
 
+  //if (check_reverse || !comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
   if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
     reverse_comm_classic = true;
 }
@@ -186,12 +187,12 @@ void CommKokkos::forward_comm_device(int dummy)
   // if comm_x_only set, exchange or copy directly to x, don't unpack
 
   k_sendlist.sync<DeviceType>();
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
 
   for (int iswap = 0; iswap < nswap; iswap++) {
     if (sendproc[iswap] != me) {
       if (comm_x_only) {
         if (size_forward_recv[iswap]) {
-            atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
             buf = atomKK->k_x.view<DeviceType>().ptr_on_device() +
               firstrecv[iswap]*atomKK->k_x.view<DeviceType>().dimension_1();
             MPI_Irecv(buf,size_forward_recv[iswap],MPI_DOUBLE,
@@ -204,9 +205,11 @@ void CommKokkos::forward_comm_device(int dummy)
                    n,MPI_DOUBLE,sendproc[iswap],0,world);
         }
 
-        if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
-                         space,X_MASK);
+        if (size_forward_recv[iswap]) {
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                           space,X_MASK);
+        }
       } else if (ghost_velocity) {
         error->all(FLERR,"Ghost velocity forward comm not yet "
                    "implemented with Kokkos");
@@ -276,7 +279,7 @@ void CommKokkos::reverse_comm()
   else
     atomKK->modified(Host,ALL_MASK);
 
-  atomKK->sync(Device,ALL_MASK); // is this needed?
+  //atomKK->sync(Device,ALL_MASK); // is this needed?
 }
 
 template<class DeviceType>
@@ -290,9 +293,10 @@ void CommKokkos::reverse_comm_device()
   // exchange data with another proc
   // if other proc is self, just copy
   // if comm_f_only set, exchange or copy directly from f, don't pack
-  
+
   k_sendlist.sync<DeviceType>();
-  
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
+
   for (int iswap = nswap-1; iswap >= 0; iswap--) {
     if (sendproc[iswap] != me) {
       if (comm_f_only) {
@@ -300,16 +304,17 @@ void CommKokkos::reverse_comm_device()
             MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),size_reverse_recv[iswap],MPI_DOUBLE,
                     sendproc[iswap],0,world,&request);
         if (size_reverse_send[iswap]) {
-          atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,F_MASK);
           buf = atomKK->k_f.view<DeviceType>().ptr_on_device() +
             firstrecv[iswap]*atomKK->k_f.view<DeviceType>().dimension_1();
   
           MPI_Send(buf,size_reverse_send[iswap],MPI_DOUBLE,
                    recvproc[iswap],0,world);
         }
-        if (size_reverse_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE);
-        atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
-                         space,F_MASK);
+        if (size_reverse_recv[iswap]) {
+          MPI_Wait(&request,MPI_STATUS_IGNORE);
+          atomKK->modified(ExecutionSpaceFromDevice<DeviceType>::
+                           space,F_MASK);
+        }
       } else {
         if (size_reverse_recv[iswap])
           MPI_Irecv(k_buf_recv.view<DeviceType>().ptr_on_device(),
@@ -710,9 +715,7 @@ void CommKokkos::borders()
   }
 
   atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
   k_sendlist.sync<LMPHostType>();
-  k_sendlist.modify<LMPHostType>();
   CommBrick::borders();
   k_sendlist.modify<LMPHostType>();
   atomKK->modified(Host,ALL_MASK);
@@ -783,7 +786,7 @@ void CommKokkos::borders_device() {
   AtomVecKokkos *avec = (AtomVecKokkos *) atom->avec;
 
   ExecutionSpace exec_space = ExecutionSpaceFromDevice<DeviceType>::space;
-  k_sendlist.modify<DeviceType>();
+  k_sendlist.sync<DeviceType>();
   atomKK->sync(exec_space,ALL_MASK);
 
   // do swaps over all 3 dimensions
@@ -845,20 +848,24 @@ void CommKokkos::borders_device() {
             k_total_send.template modify<DeviceType>();
             k_total_send.template sync<LMPHostType>();
 
+            k_sendlist.modify<DeviceType>();
+
             if(k_total_send.h_view() >= maxsendlist[iswap]) {
               grow_list(iswap,k_total_send.h_view());
-              k_sendlist.modify<DeviceType>();
+
               k_total_send.h_view() = 0;
-              if(exec_space == Device) {
-                k_total_send.template modify<LMPHostType>();
-                k_total_send.template sync<LMPDeviceType>();
-              }
+              k_total_send.template modify<LMPHostType>();
+              k_total_send.template sync<LMPDeviceType>();
+
               BuildBorderListFunctor<DeviceType> f(atomKK->k_x,k_sendlist,
                   k_total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
               Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
               Kokkos::parallel_for(config,f);
+
               k_total_send.template modify<DeviceType>();
               k_total_send.template sync<LMPHostType>();
+
+              k_sendlist.modify<DeviceType>();
             }
             nsend = k_total_send.h_view();
           } else {
@@ -983,7 +990,6 @@ void CommKokkos::borders_device() {
 
   // reset global->local map
 
-  if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
   atomKK->modified(exec_space,ALL_MASK);
   if (map_style) {
     atomKK->sync(Host,TAG_MASK);
@@ -1057,6 +1063,11 @@ void CommKokkos::grow_list(int iswap, int n)
 {
   int size = static_cast<int> (BUFFACTOR * n);
 
+  if (exchange_comm_classic) { // force realloc on Host
+    k_sendlist.sync<LMPHostType>();
+    k_sendlist.modify<LMPHostType>();
+  }
+
   memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
 
   for(int i=0;i<maxswap;i++) {
@@ -1080,6 +1091,11 @@ void CommKokkos::grow_swap(int n)
   maxswap = n;
   int size = MAX(k_sendlist.d_view.dimension_1(),BUFMIN);
 
+  if (exchange_comm_classic) { // force realloc on Host
+    k_sendlist.sync<LMPHostType>();
+    k_sendlist.modify<LMPHostType>();
+  }
+
   memory->grow_kokkos(k_sendlist,sendlist,maxswap,size,"comm:sendlist");
 
   memory->grow(maxsendlist,n,"comm:maxsendlist");
diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index e4a3f857d3..cb9d60f9ca 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -526,8 +526,11 @@ void VerletKokkos::run(int n)
 
     // reverse communication of forces
 
-    if (force->newton) comm->reverse_comm();
-    timer->stamp(Timer::COMM);
+    if (force->newton) {
+      Kokkos::fence();
+      comm->reverse_comm();
+      timer->stamp(Timer::COMM);
+    }
 
     // force modifications, final time integration, diagnostics
 

From 44d2e8ff74d515d601b2cd330c5fb2e724016f3f Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 5 Oct 2017 15:27:00 -0600
Subject: [PATCH 47/53] Add pre_reverse to verlet_kokkos and comment out timer

---
 src/KOKKOS/verlet_kokkos.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/verlet_kokkos.cpp b/src/KOKKOS/verlet_kokkos.cpp
index cb9d60f9ca..adec5ff1bd 100644
--- a/src/KOKKOS/verlet_kokkos.cpp
+++ b/src/KOKKOS/verlet_kokkos.cpp
@@ -294,6 +294,7 @@ void VerletKokkos::run(int n)
   int n_pre_exchange = modify->n_pre_exchange;
   int n_pre_neighbor = modify->n_pre_neighbor;
   int n_pre_force = modify->n_pre_force;
+  int n_pre_reverse = modify->n_pre_reverse;
   int n_post_force = modify->n_post_force;
   int n_end_of_step = modify->n_end_of_step;
 
@@ -304,9 +305,9 @@ void VerletKokkos::run(int n)
 
   f_merge_copy = DAT::t_f_array("VerletKokkos::f_merge_copy",atomKK->k_f.dimension_0());
 
-  static double time = 0.0;
   atomKK->sync(Device,ALL_MASK);
-  Kokkos::Impl::Timer ktimer;
+  //static double time = 0.0;
+  //Kokkos::Impl::Timer ktimer;
 
   timer->init_timeout();
   for (int i = 0; i < n; i++) {
@@ -320,10 +321,10 @@ void VerletKokkos::run(int n)
 
     // initial time integration
 
-    ktimer.reset();
+    //ktimer.reset();
     timer->stamp();
     modify->initial_integrate(vflag);
-    time += ktimer.seconds();
+    //time += ktimer.seconds();
     if (n_post_integrate) modify->post_integrate();
     timer->stamp(Timer::MODIFY);
 
@@ -523,6 +524,10 @@ void VerletKokkos::run(int n)
       atomKK->k_f.modify<LMPDeviceType>();
     }
 
+    if (n_pre_reverse) {
+      modify->pre_reverse(eflag,vflag);
+      timer->stamp(Timer::MODIFY);
+    }
 
     // reverse communication of forces
 

From e0efdd50fab3c2f98c22ebd58e1c63e7be4045b4 Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Thu, 5 Oct 2017 15:47:46 -0600
Subject: [PATCH 48/53] Switch to classic comm if ghost_velocity. The
 check_forward and check_reverse tests aren't necessary because the
 fix/pair/etc. comm is done in a separate routine.

---
 src/KOKKOS/comm_kokkos.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 3276d0cdb0..5534341342 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -135,10 +135,9 @@ void CommKokkos::init()
   if (force->newton == 0) check_reverse = 0;
   if (force->pair) check_reverse += force->pair->comm_reverse_off;
 
-  //if (check_forward)
-  //  forward_comm_classic = true;
+  if (ghost_velocity)
+    forward_comm_classic = true;
 
-  //if (check_reverse || !comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
   if (!comm_f_only) // not all Kokkos atom_vec styles have reverse pack/unpack routines yet
     reverse_comm_classic = true;
 }

From 214c0cfb2b7e83697b3e70db81b12f46eb0d7370 Mon Sep 17 00:00:00 2001
From: Steve Plimpton <sjplimp@sandia.gov>
Date: Thu, 5 Oct 2017 16:44:24 -0600
Subject: [PATCH 49/53] add atom_modify map yes, also timers to create_atoms
 and replicate

---
 doc/src/Section_packages.txt |  2 +-
 doc/src/atom_modify.txt      | 52 +++++++++++++++++++-----------------
 doc/src/fix_nh.txt           | 36 ++++++++++++++-----------
 src/atom.cpp                 |  5 ++--
 src/atom_map.cpp             |  4 +--
 src/create_atoms.cpp         | 16 +++++++++--
 src/output.cpp               |  2 +-
 src/replicate.cpp            | 17 ++++++++++++
 8 files changed, 86 insertions(+), 48 deletions(-)

diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index d9a9fb4163..b0b2d9fa63 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -727,7 +727,7 @@ make lib-latte                          # print help message
 make lib-latte args="-b"                # download and build in lib/latte/LATTE-master
 make lib-latte args="-p $HOME/latte"    # use existing LATTE installation in $HOME/latte
 make lib-latte args="-b -m gfortran"    # download and build in lib/latte and 
-                                        #   copy Makefile.lammps.gfortran to Makefile.lammps
+                                        #   copy Makefile.lammps.gfortran to Makefile.lammps :pre
 
 Note that 3 symbolic (soft) links, "includelink" and "liblink" and
 "filelink", are created in lib/latte to point into the LATTE home dir.
diff --git a/doc/src/atom_modify.txt b/doc/src/atom_modify.txt
index d5c82f16ac..1dc0fa6bfb 100644
--- a/doc/src/atom_modify.txt
+++ b/doc/src/atom_modify.txt
@@ -16,7 +16,7 @@ atom_modify keyword values ... :pre
 one or more keyword/value pairs may be appended :ulb,l
 keyword = {id} or {map} or {first} or {sort} :l
    {id} value = {yes} or {no}
-   {map} value = {array} or {hash}
+   {map} value = {yes} or {array} or {hash}
    {first} value = group-ID = group whose atoms will appear first in internal atom lists
    {sort} values = Nfreq binsize
      Nfreq = sort atoms spatially every this many time steps
@@ -25,8 +25,8 @@ keyword = {id} or {map} or {first} or {sort} :l
 
 [Examples:]
 
-atom_modify map hash
-atom_modify map array sort 10000 2.0
+atom_modify map yes
+atom_modify map hash sort 10000 2.0
 atom_modify first colloid :pre
 
 [Description:]
@@ -62,29 +62,33 @@ switch.  This is described in "Section 2.2"_Section_start.html#start_2
 of the manual.  If atom IDs are not used, they must be specified as 0
 for all atoms, e.g. in a data or restart file.
 
-The {map} keyword determines how atom ID lookup is done for molecular
-atom styles.  Lookups are performed by bond (angle, etc) routines in
-LAMMPS to find the local atom index associated with a global atom ID.
+The {map} keyword determines how atoms with specific IDs are found
+when required.  Examples are the bond (angle, etc) methods which
+need to find the local index of an atom with a specific global ID
+which is a bond (angle, etc) partner.  LAMMPS performs this operation
+efficiently by creating a "map", which is either an {array} or {hash}
+table, as described below.
 
-When the {array} value is used, each processor stores a lookup table
-of length N, where N is the largest atom ID in the system.  This is a
+When the {map} keyword is not specified in your input script, LAMMPS
+only creates a map for "atom_styles"_atom_style.html for molecular
+systems which have permanent bonds (angles, etc).  No map is created
+for atomic systems, since it is normally not needed.  However some
+LAMMPS commands require a map, even for atomic systems, and will
+generate an error if one does not exist.  The {map} keyword thus
+allows you to force the creation of a map.  The {yes} value will
+create either an {array} or {hash} style map, as explained in the next
+paragraph.  The {array} and {hash} values create an array-style or
+hash-style map respectively.
+
+For an {array}-style map, each processor stores a lookup table of
+length N, where N is the largest atom ID in the system.  This is a
 fast, simple method for many simulations, but requires too much memory
-for large simulations.  The {hash} value uses a hash table to perform
-the lookups.  This can be slightly slower than the {array} method, but
-its memory cost is proportional to the number of atoms owned by a
-processor, i.e. N/P when N is the total number of atoms in the system
-and P is the number of processors.
-
-When this setting is not specified in your input script, LAMMPS
-creates a map, if one is needed, as an array or hash.  See the
-discussion of default values below for how LAMMPS chooses which kind
-of map to build.  Note that atomic systems do not normally need to
-create a map.  However, even in this case some LAMMPS commands will
-create a map to find atoms (and then destroy it), or require a
-permanent map.  An example of the former is the "velocity loop
-all"_velocity.html command, which uses a map when looping over all
-atoms and insuring the same velocity values are assigned to an atom
-ID, no matter which processor owns it.
+for large simulations.  For a {hash}-style map, a hash table is
+created on each processor, which finds an atom ID in constant time
+(independent of the global number of atom IDs).  It can be slightly
+slower than the {array} map, but its memory cost is proportional to
+the number of atoms owned by a processor, i.e. N/P when N is the total
+number of atoms in the system and P is the number of processors.
 
 The {first} keyword allows a "group"_group.html to be specified whose
 atoms will be maintained as the first atoms in each processor's list
diff --git a/doc/src/fix_nh.txt b/doc/src/fix_nh.txt
index 8fa30ac222..41d0e6438f 100644
--- a/doc/src/fix_nh.txt
+++ b/doc/src/fix_nh.txt
@@ -393,32 +393,36 @@ thermostatting and barostatting.
 :line
 
 These fixes compute a temperature and pressure each timestep.  To do
-this, the fix creates its own computes of style "temp" and "pressure",
-as if one of these two sets of commands had been issued:
+this, the thermostat and barostat fixes create their own computes of
+style "temp" and "pressure", as if one of these sets of commands had
+been issued:
 
+For fix nvt:
+compute fix-ID_temp group-ID temp :pre
-compute fix-ID_press group-ID pressure fix-ID_temp :pre
 
+For fix npt and fix nph:
 compute fix-ID_temp all temp
 compute fix-ID_press all pressure fix-ID_temp :pre
 
-See the "compute temp"_compute_temp.html and "compute
-pressure"_compute_pressure.html commands for details.  Note that the
-IDs of the new computes are the fix-ID + underscore + "temp" or fix_ID
-+ underscore + "press".  For fix nvt, the group for the new computes
-is the same as the fix group.  For fix nph and fix npt, the group for
-the new computes is "all" since pressure is computed for the entire
-system.
+For fix nvt, the group for the new temperature compute is the same as
+the fix group.  For fix npt and fix nph, the group for both the new
+temperature and pressure compute is "all" since pressure is computed
+for the entire system.  In the case of fix nph, the temperature
+compute is not used for thermostatting, but just for a kinetic-energy
+contribution to the pressure.  See the "compute
+temp"_compute_temp.html and "compute pressure"_compute_pressure.html
+commands for details.  Note that the IDs of the new computes are the
+fix-ID + underscore + "temp" or fix_ID + underscore + "press".
 
 Note that these are NOT the computes used by thermodynamic output (see
 the "thermo_style"_thermo_style.html command) with ID = {thermo_temp}
-and {thermo_press}.  This means you can change the attributes of this
+and {thermo_press}.  This means you can change the attributes of these
 fix's temperature or pressure via the
-"compute_modify"_compute_modify.html command or print this temperature
-or pressure during thermodynamic output via the "thermo_style
-custom"_thermo_style.html command using the appropriate compute-ID.
-It also means that changing attributes of {thermo_temp} or
-{thermo_press} will have no effect on this fix.
+"compute_modify"_compute_modify.html command.  Or you can print this
+temperature or pressure during thermodynamic output via the
+"thermo_style custom"_thermo_style.html command using the appropriate
+compute-ID.  It also means that changing attributes of {thermo_temp}
+or {thermo_press} will have no effect on this fix.
 
 Like other fixes that perform thermostatting, fix nvt and fix npt can
 be used with "compute commands"_compute.html that calculate a
diff --git a/src/atom.cpp b/src/atom.cpp
index 1191f0f2b5..7d343a0807 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -453,12 +453,12 @@ void Atom::create_avec(const char *style, int narg, char **arg, int trysuffix)
   // if molecular system:
   // atom IDs must be defined
   // force atom map to be created
-  // map style may be reset by map_init() and its call to map_style_set()
+  // map style will be reset to array vs hash by map_init()
 
   molecular = avec->molecular;
   if (molecular && tag_enable == 0)
     error->all(FLERR,"Atom IDs must be used for molecular systems");
-  if (molecular) map_style = 1;
+  if (molecular) map_style = 3;
 }
 
 /* ----------------------------------------------------------------------
@@ -593,6 +593,7 @@ void Atom::modify_params(int narg, char **arg)
                    "Atom_modify map command after simulation box is defined");
       if (strcmp(arg[iarg+1],"array") == 0) map_user = 1;
       else if (strcmp(arg[iarg+1],"hash") == 0) map_user = 2;
+      else if (strcmp(arg[iarg+1],"yes") == 0) map_user = 3;
       else error->all(FLERR,"Illegal atom_modify command");
       map_style = map_user;
       iarg += 2;
diff --git a/src/atom_map.cpp b/src/atom_map.cpp
index bbfe014dec..9d257d99de 100644
--- a/src/atom_map.cpp
+++ b/src/atom_map.cpp
@@ -298,12 +298,12 @@ int Atom::map_style_set()
   MPI_Allreduce(&max,&map_tag_max,1,MPI_LMP_TAGINT,MPI_MAX,world);
 
   // set map_style for new map
-  // if user-selected, use that setting
+  // if user-selected to array/hash, use that setting
   // else if map_tag_max > 1M, use hash
   // else use array
 
   int map_style_old = map_style;
-  if (map_user) map_style = map_user;
+  if (map_user == 1 || map_user == 2) map_style = map_user;
   else if (map_tag_max > 1000000) map_style = 2;
   else map_style = 1;
 
diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp
index 04a2df91f8..444b0c5bcd 100644
--- a/src/create_atoms.cpp
+++ b/src/create_atoms.cpp
@@ -343,6 +343,11 @@ void CreateAtoms::command(int narg, char **arg)
     }
   }
 
+  // CPU time
+
+  MPI_Barrier(world);
+  double time1 = MPI_Wtime();
+
   // clear ghost count and any ghost bonus data internal to AtomVec
   // same logic as beginning of Comm::exchange()
   // do it now b/c creating atoms will overwrite ghost atoms
@@ -509,6 +514,9 @@ void CreateAtoms::command(int narg, char **arg)
     if (domain->triclinic) domain->lamda2x(atom->nlocal);
   }
 
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
   // clean up
 
   delete ranmol;
@@ -521,12 +529,16 @@ void CreateAtoms::command(int narg, char **arg)
   // print status
 
   if (comm->me == 0) {
-    if (screen)
+    if (screen) {
       fprintf(screen,"Created " BIGINT_FORMAT " atoms\n",
               atom->natoms-natoms_previous);
-    if (logfile)
+      fprintf(screen,"  CPU time = %g secs\n",time2-time1);
+    }
+    if (logfile) {
       fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n",
               atom->natoms-natoms_previous);
+      fprintf(logfile,"  CPU time = %g secs\n",time2-time1);
+    }
   }
 
   // for MOLECULE mode:
diff --git a/src/output.cpp b/src/output.cpp
index ce7fcb7cca..ce593ec6ae 100644
--- a/src/output.cpp
+++ b/src/output.cpp
@@ -827,9 +827,9 @@ void Output::create_restart(int narg, char **arg)
    sum and print memory usage
    result is only memory on proc 0, not averaged across procs
 ------------------------------------------------------------------------- */
+
 void Output::memory_usage()
 {
-
   bigint bytes = 0;
   bytes += atom->memory_usage();
   bytes += neighbor->memory_usage();
diff --git a/src/replicate.cpp b/src/replicate.cpp
index e2ed718f65..9c1a271be2 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -74,6 +74,11 @@ void Replicate::command(int narg, char **arg)
   if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store)
     error->all(FLERR,"Cannot replicate with fixes that store atom quantities");
 
+  // CPU time
+
+  MPI_Barrier(world);
+  double time1 = MPI_Wtime();
+
   // maxtag = largest atom tag across all existing atoms
 
   tagint maxtag = 0;
@@ -424,4 +429,16 @@ void Replicate::command(int narg, char **arg)
     Special special(lmp);
     special.build();
   }
+
+  // CPU time
+
+  MPI_Barrier(world);
+  double time2 = MPI_Wtime();
+
+  if (me == 0) {
+    if (screen)
+      fprintf(screen,"  CPU time = %g secs\n",time2-time1);
+    if (logfile)
+      fprintf(logfile,"  CPU time = %g secs\n",time2-time1);
+  }
 }

From dc0e20947ee70a6ecaa98726a9fbe76fdab31953 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Fri, 6 Oct 2017 16:37:52 +0200
Subject: [PATCH 50/53] MAINT: Return error when 'at' keyword is used without
 'append yes'.

---
 src/USER-NETCDF/dump_netcdf.cpp       | 2 ++
 src/USER-NETCDF/dump_netcdf_mpiio.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index 7156b773b3..a9532d1077 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -924,6 +924,8 @@ int DumpNetCDF::modify_param(int narg, char **arg)
     return 2;
   }
   else if (strcmp(arg[iarg],"at") == 0) {
+    if (!append_flag)
+      error->all(FLERR,"expected 'append yes' before 'at' keyword");
     iarg++;
     framei = force->inumeric(FLERR,arg[iarg]);
     if (framei < 0)  framei--;
diff --git a/src/USER-NETCDF/dump_netcdf_mpiio.cpp b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
index 29c2b6cb1f..746b904655 100644
--- a/src/USER-NETCDF/dump_netcdf_mpiio.cpp
+++ b/src/USER-NETCDF/dump_netcdf_mpiio.cpp
@@ -920,6 +920,8 @@ int DumpNetCDFMPIIO::modify_param(int narg, char **arg)
     return 2;
   }
   else if (strcmp(arg[iarg],"at") == 0) {
+    if (!append_flag)
+      error->all(FLERR,"expected 'append yes' before 'at' keyword");
     iarg++;
     framei = force->inumeric(FLERR,arg[iarg]);
     if (framei < 0)  framei--;

From 352a20fc1cb775a79fc23dfc3a56d05b5de664f4 Mon Sep 17 00:00:00 2001
From: Lars Pastewka <lars.pastewka@imtek.uni-freiburg.de>
Date: Fri, 6 Oct 2017 16:38:15 +0200
Subject: [PATCH 51/53] DOC: Updated doc to separate description of 'append'
 and 'at' keywords.

---
 doc/src/dump_modify.txt | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/src/dump_modify.txt b/doc/src/dump_modify.txt
index 6ccf40a8c5..38d9aad4d9 100644
--- a/doc/src/dump_modify.txt
+++ b/doc/src/dump_modify.txt
@@ -15,9 +15,11 @@ dump_modify dump-ID keyword values ... :pre
 dump-ID = ID of dump to modify :ulb,l
 one or more keyword/value pairs may be appended :l
 these keywords apply to various dump styles :l
-keyword = {append} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
-  {append} arg = {yes} or {no} or {yes at} N
+keyword = {append} or {at} or {buffer} or {element} or {every} or {fileper} or {first} or {flush} or {format} or {image} or {label} or {nfile} or {pad} or {precision} or {region} or {scale} or {sort} or {thresh} or {unwrap} :l
+  {append} arg = {yes} or {no}
+  {at} arg = N
     N = index of frame written upon first dump
+    only available after "append yes"
   {buffer} arg = {yes} or {no}
   {element} args = E1 E2 ... EN, where N = # of atom types
     E1,...,EN = element name, e.g. C or Fe or Ga

From 58e1969de2413a96b57084cf1ffb7d86e54997c0 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 6 Oct 2017 14:34:10 -0400
Subject: [PATCH 52/53] rename misleading "CPU time" into "Time spent"

---
 src/create_atoms.cpp | 6 +++---
 src/replicate.cpp    | 8 ++++----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/create_atoms.cpp b/src/create_atoms.cpp
index 444b0c5bcd..992049a81f 100644
--- a/src/create_atoms.cpp
+++ b/src/create_atoms.cpp
@@ -343,7 +343,7 @@ void CreateAtoms::command(int narg, char **arg)
     }
   }
 
-  // CPU time
+  // Record wall time for atom creation
 
   MPI_Barrier(world);
   double time1 = MPI_Wtime();
@@ -532,12 +532,12 @@ void CreateAtoms::command(int narg, char **arg)
     if (screen) {
       fprintf(screen,"Created " BIGINT_FORMAT " atoms\n",
               atom->natoms-natoms_previous);
-      fprintf(screen,"  CPU time = %g secs\n",time2-time1);
+      fprintf(screen,"  Time spent = %g secs\n",time2-time1);
     }
     if (logfile) {
       fprintf(logfile,"Created " BIGINT_FORMAT " atoms\n",
               atom->natoms-natoms_previous);
-      fprintf(logfile,"  CPU time = %g secs\n",time2-time1);
+      fprintf(logfile,"  Time spent = %g secs\n",time2-time1);
     }
   }
 
diff --git a/src/replicate.cpp b/src/replicate.cpp
index 9c1a271be2..f3d1964169 100644
--- a/src/replicate.cpp
+++ b/src/replicate.cpp
@@ -74,7 +74,7 @@ void Replicate::command(int narg, char **arg)
   if (atom->nextra_grow || atom->nextra_restart || atom->nextra_store)
     error->all(FLERR,"Cannot replicate with fixes that store atom quantities");
 
-  // CPU time
+  // Record wall time for atom replication
 
   MPI_Barrier(world);
   double time1 = MPI_Wtime();
@@ -430,15 +430,15 @@ void Replicate::command(int narg, char **arg)
     special.build();
   }
 
-  // CPU time
+  // Wall time
 
   MPI_Barrier(world);
   double time2 = MPI_Wtime();
 
   if (me == 0) {
     if (screen)
-      fprintf(screen,"  CPU time = %g secs\n",time2-time1);
+      fprintf(screen,"  Time spent = %g secs\n",time2-time1);
     if (logfile)
-      fprintf(logfile,"  CPU time = %g secs\n",time2-time1);
+      fprintf(logfile,"  Time spent = %g secs\n",time2-time1);
   }
 }

From 6820db99e22fedd7cdda297ad3897436ea04ee54 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 6 Oct 2017 14:41:38 -0400
Subject: [PATCH 53/53] avoid merge conflict

---
 doc/src/Section_packages.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index b0b2d9fa63..7539d99cd0 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -727,7 +727,8 @@ make lib-latte                          # print help message
 make lib-latte args="-b"                # download and build in lib/latte/LATTE-master
 make lib-latte args="-p $HOME/latte"    # use existing LATTE installation in $HOME/latte
 make lib-latte args="-b -m gfortran"    # download and build in lib/latte and 
-                                        #   copy Makefile.lammps.gfortran to Makefile.lammps :pre
+                                        #   copy Makefile.lammps.gfortran to Makefile.lammps
+:pre
 
 Note that 3 symbolic (soft) links, "includelink" and "liblink" and
 "filelink", are created in lib/latte to point into the LATTE home dir.