diff --git a/doc/src/fix_dpd_energy.txt b/doc/src/fix_dpd_energy.txt
index ed49e5a671..1c10d954d6 100644
--- a/doc/src/fix_dpd_energy.txt
+++ b/doc/src/fix_dpd_energy.txt
@@ -7,6 +7,7 @@
 :line
 
 fix dpd/energy command :h3
+fix dpd/energy/kk command :h3
 
 [Syntax:]
 
@@ -46,6 +47,29 @@ examples/USER/dpd directory.
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_eos_table_rx.txt b/doc/src/fix_eos_table_rx.txt
index e5e4f772f6..0c87874347 100644
--- a/doc/src/fix_eos_table_rx.txt
+++ b/doc/src/fix_eos_table_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 fix eos/table/rx command :h3
+fix eos/table/rx/kk command :h3
 
 [Syntax:]
 
@@ -152,6 +153,29 @@ no      0.93 0.00 0.000 -1.76 :pre
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_rx.txt b/doc/src/fix_rx.txt
index 6a800f3865..0810a34740 100644
--- a/doc/src/fix_rx.txt
+++ b/doc/src/fix_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 fix rx command :h3
+fix rx/kk command :h3
 
 [Syntax:]
 
@@ -182,6 +183,29 @@ read_data    data.dpd fix foo_SPECIES NULL Species
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_shardlow.txt b/doc/src/fix_shardlow.txt
index 8354b4c41c..24726d8610 100644
--- a/doc/src/fix_shardlow.txt
+++ b/doc/src/fix_shardlow.txt
@@ -7,6 +7,7 @@
 :line
 
 fix shardlow command :h3
+fix shardlow/kk command :h3
 
 [Syntax:]
 
@@ -52,6 +53,29 @@ examples/USER/dpd directory.
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/fix_wall.txt b/doc/src/fix_wall.txt
index 6d76956620..6bbfccf9db 100644
--- a/doc/src/fix_wall.txt
+++ b/doc/src/fix_wall.txt
@@ -7,6 +7,7 @@
 :line
 
 fix wall/lj93 command :h3
+fix wall/lj93/kk command :h3
 fix wall/lj126 command :h3
 fix wall/lj1043 command :h3
 fix wall/colloid command :h3
@@ -277,6 +278,31 @@ the total potential energy of the system (the quantity being
 minimized), you MUST enable the "fix_modify"_fix_modify.html {energy}
 option for this fix.
 
+:line
+
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:] none
 
 [Related commands:]
diff --git a/doc/src/pair_dpd_fdt.txt b/doc/src/pair_dpd_fdt.txt
index b75e7c323c..867f3f2315 100644
--- a/doc/src/pair_dpd_fdt.txt
+++ b/doc/src/pair_dpd_fdt.txt
@@ -8,6 +8,7 @@
 
 pair_style dpd/fdt command :h3
 pair_style dpd/fdt/energy command :h3
+pair_style dpd/fdt/energy/kk command :h3
 
 [Syntax:]
 
@@ -125,6 +126,29 @@ significantly larger timesteps to be taken.
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 These commands are part of the USER-DPD package.  They are only
diff --git a/doc/src/pair_exp6_rx.txt b/doc/src/pair_exp6_rx.txt
index cbc17d357d..7eafa23543 100644
--- a/doc/src/pair_exp6_rx.txt
+++ b/doc/src/pair_exp6_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style exp6/rx command :h3
+pair_style exp6/rx/kk command :h3
 
 [Syntax:]
 
@@ -147,6 +148,31 @@ This style does not support the pair_modify tail option for adding long-range
 tail corrections to energy and pressure for the A,C terms in the
 pair interaction.
 
+:line
+
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/pair_hybrid.txt b/doc/src/pair_hybrid.txt
index fc1824cf62..d37dedc709 100644
--- a/doc/src/pair_hybrid.txt
+++ b/doc/src/pair_hybrid.txt
@@ -10,6 +10,7 @@ pair_style hybrid command :h3
 pair_style hybrid/omp command :h3
 pair_style hybrid/overlay command :h3
 pair_style hybrid/overlay/omp command :h3
+pair_style hybrid/overlay/kk command :h3
 
 [Syntax:]
 
diff --git a/doc/src/pair_multi_lucy_rx.txt b/doc/src/pair_multi_lucy_rx.txt
index 77ed223e2a..57abcf4a4c 100644
--- a/doc/src/pair_multi_lucy_rx.txt
+++ b/doc/src/pair_multi_lucy_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style multi/lucy/rx command :h3
+pair_style multi/lucy/rx/kk command :h3
 
 [Syntax:]
 
@@ -200,6 +201,29 @@ This pair style can only be used via the {pair} keyword of the
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/doc/src/pair_table_rx.txt b/doc/src/pair_table_rx.txt
index f93af21da4..cd3a7ef31b 100644
--- a/doc/src/pair_table_rx.txt
+++ b/doc/src/pair_table_rx.txt
@@ -7,6 +7,7 @@
 :line
 
 pair_style table/rx command :h3
+pair_style table/rx/kk command :h3
 
 [Syntax:]
 
@@ -223,6 +224,29 @@ This pair style can only be used via the {pair} keyword of the
 
 :line
 
+Styles with a {gpu}, {intel}, {kk}, {omp}, or {opt} suffix are
+functionally the same as the corresponding style without the suffix.
+They have been optimized to run faster, depending on your available
+hardware, as discussed in "Section 5"_Section_accelerate.html
+of the manual.  The accelerated styles take the same arguments and
+should produce the same results, except for round-off and precision
+issues.
+
+These accelerated styles are part of the GPU, USER-INTEL, KOKKOS,
+USER-OMP and OPT packages, respectively.  They are only enabled if
+LAMMPS was built with those packages.  See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+You can specify the accelerated styles explicitly in your input script
+by including their suffix, or you can use the "-suffix command-line
+switch"_Section_start.html#start_6 when you invoke LAMMPS, or you can
+use the "suffix"_suffix.html command in your input script.
+
+See "Section 5"_Section_accelerate.html of the manual for
+more instructions on how to use the accelerated styles effectively.
+
+:line
+
 [Restrictions:]
 
 This command is part of the USER-DPD package.  It is only enabled if
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
index 42c115b7a5..9082e47052 100644
--- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -755,6 +755,12 @@ namespace Kokkos {
       return Random_XorShift64<DeviceType>(state_(i),i);
     }
 
+    // Return the generator stored in slot state_idx; state_idx MUST be unique and less than num_states
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64<DeviceType> get_state(const int state_idx) const {
+      return Random_XorShift64<DeviceType>(state_(state_idx),state_idx);  // built directly from state_(state_idx); the slot is chosen by the caller
+    }
+
     KOKKOS_INLINE_FUNCTION
     void free_state(const Random_XorShift64<DeviceType>& state) const {
       state_(state.state_idx_) = state.state_;
@@ -1010,6 +1016,12 @@ namespace Kokkos {
       return Random_XorShift1024<DeviceType>(state_,p_(i),i);
     };
 
+    // Return the generator stored in slot state_idx; state_idx MUST be unique and less than num_states
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024<DeviceType> get_state(const int state_idx) const {
+      return Random_XorShift1024<DeviceType>(state_,p_(state_idx),state_idx);  // built from the pool's state_ and this slot's p_ entry; the slot is chosen by the caller
+    }
+
     KOKKOS_INLINE_FUNCTION
     void free_state(const Random_XorShift1024<DeviceType>& state) const {
       for(int i = 0; i<16; i++)
@@ -1208,8 +1220,8 @@ Random_XorShift64<Kokkos::Cuda> Random_XorShift64_Pool<Kokkos::Cuda>::get_state(
 template<>
 KOKKOS_INLINE_FUNCTION
 void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(const Random_XorShift64<Kokkos::Cuda> &state) const {
-#ifdef __CUDA_ARCH__
   state_(state.state_idx_) = state.state_;
+#ifdef __CUDA_ARCH__
   locks_(state.state_idx_) = 0;
   return;
 #endif
@@ -1244,9 +1256,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
 template<>
 KOKKOS_INLINE_FUNCTION
 void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift1024<Kokkos::Cuda> &state) const {
-#ifdef __CUDA_ARCH__
   for(int i=0; i<16; i++)
     state_(state.state_idx_,i) = state.state_[i];
+#ifdef __CUDA_ARCH__
   locks_(state.state_idx_) = 0;
   return;
 #endif
diff --git a/src/Depend.sh b/src/Depend.sh
index 0962dace51..9463607960 100644
--- a/src/Depend.sh
+++ b/src/Depend.sh
@@ -115,6 +115,10 @@ if (test $1 = "USER-CGSDK") then
   depend USER-OMP
 fi
 
+if (test $1 = "USER-DPD") then
+  depend KOKKOS   # NOTE(review): presumably needed because src/KOKKOS/Install.sh now pairs KOKKOS files with USER-DPD sources - confirm
+fi
+
 if (test $1 = "USER-FEP") then
   depend USER-OMP
 fi
diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 790b9224c2..df5fc3e5f1 100644
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -49,8 +49,12 @@ action atom_vec_bond_kokkos.cpp atom_vec_bond.cpp
 action atom_vec_bond_kokkos.h atom_vec_bond.h
 action atom_vec_charge_kokkos.cpp
 action atom_vec_charge_kokkos.h
+action atom_vec_dpd_kokkos.cpp atom_vec_dpd.cpp
+action atom_vec_dpd_kokkos.h atom_vec_dpd.h
 action atom_vec_full_kokkos.cpp atom_vec_full.cpp
 action atom_vec_full_kokkos.h atom_vec_full.h
+action atom_vec_hybrid_kokkos.cpp
+action atom_vec_hybrid_kokkos.h
 action atom_vec_kokkos.cpp
 action atom_vec_kokkos.h
 action atom_vec_molecular_kokkos.cpp atom_vec_molecular.cpp
@@ -77,6 +81,8 @@ action domain_kokkos.cpp
 action domain_kokkos.h
 action fix_deform_kokkos.cpp
 action fix_deform_kokkos.h
+action fix_eos_table_rx_kokkos.cpp fix_eos_table_rx.cpp
+action fix_eos_table_rx_kokkos.h fix_eos_table_rx.h  
 action fix_langevin_kokkos.cpp
 action fix_langevin_kokkos.h
 action fix_nh_kokkos.cpp
@@ -89,6 +95,8 @@ action fix_nve_kokkos.cpp
 action fix_nve_kokkos.h
 action fix_nvt_kokkos.cpp
 action fix_nvt_kokkos.h
+action fix_property_atom_kokkos.cpp
+action fix_property_atom_kokkos.h
 action fix_qeq_reax_kokkos.cpp fix_qeq_reax.cpp
 action fix_qeq_reax_kokkos.h fix_qeq_reax.h
 action fix_reaxc_bonds_kokkos.cpp fix_reaxc_bonds.cpp
@@ -97,10 +105,18 @@ action fix_reaxc_species_kokkos.cpp fix_reaxc_species.cpp
 action fix_reaxc_species_kokkos.h fix_reaxc_species.h
 action fix_setforce_kokkos.cpp
 action fix_setforce_kokkos.h
+action fix_shardlow_kokkos.cpp fix_shardlow.cpp
+action fix_shardlow_kokkos.h fix_shardlow.h
 action fix_momentum_kokkos.cpp
 action fix_momentum_kokkos.h
+action fix_wall_lj93_kokkos.cpp
+action fix_wall_lj93_kokkos.h
 action fix_wall_reflect_kokkos.cpp
 action fix_wall_reflect_kokkos.h
+action fix_dpd_energy_kokkos.cpp fix_dpd_energy.cpp
+action fix_dpd_energy_kokkos.h fix_dpd_energy.h
+action fix_rx_kokkos.cpp fix_rx.cpp
+action fix_rx_kokkos.h fix_rx.h
 action gridcomm_kokkos.cpp gridcomm.cpp
 action gridcomm_kokkos.h gridcomm.h
 action improper_class2_kokkos.cpp improper_class2.cpp 
@@ -124,8 +140,12 @@ action npair_copy_kokkos.cpp
 action npair_copy_kokkos.h
 action npair_kokkos.cpp
 action npair_kokkos.h
+action npair_ssa_kokkos.cpp npair_half_bin_newton_ssa.cpp
+action npair_ssa_kokkos.h npair_half_bin_newton_ssa.h
 action nbin_kokkos.cpp
 action nbin_kokkos.h
+action nbin_ssa_kokkos.cpp nbin_ssa.cpp
+action nbin_ssa_kokkos.h nbin_ssa.h
 action math_special_kokkos.cpp
 action math_special_kokkos.h
 action pair_buck_coul_cut_kokkos.cpp
@@ -144,12 +164,20 @@ action pair_coul_long_kokkos.cpp pair_coul_long.cpp
 action pair_coul_long_kokkos.h pair_coul_long.h
 action pair_coul_wolf_kokkos.cpp
 action pair_coul_wolf_kokkos.h
+action pair_dpd_fdt_energy_kokkos.cpp pair_dpd_fdt_energy.cpp
+action pair_dpd_fdt_energy_kokkos.h pair_dpd_fdt_energy.h
 action pair_eam_kokkos.cpp pair_eam.cpp
 action pair_eam_kokkos.h pair_eam.h
 action pair_eam_alloy_kokkos.cpp pair_eam_alloy.cpp
 action pair_eam_alloy_kokkos.h pair_eam_alloy.h
 action pair_eam_fs_kokkos.cpp pair_eam_fs.cpp
 action pair_eam_fs_kokkos.h pair_eam_fs.h
+action pair_exp6_rx_kokkos.cpp pair_exp6_rx.cpp
+action pair_exp6_rx_kokkos.h pair_exp6_rx.h
+action pair_hybrid_kokkos.cpp
+action pair_hybrid_kokkos.h
+action pair_hybrid_overlay_kokkos.cpp
+action pair_hybrid_overlay_kokkos.h
 action pair_kokkos.h
 action pair_lj_charmm_coul_charmm_implicit_kokkos.cpp pair_lj_charmm_coul_charmm_implicit.cpp
 action pair_lj_charmm_coul_charmm_implicit_kokkos.h pair_lj_charmm_coul_charmm_implicit.h
@@ -183,6 +211,8 @@ action pair_lj_sdk_kokkos.cpp pair_lj_sdk.cpp
 action pair_lj_sdk_kokkos.h pair_lj_sdk.h
 action pair_morse_kokkos.cpp
 action pair_morse_kokkos.h
+action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp
+action pair_multi_lucy_rx_kokkos.h pair_multi_lucy_rx.h
 action pair_reaxc_kokkos.cpp pair_reaxc.cpp
 action pair_reaxc_kokkos.h pair_reaxc.h
 action pair_sw_kokkos.cpp pair_sw.cpp
@@ -191,6 +221,8 @@ action pair_vashishta_kokkos.cpp pair_vashishta.cpp
 action pair_vashishta_kokkos.h pair_vashishta.h
 action pair_table_kokkos.cpp
 action pair_table_kokkos.h
+action pair_table_rx_kokkos.cpp pair_table_rx.cpp
+action pair_table_rx_kokkos.h pair_table_rx.h
 action pair_tersoff_kokkos.cpp pair_tersoff.cpp
 action pair_tersoff_kokkos.h pair_tersoff.h
 action pair_tersoff_mod_kokkos.cpp pair_tersoff_mod.cpp
@@ -199,6 +231,8 @@ action pair_tersoff_zbl_kokkos.cpp pair_tersoff_zbl.cpp
 action pair_tersoff_zbl_kokkos.h pair_tersoff_zbl.h
 action pppm_kokkos.cpp pppm.cpp
 action pppm_kokkos.h pppm.h
+action rand_pool_wrap_kokkos.cpp
+action rand_pool_wrap_kokkos.h
 action region_block_kokkos.cpp
 action region_block_kokkos.h
 action verlet_kokkos.cpp
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index 577eff2364..31b33dbdc9 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -49,6 +49,7 @@ AtomKokkos::~AtomKokkos()
   memory->destroy_kokkos(k_radius, radius);
   memory->destroy_kokkos(k_rmass, rmass);
   memory->destroy_kokkos(k_omega, omega);
+  memory->destroy_kokkos(k_angmom, angmom);
   memory->destroy_kokkos(k_torque, torque);
 
   memory->destroy_kokkos(k_nspecial, nspecial);
@@ -73,6 +74,19 @@ AtomKokkos::~AtomKokkos()
   memory->destroy_kokkos(k_improper_atom2, improper_atom2);
   memory->destroy_kokkos(k_improper_atom3, improper_atom3);
   memory->destroy_kokkos(k_improper_atom4, improper_atom4);
+
+  // USER-DPD package
+  memory->destroy_kokkos(k_uCond,uCond);
+  memory->destroy_kokkos(k_uMech,uMech);
+  memory->destroy_kokkos(k_uChem,uChem);
+  memory->destroy_kokkos(k_uCG,uCG);
+  memory->destroy_kokkos(k_uCGnew,uCGnew);
+  memory->destroy_kokkos(k_rho,rho);
+  memory->destroy_kokkos(k_dpdTheta,dpdTheta);
+  memory->destroy_kokkos(k_duChem,duChem);
+
+  memory->destroy_kokkos(k_dvector,dvector);
+  dvector = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -227,6 +241,63 @@ void AtomKokkos::grow(unsigned int mask){
   }
 }
 
+/* ----------------------------------------------------------------------
+   add a custom per-atom vector called name; flag = 0 for int, else double
+   assumes name does not already exist (no duplicate check is done here)
+   return index of the new vector in ivector (flag 0) or dvector (otherwise)
+------------------------------------------------------------------------- */
+
+int AtomKokkos::add_custom(const char *name, int flag)
+{
+  int index;
+
+  if (flag == 0) {
+    index = nivector;  // the new int vector is appended at the end of the list
+    nivector++;
+    iname = (char **) memory->srealloc(iname,nivector*sizeof(char *),
+                                       "atom:iname");  // room for one more name pointer
+    int n = strlen(name) + 1;  // +1 for the terminating NUL
+    iname[index] = new char[n];
+    strcpy(iname[index],name);
+    ivector = (int **) memory->srealloc(ivector,nivector*sizeof(int *),
+                                        "atom:ivector");  // room for one more vector pointer
+    memory->create(ivector[index],nmax,"atom:ivector");  // storage of length nmax for the new vector
+  } else {
+    index = ndvector;  // the new double vector is appended at the end of the list
+    ndvector++;
+    dname = (char **) memory->srealloc(dname,ndvector*sizeof(char *),
+                                       "atom:dname");  // room for one more name pointer
+    int n = strlen(name) + 1;  // +1 for the terminating NUL
+    dname[index] = new char[n];
+    strcpy(dname[index],name);
+    memory->grow_kokkos(k_dvector,dvector,ndvector,nmax,
+                        "atom:dvector");  // dims ndvector x nmax; presumably regrows k_dvector and dvector together - confirm in grow_kokkos
+  }
+
+  return index;
+}
+
+/* ----------------------------------------------------------------------
+   remove a custom variable of type flag = 0/1 for int/double at index
+   free the name (and, for int only, the vector) and set ptrs to NULL
+   ivector/dvector and iname/dname lists never shrink
+------------------------------------------------------------------------- */
+
+void AtomKokkos::remove_custom(int flag, int index)
+{
+  if (flag == 0) {
+    memory->destroy(ivector[index]);  // int vectors are allocated one at a time, so this one can be freed here
+    ivector[index] = NULL;
+    delete [] iname[index];
+    iname[index] = NULL;
+  } else {
+    // NOTE(review): no destroy_kokkos call here; k_dvector/dvector are destroyed in the AtomKokkos destructor
+    dvector[index] = NULL;  // only the pointer for this entry is cleared
+    delete [] dname[index];
+    dname[index] = NULL;
+  }
+}
+
 /* ---------------------------------------------------------------------- */
 
 void AtomKokkos::deallocate_topology()
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index 05aae712d9..2245023189 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -34,6 +34,7 @@ class AtomKokkos : public Atom {
   DAT::tdual_float_1d k_radius;
   DAT::tdual_float_1d k_rmass;
   DAT::tdual_v_array k_omega;
+  DAT::tdual_v_array k_angmom;
   DAT::tdual_f_array k_torque;
   DAT::tdual_tagint_1d k_molecule;
   DAT::tdual_int_2d k_nspecial;
@@ -51,6 +52,14 @@ class AtomKokkos : public Atom {
   DAT::tdual_int_2d k_improper_type;
   DAT::tdual_tagint_2d k_improper_atom1, k_improper_atom2, k_improper_atom3, k_improper_atom4;
 
+  DAT::tdual_float_2d k_dvector;
+
+
+// USER-DPD package
+  DAT::tdual_efloat_1d k_uCond, k_uMech, k_uChem, k_uCG, k_uCGnew,
+                       k_rho,k_dpdTheta,k_duChem;
+
+
   AtomKokkos(class LAMMPS *);
   ~AtomKokkos();
 
@@ -60,6 +69,8 @@ class AtomKokkos : public Atom {
   void sync_overlapping_device(const ExecutionSpace space, unsigned int mask);
   virtual void sort();
   virtual void grow(unsigned int mask);
+  int add_custom(const char *, int);
+  void remove_custom(int, int);
   virtual void deallocate_topology();
   void sync_modify(ExecutionSpace, unsigned int, unsigned int);
  private:
diff --git a/src/KOKKOS/atom_vec_angle_kokkos.cpp b/src/KOKKOS/atom_vec_angle_kokkos.cpp
index 34b868aadc..05414cf2e4 100644
--- a/src/KOKKOS/atom_vec_angle_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_angle_kokkos.cpp
@@ -308,7 +308,6 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -336,7 +335,6 @@ int AtomVecAngleKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
   return n*size_forward;
@@ -430,7 +428,6 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -463,7 +460,6 @@ int AtomVecAngleKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &li
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -501,13 +497,11 @@ void AtomVecAngleKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecAngleKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecAngleKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -753,13 +747,11 @@ int AtomVecAngleKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAngleKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -769,13 +761,11 @@ int AtomVecAngleKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAngleKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -977,12 +967,10 @@ void AtomVecAngleKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecAngleKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecAngleKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1241,13 +1229,11 @@ int AtomVecAngleKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_
     AtomVecAngleKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecAngleKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1405,7 +1391,6 @@ int AtomVecAngleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int n
     AtomVecAngleKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1414,7 +1399,6 @@ int AtomVecAngleKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int n
     AtomVecAngleKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_atomic_kokkos.cpp b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
index d040bd3553..b63dc5fb8c 100644
--- a/src/KOKKOS/atom_vec_atomic_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_atomic_kokkos.cpp
@@ -224,7 +224,6 @@ int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -252,7 +251,6 @@ int AtomVecAtomicKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -340,7 +338,6 @@ int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -369,7 +366,6 @@ int AtomVecAtomicKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -407,13 +403,11 @@ void AtomVecAtomicKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecAtomicKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecAtomicKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -655,13 +649,11 @@ int AtomVecAtomicKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAtomicKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -671,13 +663,11 @@ int AtomVecAtomicKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecAtomicKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*6;
@@ -853,11 +843,9 @@ void AtomVecAtomicKokkos::unpack_border_kokkos(const int &n, const int &first,
   if(space==Host) {
     struct AtomVecAtomicKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecAtomicKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1009,12 +997,10 @@ int AtomVecAtomicKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat
   if(space == Host) {
     AtomVecAtomicKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*11;
   } else {
     AtomVecAtomicKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*11;
   }
 }
@@ -1106,7 +1092,6 @@ int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     k_count.h_view(0) = nlocal;
     AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/11,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1114,7 +1099,6 @@ int AtomVecAtomicKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     k_count.sync<LMPDeviceType>();
     AtomVecAtomicKokkos_UnpackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/11,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_bond_kokkos.cpp b/src/KOKKOS/atom_vec_bond_kokkos.cpp
index c46c49cb29..e0f29a27bb 100644
--- a/src/KOKKOS/atom_vec_bond_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_bond_kokkos.cpp
@@ -266,7 +266,6 @@ int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -294,7 +293,6 @@ int AtomVecBondKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -382,7 +380,6 @@ int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -411,7 +408,6 @@ int AtomVecBondKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -449,13 +445,11 @@ void AtomVecBondKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecBondKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecBondKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -701,13 +695,11 @@ int AtomVecBondKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecBondKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -717,13 +709,11 @@ int AtomVecBondKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecBondKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -925,12 +915,10 @@ void AtomVecBondKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecBondKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecBondKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1157,13 +1145,11 @@ int AtomVecBondKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2
     AtomVecBondKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecBondKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1299,7 +1285,6 @@ int AtomVecBondKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecBondKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1308,7 +1293,6 @@ int AtomVecBondKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecBondKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_charge_kokkos.cpp b/src/KOKKOS/atom_vec_charge_kokkos.cpp
index 856660d1e9..89f7e91c2b 100644
--- a/src/KOKKOS/atom_vec_charge_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_charge_kokkos.cpp
@@ -236,7 +236,6 @@ int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -264,7 +263,6 @@ int AtomVecChargeKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -352,7 +350,6 @@ int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -381,7 +378,6 @@ int AtomVecChargeKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -419,13 +415,11 @@ void AtomVecChargeKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecChargeKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecChargeKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -669,13 +663,11 @@ int AtomVecChargeKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecChargeKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -685,13 +677,11 @@ int AtomVecChargeKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecChargeKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -890,12 +880,10 @@ void AtomVecChargeKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecChargeKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_q,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecChargeKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_q,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
   modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|Q_MASK);
 }
@@ -1078,13 +1066,11 @@ int AtomVecChargeKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat
     AtomVecChargeKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*12;
   } else {
     AtomVecChargeKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*12;
   }
 }
@@ -1181,7 +1167,6 @@ int AtomVecChargeKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     k_count.h_view(0) = nlocal;
     AtomVecChargeKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/12,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1190,7 +1175,6 @@ int AtomVecChargeKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int
     AtomVecChargeKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/12,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.cpp b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
new file mode 100644
index 0000000000..c4e493bd85
--- /dev/null
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.cpp
@@ -0,0 +1,2002 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include "atom_vec_dpd_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "memory.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+#define DELTA 10000
+
+/* ---------------------------------------------------------------------- */
+
+AtomVecDPDKokkos::AtomVecDPDKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)  // set the DPD atom-style flags and per-atom comm sizes
+{
+  molecular = 0;
+  mass_type = 1;
+
+  comm_x_only = comm_f_only = 0;
+  size_forward = 7;  // pack_comm writes 7 values per atom: x,y,z,dpdTheta,uCond,uMech,uChem
+  size_reverse = 3;
+  size_border = 12;
+  size_velocity = 3;
+  size_data_atom = 6;
+  size_data_vel = 4;
+  xcol_data = 4;
+
+  atom->rho_flag = 1;
+  atom->dpd_flag = 1;
+
+  k_count = DAT::tdual_int_1d("atom::k_count",1);  // dual view constructed with length 1
+  atomKK = (AtomKokkos *) atom;  // atom and comm cast to their Kokkos-specific classes
+  commKK = (CommKokkos *) comm;
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by DELTA
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::grow(int n)  // resize every per-atom array of this style to the new nmax
+{
+  if (n == 0) nmax += DELTA;  // n == 0 means "grow by DELTA" (10000)
+  else nmax = n;  // otherwise n is the new capacity
+  atomKK->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)  // a negative value here would indicate integer overflow
+    error->one(FLERR,"Per-processor system is too big");
+
+  sync(Device,ALL_MASK);  // NOTE(review): presumably makes device copies current before reallocating — confirm
+  modified(Device,ALL_MASK);
+
+  memory->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
+  memory->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
+  memory->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
+  memory->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
+
+  memory->grow_kokkos(atomKK->k_x,atomKK->x,nmax,3,"atom:x");  // nmax x 3 arrays
+  memory->grow_kokkos(atomKK->k_v,atomKK->v,nmax,3,"atom:v");
+  memory->grow_kokkos(atomKK->k_f,atomKK->f,nmax,3,"atom:f");
+
+
+  memory->grow_kokkos(atomKK->k_rho,atomKK->rho,nmax,"atom:rho");  // DPD-specific per-atom scalars
+  memory->grow_kokkos(atomKK->k_dpdTheta,atomKK->dpdTheta,nmax,"atom:dpdTheta");
+  memory->grow_kokkos(atomKK->k_uCond,atomKK->uCond,nmax,"atom:uCond");
+  memory->grow_kokkos(atomKK->k_uMech,atomKK->uMech,nmax,"atom:uMech");
+  memory->grow_kokkos(atomKK->k_uChem,atomKK->uChem,nmax,"atom:uChem");
+  memory->grow_kokkos(atomKK->k_uCG,atomKK->uCG,nmax,"atom:uCG");
+  memory->grow_kokkos(atomKK->k_uCGnew,atomKK->uCGnew,nmax,"atom:uCGnew");
+  memory->grow_kokkos(atomKK->k_duChem,atomKK->duChem,nmax,"atom:duChem");
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);  // each registered fix grows its own arrays
+
+  grow_reset();  // refresh the cached pointers/views after reallocation
+  sync(Host,ALL_MASK);
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::grow_reset()  // re-cache, for every field, the raw pointer plus its d_view and h_view from atomKK
+{
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+
+  rho = atomKK->rho;  // DPD-specific fields follow the same pointer / d_view / h_view pattern
+  d_rho = atomKK->k_rho.d_view;
+  h_rho = atomKK->k_rho.h_view;
+  dpdTheta = atomKK->dpdTheta;
+  d_dpdTheta = atomKK->k_dpdTheta.d_view;
+  h_dpdTheta = atomKK->k_dpdTheta.h_view;
+  uCond = atomKK->uCond;
+  d_uCond = atomKK->k_uCond.d_view;
+  h_uCond = atomKK->k_uCond.h_view;
+  uMech = atomKK->uMech;
+  d_uMech = atomKK->k_uMech.d_view;
+  h_uMech = atomKK->k_uMech.h_view;
+  uChem = atomKK->uChem;
+  d_uChem = atomKK->k_uChem.d_view;
+  h_uChem = atomKK->k_uChem.h_view;
+  uCG = atomKK->uCG;
+  d_uCG = atomKK->k_uCG.d_view;
+  h_uCG = atomKK->k_uCG.h_view;
+  uCGnew = atomKK->uCGnew;
+  d_uCGnew = atomKK->k_uCGnew.d_view;
+  h_uCGnew = atomKK->k_uCGnew.h_view;
+  duChem = atomKK->duChem;
+  d_duChem = atomKK->k_duChem.d_view;
+  h_duChem = atomKK->k_duChem.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::copy(int i, int j, int delflag)  // copy atom i's per-atom data into slot j using the host views
+{
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+            UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
+
+  h_tag[j] = h_tag[i];
+  h_type[j] = h_type[i];
+  mask[j] = mask[i];  // NOTE(review): raw pointer here while neighbours use h_* views — presumably the same host storage; confirm
+  h_image[j] = h_image[i];
+  h_x(j,0) = h_x(i,0);
+  h_x(j,1) = h_x(i,1);
+  h_x(j,2) = h_x(i,2);
+  h_v(j,0) = h_v(i,0);
+  h_v(j,1) = h_v(i,1);
+  h_v(j,2) = h_v(i,2);
+  h_dpdTheta[j] = h_dpdTheta[i];
+  h_uCond[j] = h_uCond[i];
+  h_uMech[j] = h_uMech[i];
+  h_uChem[j] = h_uChem[i];
+  h_uCG[j] = h_uCG[i];  // NOTE(review): uCG/uCGnew have no explicit entry in the sync/modified masks — confirm this is intended
+  h_uCGnew[j] = h_uCGnew[i];
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);  // each registered fix copies its own per-atom data
+
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |  // same field set as the sync() call above
+                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>  // PBC_FLAG: apply periodic shift; TRICLINIC: include tilt factors in the shift
+struct AtomVecDPDKokkos_PackComm {  // functor: packs 7 values per listed atom (x,y,z,dpdTheta,uCond,uMech,uChem) into _buf
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecDPDKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),
+      _list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        const size_t elements = 7;  // operator() writes columns 0..6 per atom, so the buffer view needs 7 columns (was 3)
+        const size_t maxsend = (buf.view<DeviceType>().dimension_0()*buf.view<DeviceType>().dimension_1())/elements;
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {  // pack the i-th atom of swap _iswap into row i of _buf
+      const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+      _buf(i,3) = _dpdTheta(j);  // the four DPD scalars are packed unshifted
+      _buf(i,4) = _uCond(j);
+      _buf(i,5) = _uMech(j);
+      _buf(i,6) = _uChem(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_kokkos(const int &n,  // pack n listed atoms of swap iswap into buf; returns n*size_forward
+                                          const DAT::tdual_int_2d &list,
+                                          const int & iswap,
+                                          const DAT::tdual_xfloat_2d &buf,
+                                          const int &pbc_flag,
+                                          const int* const pbc)
+{
+  // Check whether to always run forward communication on the host
+  // Choose correct forward PackComm kernel
+
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);  // the five fields the kernel reads
+    if(pbc_flag) {  // template arguments below are <execution space, PBC_FLAG, TRICLINIC>
+      if(domain->triclinic) {
+        struct AtomVecDPDKokkos_PackComm<LMPHostType,1,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecDPDKokkos_PackComm<LMPHostType,1,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecDPDKokkos_PackComm<LMPHostType,0,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecDPDKokkos_PackComm<LMPHostType,0,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {  // same four-way dispatch, instantiated for the device execution space
+    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,1,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,0,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      } else {
+        struct AtomVecDPDKokkos_PackComm<LMPDeviceType,0,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          buf,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+        Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+
+	return n*size_forward;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>  // PBC_FLAG: apply periodic shift; TRICLINIC: include tilt factors in the shift
+struct AtomVecDPDKokkos_PackCommSelf {  // functor: copies listed atoms directly into slots starting at _nfirst, with no buffer
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_x_array _xw;  // second view of the same x array, used for the writes
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  int _nfirst;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+
+  AtomVecDPDKokkos_PackCommSelf(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const int &nfirst,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_xw(x.view<DeviceType>()),
+      _dpdTheta(dpdTheta.view<DeviceType>()),
+      _uCond(uCond.view<DeviceType>()),
+      _uMech(uMech.view<DeviceType>()),
+      _uChem(uChem.view<DeviceType>()),      
+      _nfirst(nfirst),_list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];  // copy the six pbc entries by value into the functor
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {  // copy atom _list(_iswap,i) into slot i+_nfirst
+        const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _xw(i+_nfirst,0) = _x(j,0);
+          _xw(i+_nfirst,1) = _x(j,1);
+          _xw(i+_nfirst,2) = _x(j,2);
+      } else {
+        if (TRICLINIC == 0) {
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        } else {
+          _xw(i+_nfirst,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _xw(i+_nfirst,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _xw(i+_nfirst,2) = _x(j,2) + _pbc[2]*_zprd;
+        }
+      }
+      _dpdTheta(i+_nfirst) = _dpdTheta(j);  // the four DPD scalars are copied unshifted
+      _uCond(i+_nfirst) = _uCond(j);
+      _uMech(i+_nfirst) = _uMech(j);
+      _uChem(i+_nfirst) = _uChem(j); 
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list, const int & iswap,
+										const int nfirst, const int &pbc_flag, const int* const pbc) {  // copy n listed atoms to slots nfirst.. in place
+  if(commKK->forward_comm_on_host) {
+    sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);  // the kernel writes the same five fields it reads
+    if(pbc_flag) {  // template arguments below are <execution space, PBC_FLAG, TRICLINIC>
+      if(domain->triclinic) {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,1,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,0,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPHostType,0,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+  } else {  // same four-way dispatch, instantiated for the device execution space
+    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    if(pbc_flag) {
+      if(domain->triclinic) {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,1,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    } else {
+      if(domain->triclinic) {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,0,1> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      } else {
+      struct AtomVecDPDKokkos_PackCommSelf<LMPDeviceType,0,0> f(atomKK->k_x,
+          atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+          nfirst,list,iswap,
+          domain->xprd,domain->yprd,domain->zprd,
+          domain->xy,domain->xz,domain->yz,pbc);
+      Kokkos::parallel_for(n,f);
+      }
+    }
+  }
+	return n*3;  // NOTE(review): n*3 although 7 values per atom are copied (pack_comm_kokkos returns n*size_forward) — confirm callers ignore this
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackComm {  // functor: unpacks 7 values per atom from _buf into slots starting at _first
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  int _first;
+
+  AtomVecDPDKokkos_UnpackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_efloat_1d &dpdTheta,
+      const typename DAT::tdual_efloat_1d &uCond,
+      const typename DAT::tdual_efloat_1d &uMech,
+      const typename DAT::tdual_efloat_1d &uChem,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const int& first):_x(x.view<DeviceType>()),
+                        _dpdTheta(dpdTheta.view<DeviceType>()),
+                        _uCond(uCond.view<DeviceType>()),
+                        _uMech(uMech.view<DeviceType>()),
+                        _uChem(uChem.view<DeviceType>()),
+                        _buf(buf.view<DeviceType>()),  // NOTE(review): uses buf's native 2-d shape; confirm it has >= 7 columns per row
+                        _first(first) {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {  // row i of _buf goes to atom slot i+_first
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+      _dpdTheta(i+_first) = _buf(i,3);  // column order mirrors what AtomVecDPDKokkos_PackComm writes
+      _uCond(i+_first) = _buf(i,4);
+      _uMech(i+_first) = _buf(i,5);
+      _uChem(i+_first) = _buf(i,6);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_comm_kokkos(const int &n, const int &first,
+    const DAT::tdual_xfloat_2d &buf ) {
+  // Unpack n rows of forward-communication data starting at atom `first`.
+  // The execution space follows commKK->forward_comm_on_host; the target
+  // arrays are synced to that space and then flagged as modified there.
+  if (!commKK->forward_comm_on_host) {
+    sync(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    modified(Device,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+    AtomVecDPDKokkos_UnpackComm<LMPDeviceType> unpack(atomKK->k_x,
+      atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+      buf,first);
+    Kokkos::parallel_for(n,unpack);
+    return;
+  }
+
+  sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+  modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+  AtomVecDPDKokkos_UnpackComm<LMPHostType> unpack(atomKK->k_x,
+    atomKK->k_dpdTheta,atomKK->k_uCond,atomKK->k_uMech,atomKK->k_uChem,
+    buf,first);
+  Kokkos::parallel_for(n,unpack);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  // host-side pack of 7 values per listed atom:
+  // x,y,z,dpdTheta,uCond,uMech,uChem; returns the number of doubles written
+  sync(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+
+  int m = 0;
+
+  if (pbc_flag != 0) {
+    // periodic shift added to every packed coordinate
+    double dx,dy,dz;
+    if (domain->triclinic != 0) {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (int i = 0; i < n; ++i, m += 7) {
+      const int j = list[i];
+      buf[m]   = h_x(j,0) + dx;
+      buf[m+1] = h_x(j,1) + dy;
+      buf[m+2] = h_x(j,2) + dz;
+      buf[m+3] = h_dpdTheta(j);
+      buf[m+4] = h_uCond(j);
+      buf[m+5] = h_uMech(j);
+      buf[m+6] = h_uChem(j);
+    }
+  } else {
+    for (int i = 0; i < n; ++i, m += 7) {
+      const int j = list[i];
+      buf[m]   = h_x(j,0);
+      buf[m+1] = h_x(j,1);
+      buf[m+2] = h_x(j,2);
+      buf[m+3] = h_dpdTheta(j);
+      buf[m+4] = h_uCond(j);
+      buf[m+5] = h_uMech(j);
+      buf[m+6] = h_uChem(j);
+    }
+  }
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  // host-side pack of 10 values per listed atom:
+  // x,y,z,vx,vy,vz,dpdTheta,uCond,uMech,uChem
+  // returns the number of doubles written to buf
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  // MASK_MASK is synced as well because the deform_vremap branch below
+  // reads the per-atom group mask on the host
+  sync(Host,X_MASK|V_MASK|MASK_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      buf[m++] = h_dpdTheta(j);
+      buf[m++] = h_uCond(j);
+      buf[m++] = h_uMech(j);
+      buf[m++] = h_uChem(j);
+    }
+  } else {
+    // periodic shift applied to the packed coordinates
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+      }
+    } else {
+      // velocity shift for atoms whose mask has deform_groupbit set
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        // test the group bit of the atom being packed (j), not of the
+        // unrelated atom that happens to sit at loop position i
+        if (h_mask(j) & deform_groupbit) {
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+      }
+    }
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_comm(int n, int first, double *buf)
+{
+  // host-side unpack of 7 values per atom into slots first..first+n-1
+  const int last = first + n;
+  int m = 0;
+
+  for (int i = first; i < last; ++i, m += 7) {
+    h_x(i,0) = buf[m];
+    h_x(i,1) = buf[m+1];
+    h_x(i,2) = buf[m+2];
+    h_dpdTheta(i) = buf[m+3];
+    h_uCond(i) = buf[m+4];
+    h_uMech(i) = buf[m+5];
+    h_uChem(i) = buf[m+6];
+  }
+
+  // flag the arrays just written as modified on the host
+  modified(Host,X_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  // host-side unpack of 10 values per atom (positions, velocities and
+  // four DPD quantities) into slots first..first+n-1
+  const int last = first + n;
+  int m = 0;
+
+  for (int i = first; i < last; ++i, m += 10) {
+    h_x(i,0) = buf[m];
+    h_x(i,1) = buf[m+1];
+    h_x(i,2) = buf[m+2];
+    h_v(i,0) = buf[m+3];
+    h_v(i,1) = buf[m+4];
+    h_v(i,2) = buf[m+5];
+    h_dpdTheta(i) = buf[m+6];
+    h_uCond(i) = buf[m+7];
+    h_uMech(i) = buf[m+8];
+    h_uChem(i) = buf[m+9];
+  }
+
+  // flag the arrays just written as modified on the host
+  modified(Host,X_MASK|V_MASK|DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_reverse(int n, int first, double *buf)
+{
+  // pack the three force components of atoms first..first+n-1;
+  // returns the number of doubles written
+  if (n > 0) {
+    sync(Host,F_MASK);
+  }
+
+  int nwritten = 0;
+  for (int iatom = first; iatom < first + n; ++iatom, nwritten += 3) {
+    buf[nwritten]   = h_f(iatom,0);
+    buf[nwritten+1] = h_f(iatom,1);
+    buf[nwritten+2] = h_f(iatom,2);
+  }
+  return nwritten;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  // accumulate received force triples onto the atoms named in list
+  if (n > 0) {
+    sync(Host,F_MASK);
+    modified(Host,F_MASK);
+  }
+
+  const double *src = buf;
+  for (int k = 0; k < n; ++k, src += 3) {
+    const int iatom = list[k];
+    h_f(iatom,0) += src[0];
+    h_f(iatom,1) += src[1];
+    h_f(iatom,2) += src[2];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that packs border data for the atoms listed in row _iswap of
+// _list.  Each buffer row holds 12 values:
+//   x,y,z,tag,type,mask,dpdTheta,uCond,uMech,uChem,uCG,uCGnew
+// When PBC_FLAG is non-zero the shift (_dx,_dy,_dz) is added to x,y,z.
+template<class DeviceType,int PBC_FLAG>
+struct AtomVecDPDKokkos_PackBorder {
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
+  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  const typename ArrayTypes<DeviceType>::t_int_1d _type;
+  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  X_FLOAT _dx,_dy,_dz;
+
+  AtomVecDPDKokkos_PackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
+      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
+      const int & iswap,
+      const typename ArrayTypes<DeviceType>::t_x_array &x,
+      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      const typename ArrayTypes<DeviceType>::t_int_1d &type,
+      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
+      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
+      _buf(buf),_list(list),_iswap(iswap),
+      _x(x),_tag(tag),_type(type),_mask(mask),
+      _dpdTheta(dpdTheta),
+      _uCond(uCond),
+      _uMech(uMech),
+      _uChem(uChem),
+      _uCG(uCG),        // was initialized from uCGnew, which packed uCGnew twice
+      _uCGnew(uCGnew),
+      _dx(dx),_dy(dy),_dz(dz) {}
+
+  // pack atom _list(_iswap,i) into buffer row i
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      const int j = _list(_iswap,i);
+      if (PBC_FLAG == 0) {
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+      } else {
+          _buf(i,0) = _x(j,0) + _dx;
+          _buf(i,1) = _x(j,1) + _dy;
+          _buf(i,2) = _x(j,2) + _dz;
+      }
+      _buf(i,3) = _tag(j);
+      _buf(i,4) = _type(j);
+      _buf(i,5) = _mask(j);
+      _buf(i,6) = _dpdTheta(j);
+      _buf(i,7) = _uCond(j);
+      _buf(i,8) = _uMech(j);
+      _buf(i,9) = _uChem(j);
+      _buf(i,10) = _uCG(j);
+      _buf(i,11) = _uCGnew(j);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Pack border data for n atoms (row iswap of k_sendlist) into buf using a
+// Kokkos kernel in the requested execution space.
+// Returns the number of values packed: 12 per atom, matching the columns
+// written by AtomVecDPDKokkos_PackBorder and the host pack_border().
+int AtomVecDPDKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
+                               int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  X_FLOAT dx,dy,dz;
+
+  sync(space,ALL_MASK);
+
+  if (pbc_flag != 0) {
+    // periodic shift added to the packed coordinates
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if(space==Host) {
+      AtomVecDPDKokkos_PackBorder<LMPHostType,1> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    } else {
+      AtomVecDPDKokkos_PackBorder<LMPDeviceType,1> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    }
+
+  } else {
+    dx = dy = dz = 0;
+    if(space==Host) {
+      AtomVecDPDKokkos_PackBorder<LMPHostType,0> f(
+        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
+        iswap,h_x,h_tag,h_type,h_mask,
+        h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    } else {
+      AtomVecDPDKokkos_PackBorder<LMPDeviceType,0> f(
+        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
+        iswap,d_x,d_tag,d_type,d_mask,
+        d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+        dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    }
+  }
+  // 12 values per atom are written above (columns 0..11); the previous
+  // n*6 under-reported the packed size
+  return n*12;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  // host-side border pack, 12 values per listed atom:
+  // x,y,z,tag,type,mask,dpdTheta,uCond,uMech,uChem,uCG,uCGnew
+  // followed by whatever the extra-border fixes append
+  sync(Host,ALL_MASK);
+
+  int m = 0;
+
+  if (pbc_flag != 0) {
+    // periodic shift added to the packed coordinates
+    double dx,dy,dz;
+    if (domain->triclinic != 0) {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    } else {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    }
+    for (int i = 0; i < n; ++i, m += 12) {
+      const int j = list[i];
+      buf[m]    = h_x(j,0) + dx;
+      buf[m+1]  = h_x(j,1) + dy;
+      buf[m+2]  = h_x(j,2) + dz;
+      buf[m+3]  = ubuf(h_tag(j)).d;
+      buf[m+4]  = ubuf(h_type(j)).d;
+      buf[m+5]  = ubuf(h_mask(j)).d;
+      buf[m+6]  = h_dpdTheta(j);
+      buf[m+7]  = h_uCond(j);
+      buf[m+8]  = h_uMech(j);
+      buf[m+9]  = h_uChem(j);
+      buf[m+10] = h_uCG(j);
+      buf[m+11] = h_uCGnew(j);
+    }
+  } else {
+    for (int i = 0; i < n; ++i, m += 12) {
+      const int j = list[i];
+      buf[m]    = h_x(j,0);
+      buf[m+1]  = h_x(j,1);
+      buf[m+2]  = h_x(j,2);
+      buf[m+3]  = ubuf(h_tag(j)).d;
+      buf[m+4]  = ubuf(h_type(j)).d;
+      buf[m+5]  = ubuf(h_mask(j)).d;
+      buf[m+6]  = h_dpdTheta(j);
+      buf[m+7]  = h_uCond(j);
+      buf[m+8]  = h_uMech(j);
+      buf[m+9]  = h_uChem(j);
+      buf[m+10] = h_uCG(j);
+      buf[m+11] = h_uCGnew(j);
+    }
+  }
+
+  // let each registered extra-border fix append its own data
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  // host-side border pack including velocities, 15 values per listed atom:
+  // x,y,z,tag,type,mask,vx,vy,vz,dpdTheta,uCond,uMech,uChem,uCG,uCGnew
+  // followed by whatever the extra-border fixes append;
+  // returns the number of doubles written to buf
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  sync(Host,ALL_MASK);
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      buf[m++] = h_dpdTheta(j);
+      buf[m++] = h_uCond(j);
+      buf[m++] = h_uMech(j);
+      buf[m++] = h_uChem(j);
+      buf[m++] = h_uCG(j);
+      buf[m++] = h_uCGnew(j);
+    }
+  } else {
+    // periodic shift added to the packed coordinates
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag(j)).d;
+        buf[m++] = ubuf(h_type(j)).d;
+        buf[m++] = ubuf(h_mask(j)).d;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+        buf[m++] = h_uCG(j);
+        buf[m++] = h_uCGnew(j);
+      }
+    } else {
+      // velocity shift for atoms whose mask has deform_groupbit set
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag(j)).d;
+        buf[m++] = ubuf(h_type(j)).d;
+        buf[m++] = ubuf(h_mask(j)).d;
+        // test the group bit of the atom being packed (j), not of the
+        // unrelated atom that happens to sit at loop position i
+        if (h_mask(j) & deform_groupbit) {
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        buf[m++] = h_dpdTheta(j);
+        buf[m++] = h_uCond(j);
+        buf[m++] = h_uMech(j);
+        buf[m++] = h_uChem(j);
+        buf[m++] = h_uCG(j);
+        buf[m++] = h_uCGnew(j);
+      }
+    }
+  }
+
+  // let each registered extra-border fix append its own data
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_comm_hybrid(int n, int *list, double *buf)
+{
+  // pack dpdTheta,uCond,uMech,uChem for each listed atom;
+  // returns the number of doubles written
+  sync(Host,DPDTHETA_MASK | UCOND_MASK |
+            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  int m = 0;
+  for (int k = 0; k < n; ++k, m += 4) {
+    const int iatom = list[k];
+    buf[m]   = h_dpdTheta(iatom);
+    buf[m+1] = h_uCond(iatom);
+    buf[m+2] = h_uMech(iatom);
+    buf[m+3] = h_uChem(iatom);
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_border_hybrid(int n, int *list, double *buf)
+{
+  // pack the six DPD per-atom quantities for each listed atom;
+  // returns the number of doubles written
+  sync(Host,DPDTHETA_MASK | UCOND_MASK |
+            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  int m = 0;
+  for (int k = 0; k < n; ++k, m += 6) {
+    const int iatom = list[k];
+    buf[m]   = h_dpdTheta(iatom);
+    buf[m+1] = h_uCond(iatom);
+    buf[m+2] = h_uMech(iatom);
+    buf[m+3] = h_uChem(iatom);
+    buf[m+4] = h_uCG(iatom);
+    buf[m+5] = h_uCGnew(iatom);
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that unpacks border data: buffer row i is written to atom slot
+// i+_first.  Each buffer row holds 12 values:
+//   x,y,z,tag,type,mask,dpdTheta,uCond,uMech,uChem,uCG,uCGnew
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackBorder {
+  typedef DeviceType device_type;
+
+  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  typename ArrayTypes<DeviceType>::t_int_1d _type;
+  typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_efloat_1d _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  int _first;
+
+
+  AtomVecDPDKokkos_UnpackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
+      typename ArrayTypes<DeviceType>::t_x_array &x,
+      typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      typename ArrayTypes<DeviceType>::t_int_1d &type,
+      typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &dpdTheta,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCond,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uMech,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uChem,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCG,
+      const typename ArrayTypes<DeviceType>::t_efloat_1d &uCGnew,
+      const int& first):
+      _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),
+      _dpdTheta(dpdTheta),
+      _uCond(uCond),
+      _uMech(uMech),
+      _uChem(uChem),
+      _uCG(uCG),        // was initialized from uCGnew, so uCG was never filled
+      _uCGnew(uCGnew),
+      _first(first) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+      // tag is stored in a tagint array; convert to tagint rather than int
+      // so that the value is not narrowed when tagint is wider than int
+      _tag(i+_first) = static_cast<tagint> (_buf(i,3));
+      _type(i+_first) = static_cast<int>  (_buf(i,4));
+      _mask(i+_first) = static_cast<int>  (_buf(i,5));
+      _dpdTheta(i+_first) = _buf(i,6);
+      _uCond(i+_first) = _buf(i,7);
+      _uMech(i+_first) = _buf(i,8);
+      _uChem(i+_first) = _buf(i,9);
+      _uCG(i+_first) = _buf(i,10);
+      _uCGnew(i+_first) = _buf(i,11);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack n rows of border data from buf into atom slots first..first+n-1
+// using a Kokkos kernel in the requested execution space.
+void AtomVecDPDKokkos::unpack_border_kokkos(const int &n, const int &first,
+                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
+  // NOTE(review): modified() is called both before and after the grow loop;
+  // presumably the first call protects the data during grow() - confirm
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                 UCG_MASK|UCGNEW_MASK);
+  // keep calling grow(0) until first+n is below nmax
+  while (first+n >= nmax) grow(0);
+  // same flags as above plus DVECTOR_MASK
+  modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                 DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                 UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
+  if(space==Host) {
+    // host path: functor operates on the h_* views
+    struct AtomVecDPDKokkos_UnpackBorder<LMPHostType> f(buf.view<LMPHostType>(),
+      h_x,h_tag,h_type,h_mask,
+      h_dpdTheta,h_uCond,h_uMech,h_uChem,h_uCG,h_uCGnew,
+      first);
+    Kokkos::parallel_for(n,f);
+  } else {
+    // device path: functor operates on the d_* views
+    struct AtomVecDPDKokkos_UnpackBorder<LMPDeviceType> f(buf.view<LMPDeviceType>(),
+      d_x,d_tag,d_type,d_mask,
+      d_dpdTheta,d_uCond,d_uMech,d_uChem,d_uCG,d_uCGnew,
+      first);
+    Kokkos::parallel_for(n,f);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_border(int n, int first, double *buf)
+{
+  // host-side border unpack, 12 values per atom, into slots
+  // first..first+n-1; extra-border fixes then consume the rest of buf
+  const int last = first + n;
+  int m = 0;
+
+  for (int i = first; i < last; ++i, m += 12) {
+    // grow(0) is invoked when the slot index reaches nmax
+    if (i == nmax) grow(0);
+
+    h_x(i,0) = buf[m];
+    h_x(i,1) = buf[m+1];
+    h_x(i,2) = buf[m+2];
+    h_tag(i) =  (tagint)  ubuf(buf[m+3]).i;
+    h_type(i) = (int) ubuf(buf[m+4]).i;
+    h_mask(i) = (int) ubuf(buf[m+5]).i;
+    h_dpdTheta(i) = buf[m+6];
+    h_uCond(i) = buf[m+7];
+    h_uMech(i) = buf[m+8];
+    h_uChem(i) = buf[m+9];
+    h_uCG(i) = buf[m+10];
+    h_uCGnew(i) = buf[m+11];
+  }
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[m]);
+
+  // flag the arrays written above as modified on the host
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  // host-side border unpack including velocities, 15 values per atom,
+  // into slots first..first+n-1; extra-border fixes then consume the rest
+  const int last = first + n;
+  int m = 0;
+
+  for (int i = first; i < last; ++i, m += 15) {
+    // grow(0) is invoked when the slot index reaches nmax
+    if (i == nmax) grow(0);
+
+    h_x(i,0) = buf[m];
+    h_x(i,1) = buf[m+1];
+    h_x(i,2) = buf[m+2];
+    h_tag(i) =  (tagint)  ubuf(buf[m+3]).i;
+    h_type(i) = (int) ubuf(buf[m+4]).i;
+    h_mask(i) = (int) ubuf(buf[m+5]).i;
+    h_v(i,0) = buf[m+6];
+    h_v(i,1) = buf[m+7];
+    h_v(i,2) = buf[m+8];
+    h_dpdTheta(i) = buf[m+9];
+    h_uCond(i) = buf[m+10];
+    h_uMech(i) = buf[m+11];
+    h_uChem(i) = buf[m+12];
+    h_uCG(i) = buf[m+13];
+    h_uCGnew(i) = buf[m+14];
+  }
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[m]);
+
+  // flag the arrays written above as modified on the host
+  modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|
+                DPDTHETA_MASK|UCOND_MASK|UMECH_MASK|UCHEM_MASK|
+                UCG_MASK|UCGNEW_MASK|DVECTOR_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_comm_hybrid(int n, int first, double *buf)
+{
+  // unpack dpdTheta,uCond,uMech,uChem into slots first..first+n-1;
+  // returns the number of doubles consumed
+  const int last = first + n;
+  int m = 0;
+
+  for (int iatom = first; iatom < last; ++iatom, m += 4) {
+    h_dpdTheta(iatom) = buf[m];
+    h_uCond(iatom) = buf[m+1];
+    h_uMech(iatom) = buf[m+2];
+    h_uChem(iatom) = buf[m+3];
+  }
+
+  modified(Host,DPDTHETA_MASK | UCOND_MASK |
+                UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_border_hybrid(int n, int first, double *buf)
+{
+  // unpack the six DPD per-atom quantities into slots first..first+n-1;
+  // returns the number of doubles consumed
+  const int last = first + n;
+  int m = 0;
+
+  for (int iatom = first; iatom < last; ++iatom, m += 6) {
+    h_dpdTheta(iatom) = buf[m];
+    h_uCond(iatom) = buf[m+1];
+    h_uMech(iatom) = buf[m+2];
+    h_uChem(iatom) = buf[m+3];
+    h_uCG(iatom) = buf[m+4];
+    h_uCGnew(iatom) = buf[m+5];
+  }
+
+  modified(Host,DPDTHETA_MASK | UCOND_MASK |
+                UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that packs atoms leaving this processor into an exchange buffer
+// (17 values per atom) and then overwrites each vacated slot with the atom
+// named by the copy list, when that entry is > -1.
+template<class DeviceType>
+struct AtomVecDPDKokkos_PackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  // read views of the per-atom arrays
+  typename AT::t_x_array_randomread _x;
+  typename AT::t_v_array_randomread _v;
+  typename AT::t_tagint_1d_randomread _tag;
+  typename AT::t_int_1d_randomread _type;
+  typename AT::t_int_1d_randomread _mask;
+  typename AT::t_imageint_1d_randomread _image;
+  typename AT::t_efloat_1d_randomread _dpdTheta,_uCond,_uMech,_uChem,_uCG,_uCGnew;
+  // write views, constructed from the same dual views as the read views above
+  typename AT::t_x_array _xw;
+  typename AT::t_v_array _vw;
+  typename AT::t_tagint_1d _tagw;
+  typename AT::t_int_1d _typew;
+  typename AT::t_int_1d _maskw;
+  typename AT::t_imageint_1d _imagew;
+  typename AT::t_efloat_1d _dpdThetaw,_uCondw,_uMechw,_uChemw,_uCGw,_uCGneww;
+
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d_const _sendlist;
+  typename AT::t_int_1d_const _copylist;
+  // NOTE(review): _nlocal, _dim, _lo and _hi are stored but not read by operator()
+  int _nlocal,_dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecDPDKokkos_PackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d sendlist,
+      typename AT::tdual_int_1d copylist,int nlocal, int dim,
+                X_FLOAT lo, X_FLOAT hi):
+                _x(atom->k_x.view<DeviceType>()),
+                _v(atom->k_v.view<DeviceType>()),
+                _tag(atom->k_tag.view<DeviceType>()),
+                _type(atom->k_type.view<DeviceType>()),
+                _mask(atom->k_mask.view<DeviceType>()),
+                _image(atom->k_image.view<DeviceType>()),
+                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
+                _uCond(atom->k_uCond.view<DeviceType>()),
+                _uMech(atom->k_uMech.view<DeviceType>()),
+                _uChem(atom->k_uChem.view<DeviceType>()),
+                _uCG(atom->k_uCG.view<DeviceType>()),
+                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
+                _xw(atom->k_x.view<DeviceType>()),
+                _vw(atom->k_v.view<DeviceType>()),
+                _tagw(atom->k_tag.view<DeviceType>()),
+                _typew(atom->k_type.view<DeviceType>()),
+                _maskw(atom->k_mask.view<DeviceType>()),
+                _imagew(atom->k_image.view<DeviceType>()),
+                _dpdThetaw(atom->k_dpdTheta.view<DeviceType>()),
+                _uCondw(atom->k_uCond.view<DeviceType>()),
+                _uMechw(atom->k_uMech.view<DeviceType>()),
+                _uChemw(atom->k_uChem.view<DeviceType>()),
+                _uCGw(atom->k_uCG.view<DeviceType>()),
+                _uCGneww(atom->k_uCGnew.view<DeviceType>()),
+                _sendlist(sendlist.template view<DeviceType>()),
+                _copylist(copylist.template view<DeviceType>()),
+                _nlocal(nlocal),_dim(dim),
+                _lo(lo),_hi(hi){
+    // number of values stored per exchanged atom
+    const size_t elements = 17;
+    // how many 17-value rows fit into the total extent of buf
+    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
+
+    // presumably sets _buf up as a (maxsendlist x elements) view of buf's
+    // storage - confirm against buffer_view
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &mysend) const {
+    // atom to send
+    const int i = _sendlist(mysend);
+    // column 0 holds the per-atom value count
+    _buf(mysend,0) = 17;
+    _buf(mysend,1) = _x(i,0);
+    _buf(mysend,2) = _x(i,1);
+    _buf(mysend,3) = _x(i,2);
+    _buf(mysend,4) = _v(i,0);
+    _buf(mysend,5) = _v(i,1);
+    _buf(mysend,6) = _v(i,2);
+    _buf(mysend,7) = _tag[i];
+    _buf(mysend,8) = _type[i];
+    _buf(mysend,9) = _mask[i];
+    _buf(mysend,10) = _image[i];
+    _buf(mysend,11) = _dpdTheta[i];
+    _buf(mysend,12) = _uCond[i];
+    _buf(mysend,13) = _uMech[i];
+    _buf(mysend,14) = _uChem[i];
+    _buf(mysend,15) = _uCG[i];
+    _buf(mysend,16) = _uCGnew[i];
+    // atom whose data replaces slot i; values <= -1 mean no copy
+    const int j = _copylist(mysend);
+
+    // the copy must come after the pack above, since it overwrites slot i
+    if(j>-1) {
+    _xw(i,0) = _x(j,0);
+    _xw(i,1) = _x(j,1);
+    _xw(i,2) = _x(j,2);
+    _vw(i,0) = _v(j,0);
+    _vw(i,1) = _v(j,1);
+    _vw(i,2) = _v(j,2);
+    _tagw[i] = _tag(j);
+    _typew[i] = _type(j);
+    _maskw[i] = _mask(j);
+    _imagew[i] = _image(j);
+    _dpdThetaw[i] = _dpdTheta(j);
+    _uCondw[i] = _uCond(j);
+    _uMechw[i] = _uMech(j);
+    _uChemw[i] = _uChem(j);
+    _uCGw[i] = _uCG(j);
+    _uCGneww[i] = _uCGnew(j);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+// Pack nsend departing atoms (17 values each) into k_buf with a Kokkos
+// kernel in the requested execution space; returns nsend*17.
+int AtomVecDPDKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf, DAT::tdual_int_1d k_sendlist,DAT::tdual_int_1d k_copylist,ExecutionSpace space,int dim,X_FLOAT lo,X_FLOAT hi )
+{
+  // resize k_buf when its total extent cannot hold nsend rows of 17 values;
+  // the second dimension is kept and only the first dimension changes
+  if(nsend > (int) (k_buf.view<LMPHostType>().dimension_0()*k_buf.view<LMPHostType>().dimension_1())/17) {
+    int newsize = nsend*17/k_buf.view<LMPHostType>().dimension_1()+1;
+    k_buf.resize(newsize,k_buf.view<LMPHostType>().dimension_1());
+  }
+  // bring the listed per-atom arrays up to date in the execution space
+  sync(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+             MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+             UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+             DVECTOR_MASK);
+  if(space == Host) {
+    AtomVecDPDKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+  } else {
+    AtomVecDPDKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+  }
+  // NOTE(review): the functor also writes per-atom arrays (copylist backfill)
+  // but no modified() call is made here - confirm the caller handles it
+  return nsend*17;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_exchange(int i, double *buf)
+{
+  // host-side pack of atom i for exchange; buf[0] receives the total
+  // number of values written (including buf[0] itself and fix data)
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+            UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+            DVECTOR_MASK);
+
+  // slot 0 is reserved for the length and filled in at the end
+  buf[1]  = h_x(i,0);
+  buf[2]  = h_x(i,1);
+  buf[3]  = h_x(i,2);
+  buf[4]  = h_v(i,0);
+  buf[5]  = h_v(i,1);
+  buf[6]  = h_v(i,2);
+  buf[7]  = ubuf(h_tag(i)).d;
+  buf[8]  = ubuf(h_type(i)).d;
+  buf[9]  = ubuf(h_mask(i)).d;
+  buf[10] = ubuf(h_image(i)).d;
+  buf[11] = h_dpdTheta(i);
+  buf[12] = h_uCond(i);
+  buf[13] = h_uMech(i);
+  buf[14] = h_uChem(i);
+  buf[15] = h_uCG(i);
+  buf[16] = h_uCGnew(i);
+  int m = 17;
+
+  // let each registered extra-grow fix append its own data for atom i
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
+
+  buf[0] = m;
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Functor that unpacks received exchange data (17 values per atom).
+// A row is accepted when its coordinate along dimension _dim lies in
+// [_lo,_hi); accepted atoms are appended at the index obtained by
+// atomically incrementing _nlocal(0).
+template<class DeviceType>
+struct AtomVecDPDKokkos_UnpackExchangeFunctor {
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array _x;
+  typename AT::t_v_array _v;
+  typename AT::t_tagint_1d _tag;
+  typename AT::t_int_1d _type;
+  typename AT::t_int_1d _mask;
+  typename AT::t_imageint_1d _image;
+  typename AT::t_efloat_1d _dpdTheta;
+  typename AT::t_efloat_1d _uCond;
+  typename AT::t_efloat_1d _uMech;
+  typename AT::t_efloat_1d _uChem;
+  typename AT::t_efloat_1d _uCG;
+  typename AT::t_efloat_1d _uCGnew;
+
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d _nlocal;
+  int _dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecDPDKokkos_UnpackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d nlocal,
+      int dim, X_FLOAT lo, X_FLOAT hi):
+                _x(atom->k_x.view<DeviceType>()),
+                _v(atom->k_v.view<DeviceType>()),
+                _tag(atom->k_tag.view<DeviceType>()),
+                _type(atom->k_type.view<DeviceType>()),
+                _mask(atom->k_mask.view<DeviceType>()),
+                _image(atom->k_image.view<DeviceType>()),
+                // the DPD views were previously left default-constructed even
+                // though operator() writes through them
+                _dpdTheta(atom->k_dpdTheta.view<DeviceType>()),
+                _uCond(atom->k_uCond.view<DeviceType>()),
+                _uMech(atom->k_uMech.view<DeviceType>()),
+                _uChem(atom->k_uChem.view<DeviceType>()),
+                _uCG(atom->k_uCG.view<DeviceType>()),
+                _uCGnew(atom->k_uCGnew.view<DeviceType>()),
+                _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
+                _lo(lo),_hi(hi){
+    // number of values stored per exchanged atom
+    const size_t elements = 17;
+    // how many 17-value rows fit into the total extent of buf
+    const int maxsendlist = (buf.template view<DeviceType>().dimension_0()*buf.template view<DeviceType>().dimension_1())/elements;
+
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &myrecv) const {
+    // columns 1..3 hold x,y,z, so _dim+1 selects the tested coordinate
+    X_FLOAT x = _buf(myrecv,_dim+1);
+    if (x >= _lo && x < _hi) {
+      // claim the next free local slot
+      int i = Kokkos::atomic_fetch_add(&_nlocal(0),1);
+      _x(i,0) = _buf(myrecv,1);
+      _x(i,1) = _buf(myrecv,2);
+      _x(i,2) = _buf(myrecv,3);
+      _v(i,0) = _buf(myrecv,4);
+      _v(i,1) = _buf(myrecv,5);
+      _v(i,2) = _buf(myrecv,6);
+      _tag[i] = _buf(myrecv,7);
+      _type[i] = _buf(myrecv,8);
+      _mask[i] = _buf(myrecv,9);
+      _image[i] = _buf(myrecv,10);
+      _dpdTheta[i] = _buf(myrecv,11);
+      _uCond[i] = _buf(myrecv,12);
+      _uMech[i] = _buf(myrecv,13);
+      _uChem[i] = _buf(myrecv,14);
+      _uCG[i] = _buf(myrecv,15);
+      _uCGnew[i] = _buf(myrecv,16);
+    }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,ExecutionSpace space) {
+  // nrecv counts values; each atom occupies 17 of them
+  const int natoms_recv = nrecv/17;
+
+  // seed the shared counter with the current local atom count
+  k_count.h_view(0) = nlocal;
+
+  if (space != Host) {
+    // push the counter to the device, run the kernel, pull the result back
+    k_count.modify<LMPHostType>();
+    k_count.sync<LMPDeviceType>();
+    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPDeviceType> unpack(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(natoms_recv,unpack);
+    k_count.modify<LMPDeviceType>();
+    k_count.sync<LMPHostType>();
+  } else {
+    AtomVecDPDKokkos_UnpackExchangeFunctor<LMPHostType> unpack(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(natoms_recv,unpack);
+  }
+
+  modified(space,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                 MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+                 UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+                 DVECTOR_MASK);
+
+  // updated local atom count after the accepted atoms were appended
+  return k_count.h_view(0);
+}
+
+/* ----------------------------------------------------------------------
+   unpack one atom from an exchange buffer into the host views
+   buf[0] is skipped; x, v, tag, type, mask, image and six DPD values
+   are read from buf[1..16], followed by data owned by fixes
+   appends the atom at index nlocal and increments atom->nlocal
+   returns the buffer index one past the last value consumed
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_exchange(double *buf)
+{
+  const int ilocal = atom->nlocal;
+  if (ilocal == nmax) grow(0);
+
+  int m = 1;
+  for (int d = 0; d < 3; d++) h_x(ilocal,d) = buf[m++];
+  for (int d = 0; d < 3; d++) h_v(ilocal,d) = buf[m++];
+
+  // integer fields travel as raw bits inside a double, decoded via ubuf
+  h_tag(ilocal) = (tagint) ubuf(buf[m++]).i;
+  h_type(ilocal) = (int) ubuf(buf[m++]).i;
+  h_mask(ilocal) = (int) ubuf(buf[m++]).i;
+  h_image(ilocal) = (imageint) ubuf(buf[m++]).i;
+
+  h_dpdTheta[ilocal] = buf[m++];
+  h_uCond[ilocal] = buf[m++];
+  h_uMech[ilocal] = buf[m++];
+  h_uChem[ilocal] = buf[m++];
+  h_uCG[ilocal] = buf[m++];
+  h_uCGnew[ilocal] = buf[m++];
+
+  // each registered fix unpacks its own part and reports how much it read
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++) {
+    m += modify->fix[atom->extra_grow[iextra]]->
+      unpack_exchange(ilocal,&buf[m]);
+  }
+
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK| DPDTHETA_MASK | UCOND_MASK |
+           UMECH_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK |
+           DVECTOR_MASK);
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   number of restart values needed for all atoms owned by this proc
+   15 fixed values per atom plus whatever each restart-storing fix
+   reports for each atom
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::size_restart()
+{
+  const int nlocal = atom->nlocal;
+
+  // fixed part: 11 + dpdTheta + uCond + uMech + uChem
+  int total = 15 * nlocal;
+
+  // variable part contributed by fixes, queried atom by atom
+  for (int iextra = 0; iextra < atom->nextra_restart; iextra++) {
+    for (int iatom = 0; iatom < nlocal; iatom++) {
+      total += modify->fix[atom->extra_restart[iextra]]->size_restart(iatom);
+    }
+  }
+
+  return total;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom i's data for a restart file, including data owned by fixes
+   layout: buf[0] = record length, then x, tag, type, mask, image, v,
+   dpdTheta, uCond, uMech, uChem, then the fix data
+   xyz must stay the first 3 values after the length slot, so that
+   read_restart can test on them
+   returns the record length, which is also stored in buf[0]
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_restart(int i, double *buf)
+{
+  // host copies of every packed field must be current before reading
+  sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+            UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
+
+  int m = 1;
+  for (int d = 0; d < 3; d++) buf[m++] = h_x(i,d);
+
+  // integer fields are stored as raw bits inside a double via ubuf
+  buf[m++] = ubuf(h_tag(i)).d;
+  buf[m++] = ubuf(h_type(i)).d;
+  buf[m++] = ubuf(h_mask(i)).d;
+  buf[m++] = ubuf(h_image(i)).d;
+
+  for (int d = 0; d < 3; d++) buf[m++] = h_v(i,d);
+
+  buf[m++] = h_dpdTheta[i];
+  buf[m++] = h_uCond[i];
+  buf[m++] = h_uMech[i];
+  buf[m++] = h_uChem[i];
+
+  for (int iextra = 0; iextra < atom->nextra_restart; iextra++) {
+    m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
+  }
+
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one atom's restart record into the host views
+   buf[0] holds the record length; x, tag, type, mask, image, v,
+   dpdTheta, uCond, uMech, uChem follow, and any remaining values are
+   copied verbatim into atom->extra for this atom
+   uCG and uCGnew are not part of the record and are not assigned here
+   appends the atom at index nlocal and increments atom->nlocal
+   returns the buffer index one past the last value consumed
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::unpack_restart(double *buf)
+{
+  const int ilocal = atom->nlocal;
+  if (ilocal == nmax) {
+    grow(0);
+    // keep the extra-storage array sized to the new nmax
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+
+  int m = 1;
+  for (int d = 0; d < 3; d++) h_x(ilocal,d) = buf[m++];
+
+  h_tag(ilocal) = (tagint) ubuf(buf[m++]).i;
+  h_type(ilocal) = (int) ubuf(buf[m++]).i;
+  h_mask(ilocal) = (int) ubuf(buf[m++]).i;
+  h_image(ilocal) = (imageint) ubuf(buf[m++]).i;
+
+  for (int d = 0; d < 3; d++) h_v(ilocal,d) = buf[m++];
+
+  h_dpdTheta[ilocal] = buf[m++];
+  h_uCond[ilocal] = buf[m++];
+  h_uMech[ilocal] = buf[m++];
+  h_uChem[ilocal] = buf[m++];
+
+  if (atom->nextra_store) {
+    // whatever is left of the record belongs to fixes
+    double **extra = atom->extra;
+    const int nleft = static_cast<int> (buf[0]) - m;
+    for (int k = 0; k < nleft; k++) extra[ilocal][k] = buf[m++];
+  }
+
+  modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+                MASK_MASK | IMAGE_MASK | DPDTHETA_MASK |
+                UCOND_MASK | UMECH_MASK | UCHEM_MASK | DVECTOR_MASK);
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   set other values to defaults: tag = 0, mask = 1, image flags = IMGMAX
+   in each of the three packed fields, velocity and all DPD per-atom
+   quantities = 0.0
+   all writes go through the host views; every field is flagged as
+   modified on Host
+   appends the atom at index nlocal and increments atom->nlocal
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::create_atom(int itype, double *coord)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    // flag the host data as modified before the arrays are regrown
+    // NOTE(review): presumably so grow() treats the host copy as the
+    // current one -- confirm against grow()
+    atomKK->modified(Host,ALL_MASK);
+    grow(0);
+  }
+  atomKK->modified(Host,ALL_MASK);
+
+  h_tag[nlocal] = 0;
+  h_type[nlocal] = itype;
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+  h_mask[nlocal] = 1;
+  // build the packed image value in the image type itself
+  h_image[nlocal] = ((imageint) IMGMAX << IMG2BITS) |
+    ((imageint) IMGMAX << IMGBITS) | IMGMAX;
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+  h_rho[nlocal] = 0.0;
+  h_dpdTheta[nlocal] = 0.0;
+  h_uCond[nlocal] = 0.0;
+  h_uMech[nlocal] = 0.0;
+  h_uChem[nlocal] = 0.0;
+  h_uCG[nlocal] = 0.0;
+  h_uCGnew[nlocal] = 0.0;
+  h_duChem[nlocal] = 0.0;
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   values[0] = atom tag, values[1] = atom type, values[2] = dpdTheta
+   coord and imagetmp supply the position and packed image flags
+   errors out if the type is outside 1..ntypes or dpdTheta is not > 0
+   initialize other atom quantities: mask = 1, velocity = 0.0,
+   rho, uCond, uMech, uChem, uCG, uCGnew = 0.0
+   appends the atom at index nlocal and increments atom->nlocal
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::data_atom(double *coord, tagint imagetmp,
+                                    char **values)
+{
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  h_tag[nlocal] = ATOTAGINT(values[0]);
+  h_type[nlocal] = atoi(values[1]);
+  // validate through the same host view that was just written
+  if (h_type[nlocal] <= 0 || h_type[nlocal] > atom->ntypes)
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+
+  h_dpdTheta[nlocal] = atof(values[2]);
+  if (h_dpdTheta[nlocal] <= 0)
+    error->one(FLERR,"Internal temperature in Atoms section of data file must be > zero");
+
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+
+  h_image[nlocal] = imagetmp;
+
+  h_mask[nlocal] = 1;
+  h_v(nlocal,0) = 0.0;
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+
+  h_rho[nlocal] = 0.0;
+  h_uCond[nlocal] = 0.0;
+  h_uMech[nlocal] = 0.0;
+  h_uChem[nlocal] = 0.0;
+  h_uCG[nlocal] = 0.0;
+  h_uCGnew[nlocal] = 0.0;
+
+  // everything above was written on the host
+  atomKK->modified(Host,ALL_MASK);
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   hybrid variant of data_atom: read this sub-style's single value
+   values[0] is parsed as dpdTheta for atom nlocal (no range check)
+   returns the number of values consumed, which is 1
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::data_atom_hybrid(int nlocal, char **values)
+{
+  const double theta = atof(values[0]);
+  h_dpdTheta(nlocal) = theta;
+
+  // only dpdTheta was touched, and only on the host
+  atomKK->modified(Host,DPDTHETA_MASK);
+
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+   row i of buf: tag, type, dpdTheta, x, y, z, image x, image y, image z
+   tag, type and the image flags are integers stored as raw bits inside
+   a double via ubuf, matching the ubuf(...).i decoding in write_data;
+   dpdTheta and the coordinates are stored as plain doubles
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::pack_data(double **buf)
+{
+  // host copies of the packed fields must be current before reading
+  atomKK->sync(Host,TAG_MASK|TYPE_MASK|DPDTHETA_MASK|X_MASK|IMAGE_MASK);
+
+  int nlocal = atom->nlocal;
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = ubuf(h_tag(i)).d;
+    buf[i][1] = ubuf(h_type(i)).d;
+    buf[i][2] = h_dpdTheta(i);
+    buf[i][3] = h_x(i,0);
+    buf[i][4] = h_x(i,1);
+    buf[i][5] = h_x(i,2);
+    // unpack the three image fields and re-center them around zero;
+    // a plain numeric store here would be misread by write_data
+    buf[i][6] = ubuf((h_image[i] & IMGMASK) - IMGMAX).d;
+    buf[i][7] = ubuf((h_image[i] >> IMGBITS & IMGMASK) - IMGMAX).d;
+    buf[i][8] = ubuf((h_image[i] >> IMG2BITS) - IMGMAX).d;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   hybrid variant of pack_data: store atom i's dpdTheta in buf[0]
+   returns the number of values packed, which is 1
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::pack_data_hybrid(int i, double *buf)
+{
+  // host copy of dpdTheta must be current before reading
+  atomKK->sync(Host,DPDTHETA_MASK);
+
+  *buf = h_dpdTheta(i);
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   write n rows of packed atom info to a data file, one atom per line:
+   tag type dpdTheta x y z ix iy iz
+   columns 0, 1 and 6..8 are decoded as integers from raw bits via ubuf
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int irow = 0; irow < n; irow++) {
+    const double *row = buf[irow];
+    fprintf(fp,TAGINT_FORMAT " %d %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
+            (tagint) ubuf(row[0]).i,(int) ubuf(row[1]).i,
+            row[2],row[3],row[4],row[5],
+            (int) ubuf(row[6]).i,(int) ubuf(row[7]).i,
+            (int) ubuf(row[8]).i);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   hybrid variant of write_data: append buf[0] to the current line
+   returns the number of values written, which is 1
+------------------------------------------------------------------------- */
+
+int AtomVecDPDKokkos::write_data_hybrid(FILE *fp, double *buf)
+{
+  const double theta = buf[0];
+  fprintf(fp," %-1.16e",theta);
+  return 1;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated per-atom memory
+   an array is counted only if atom->memcheck() accepts its name
+   f is sized by nmax*commKK->nthreads rows, all others by nmax
+------------------------------------------------------------------------- */
+
+bigint AtomVecDPDKokkos::memory_usage()
+{
+  bigint total = 0;
+
+  // identity and bookkeeping arrays
+  if (atom->memcheck("tag")) { total += memory->usage(tag,nmax); }
+  if (atom->memcheck("type")) { total += memory->usage(type,nmax); }
+  if (atom->memcheck("mask")) { total += memory->usage(mask,nmax); }
+  if (atom->memcheck("image")) { total += memory->usage(image,nmax); }
+
+  // position, velocity, force
+  if (atom->memcheck("x")) { total += memory->usage(x,nmax,3); }
+  if (atom->memcheck("v")) { total += memory->usage(v,nmax,3); }
+  if (atom->memcheck("f")) {
+    total += memory->usage(f,nmax*commKK->nthreads,3);
+  }
+
+  // DPD per-atom quantities
+  if (atom->memcheck("rho")) { total += memory->usage(rho,nmax); }
+  if (atom->memcheck("dpdTheta")) { total += memory->usage(dpdTheta,nmax); }
+  if (atom->memcheck("uCond")) { total += memory->usage(uCond,nmax); }
+  if (atom->memcheck("uMech")) { total += memory->usage(uMech,nmax); }
+  if (atom->memcheck("uChem")) { total += memory->usage(uChem,nmax); }
+  if (atom->memcheck("uCG")) { total += memory->usage(uCG,nmax); }
+  if (atom->memcheck("uCGnew")) { total += memory->usage(uCGnew,nmax); }
+  if (atom->memcheck("duChem")) { total += memory->usage(duChem,nmax); }
+
+  return total;
+}
+
+/* ----------------------------------------------------------------------
+   call sync<>() on the dual view of every field selected in mask
+   space == Device targets LMPDeviceType, any other value LMPHostType
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::sync(ExecutionSpace space, unsigned int mask)
+{
+  if (space != Device) {
+    // host side
+    if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
+    if (mask & DPDRHO_MASK) atomKK->k_rho.sync<LMPHostType>();
+    if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.sync<LMPHostType>();
+    if (mask & UCOND_MASK) atomKK->k_uCond.sync<LMPHostType>();
+    if (mask & UMECH_MASK) atomKK->k_uMech.sync<LMPHostType>();
+    if (mask & UCHEM_MASK) atomKK->k_uChem.sync<LMPHostType>();
+    if (mask & UCG_MASK) atomKK->k_uCG.sync<LMPHostType>();
+    if (mask & UCGNEW_MASK) atomKK->k_uCGnew.sync<LMPHostType>();
+    if (mask & DUCHEM_MASK) atomKK->k_duChem.sync<LMPHostType>();
+    if (mask & DVECTOR_MASK) atomKK->k_dvector.sync<LMPHostType>();
+    return;
+  }
+
+  // device side
+  if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
+  if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
+  if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
+  if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
+  if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
+  if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
+  if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
+  if (mask & DPDRHO_MASK) atomKK->k_rho.sync<LMPDeviceType>();
+  if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.sync<LMPDeviceType>();
+  if (mask & UCOND_MASK) atomKK->k_uCond.sync<LMPDeviceType>();
+  if (mask & UMECH_MASK) atomKK->k_uMech.sync<LMPDeviceType>();
+  if (mask & UCHEM_MASK) atomKK->k_uChem.sync<LMPDeviceType>();
+  if (mask & UCG_MASK) atomKK->k_uCG.sync<LMPDeviceType>();
+  if (mask & UCGNEW_MASK) atomKK->k_uCGnew.sync<LMPDeviceType>();
+  if (mask & DUCHEM_MASK) atomKK->k_duChem.sync<LMPDeviceType>();
+  if (mask & DVECTOR_MASK) atomKK->k_dvector.sync<LMPDeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   for every field selected in mask whose dual view reports need_sync<>()
+   for the target space, hand the view to perform_async_copy()
+   space == Device targets LMPDeviceType, any other value LMPHostType
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
+{
+  if (space != Device) {
+    // host side
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & DPDRHO_MASK) && atomKK->k_rho.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_rho,space);
+    if ((mask & DPDTHETA_MASK) && atomKK->k_dpdTheta.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_dpdTheta,space);
+    if ((mask & UCOND_MASK) && atomKK->k_uCond.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCond,space);
+    if ((mask & UMECH_MASK) && atomKK->k_uMech.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uMech,space);
+    if ((mask & UCHEM_MASK) && atomKK->k_uChem.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uChem,space);
+    if ((mask & UCG_MASK) && atomKK->k_uCG.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCG,space);
+    if ((mask & UCGNEW_MASK) && atomKK->k_uCGnew.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCGnew,space);
+    if ((mask & DUCHEM_MASK) && atomKK->k_duChem.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_duChem,space);
+    if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
+    return;
+  }
+
+  // device side
+  if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+  if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+  if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+  if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+  if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+  if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+  if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+  if ((mask & DPDRHO_MASK) && atomKK->k_rho.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_rho,space);
+  if ((mask & DPDTHETA_MASK) && atomKK->k_dpdTheta.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_dpdTheta,space);
+  if ((mask & UCOND_MASK) && atomKK->k_uCond.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCond,space);
+  if ((mask & UMECH_MASK) && atomKK->k_uMech.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uMech,space);
+  if ((mask & UCHEM_MASK) && atomKK->k_uChem.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uChem,space);
+  if ((mask & UCG_MASK) && atomKK->k_uCG.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCG,space);
+  if ((mask & UCGNEW_MASK) && atomKK->k_uCGnew.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_uCGnew,space);
+  if ((mask & DUCHEM_MASK) && atomKK->k_duChem.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_efloat_1d>(atomKK->k_duChem,space);
+  if ((mask & DVECTOR_MASK) && atomKK->k_dvector.need_sync<LMPDeviceType>())
+    perform_async_copy<DAT::tdual_float_2d>(atomKK->k_dvector,space);
+}
+
+/* ----------------------------------------------------------------------
+   call modify<>() on the dual view of every field selected in mask
+   space == Device targets LMPDeviceType, any other value LMPHostType
+------------------------------------------------------------------------- */
+
+void AtomVecDPDKokkos::modified(ExecutionSpace space, unsigned int mask)
+{
+  if (space != Device) {
+    // host side
+    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
+    if (mask & DPDRHO_MASK) atomKK->k_rho.modify<LMPHostType>();
+    if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.modify<LMPHostType>();
+    if (mask & UCOND_MASK) atomKK->k_uCond.modify<LMPHostType>();
+    if (mask & UMECH_MASK) atomKK->k_uMech.modify<LMPHostType>();
+    if (mask & UCHEM_MASK) atomKK->k_uChem.modify<LMPHostType>();
+    if (mask & UCG_MASK) atomKK->k_uCG.modify<LMPHostType>();
+    if (mask & UCGNEW_MASK) atomKK->k_uCGnew.modify<LMPHostType>();
+    if (mask & DUCHEM_MASK) atomKK->k_duChem.modify<LMPHostType>();
+    if (mask & DVECTOR_MASK) atomKK->k_dvector.modify<LMPHostType>();
+    return;
+  }
+
+  // device side
+  if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();
+  if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
+  if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
+  if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
+  if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
+  if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
+  if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
+  if (mask & DPDRHO_MASK) atomKK->k_rho.modify<LMPDeviceType>();
+  if (mask & DPDTHETA_MASK) atomKK->k_dpdTheta.modify<LMPDeviceType>();
+  if (mask & UCOND_MASK) atomKK->k_uCond.modify<LMPDeviceType>();
+  if (mask & UMECH_MASK) atomKK->k_uMech.modify<LMPDeviceType>();
+  if (mask & UCHEM_MASK) atomKK->k_uChem.modify<LMPDeviceType>();
+  if (mask & UCG_MASK) atomKK->k_uCG.modify<LMPDeviceType>();
+  if (mask & UCGNEW_MASK) atomKK->k_uCGnew.modify<LMPDeviceType>();
+  if (mask & DUCHEM_MASK) atomKK->k_duChem.modify<LMPDeviceType>();
+  if (mask & DVECTOR_MASK) atomKK->k_dvector.modify<LMPDeviceType>();
+}
+
diff --git a/src/KOKKOS/atom_vec_dpd_kokkos.h b/src/KOKKOS/atom_vec_dpd_kokkos.h
new file mode 100644
index 0000000000..372404cc7d
--- /dev/null
+++ b/src/KOKKOS/atom_vec_dpd_kokkos.h
@@ -0,0 +1,137 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+// three atom style names, all bound to the same AtomVecDPDKokkos class
+// NOTE(review): the AtomStyle macro is defined elsewhere; presumably the
+// /device and /host variants only select the name used in input scripts
+AtomStyle(dpd/kk,AtomVecDPDKokkos)
+AtomStyle(dpd/kk/device,AtomVecDPDKokkos)
+AtomStyle(dpd/kk/host,AtomVecDPDKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_DPD_KOKKOS_H
+#define LMP_ATOM_VEC_DPD_KOKKOS_H
+
+#include "atom_vec_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// Atom style class for DPD particles built on AtomVecKokkos.  Besides
+// the usual tag/type/mask/image/x/v/f arrays it carries the per-atom
+// quantities rho, dpdTheta, uCond, uMech, uChem, uCG, uCGnew and duChem.
+class AtomVecDPDKokkos : public AtomVecKokkos {
+ public:
+  AtomVecDPDKokkos(class LAMMPS *);
+  virtual ~AtomVecDPDKokkos() {}
+
+  // per-atom storage management
+  void grow(int);
+  void copy(int, int, int);
+
+  // forward/reverse communication through plain double buffers
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  int pack_comm_hybrid(int, int *, double *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int unpack_comm_hybrid(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+
+  // border (ghost atom) communication through plain double buffers
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  int pack_border_hybrid(int, int *, double *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int unpack_border_hybrid(int, int, double *);
+
+  // atom exchange between procs, one atom per call
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+
+  // restart file support
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+
+  // atom creation and data file reading/writing
+  void create_atom(int, double *);
+  void data_atom(double *, tagint, char **);
+  int data_atom_hybrid(int, char **);
+  void pack_data(double **);
+  int pack_data_hybrid(int, double *);
+  void write_data(FILE *, int, double **);
+  int write_data_hybrid(FILE *, double *);
+  bigint memory_usage();
+
+  // variants operating on Kokkos dual-view buffers
+  void grow_reset();
+  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                       const int & iswap,
+                       const DAT::tdual_xfloat_2d &buf,
+                       const int &pbc_flag, const int pbc[]);
+  void unpack_comm_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf);
+  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const int nfirst,
+                     const int &pbc_flag, const int pbc[]);
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+
+  // per-field sync/modify bookkeeping on the dual views, selected by mask
+  void sync(ExecutionSpace space, unsigned int mask);
+  void modified(ExecutionSpace space, unsigned int mask);
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
+
+  // raw pointers to the DPD per-atom arrays (public)
+  double *uCond,*uMech,*uChem,*uCG,*uCGnew,*rho,*dpdTheta;
+  double *duChem;
+
+ protected:
+  // d_* / h_* views of the DPD per-atom arrays; the .cpp reads and writes
+  // the h_* views from host code
+  // NOTE(review): DAT/HAT are presumably the device/host view type sets
+  // from kokkos_type.h -- confirm
+  DAT::t_efloat_1d d_uCond, d_uMech, d_uChem, d_uCG, d_uCGnew,d_rho,d_dpdTheta,d_duChem;
+  HAT::t_efloat_1d h_uCond, h_uMech, h_uChem, h_uCG, h_uCGnew,h_rho,h_dpdTheta,h_duChem;
+
+  // raw pointers to the standard per-atom arrays
+  tagint *tag;
+  imageint *image;
+  int *type,*mask;
+  double **x,**v,**f;
+
+  // d_* / h_* views of the standard per-atom arrays
+  DAT::t_tagint_1d d_tag;
+  HAT::t_tagint_1d h_tag;
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image;
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+  HAT::t_x_array h_x;
+  HAT::t_v_array h_v;
+  HAT::t_f_array h_f;
+
+  // one-element counter; unpack_exchange_kokkos() seeds it with nlocal
+  // and returns its value after unpacking
+  DAT::tdual_int_1d k_count;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+*/
diff --git a/src/KOKKOS/atom_vec_full_kokkos.cpp b/src/KOKKOS/atom_vec_full_kokkos.cpp
index fa4cf18ae3..fd7eaf7c81 100644
--- a/src/KOKKOS/atom_vec_full_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_full_kokkos.cpp
@@ -396,7 +396,6 @@ int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -424,7 +423,6 @@ int AtomVecFullKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -515,7 +513,6 @@ int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -544,7 +541,6 @@ int AtomVecFullKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &lis
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -582,13 +578,11 @@ void AtomVecFullKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecFullKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecFullKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -838,13 +832,11 @@ int AtomVecFullKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecFullKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -854,13 +846,11 @@ int AtomVecFullKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_q,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecFullKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_q,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -1071,12 +1061,10 @@ void AtomVecFullKokkos::unpack_border_kokkos(const int &n, const int &first,
     struct AtomVecFullKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_q,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecFullKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_q,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1422,13 +1410,11 @@ int AtomVecFullKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2
     AtomVecFullKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecFullKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1643,7 +1629,6 @@ int AtomVecFullKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecFullKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1652,7 +1637,6 @@ int AtomVecFullKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nr
     AtomVecFullKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
new file mode 100644
index 0000000000..e5e361e70a
--- /dev/null
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp
@@ -0,0 +1,1218 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include "atom_vec_hybrid_kokkos.h"
+#include "atom_kokkos.h"
+#include "domain.h"
+#include "modify.h"
+#include "fix.h"
+#include "memory.h"
+#include "error.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+
+// start with an empty sub-style list so the destructor is safe before process_args()
+AtomVecHybridKokkos::AtomVecHybridKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
+{ nstyles = 0; styles = NULL; keywords = NULL; }
+
+/* free each sub-style and its keyword copy, then the two pointer lists */
+AtomVecHybridKokkos::~AtomVecHybridKokkos()
+{
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    delete styles[istyle];
+    delete [] keywords[istyle];
+  }
+  delete [] styles; delete [] keywords;
+}
+
+/* ----------------------------------------------------------------------
+   process sub-style args: create the sub-styles, then merge their settings
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::process_args(int narg, char **arg)
+{
+  // build list of all known atom styles
+
+  build_styles();
+
+  // allocate list of sub-styles as big as possibly needed if no extra args
+
+  styles = new AtomVec*[narg];
+  keywords = new char*[narg];
+
+  // allocate each sub-style
+  // call process_args() with set of args that are not atom style names
+  // use known_style() to determine which args these are
+
+  int i,jarg,dummy;
+
+  int iarg = 0;
+  nstyles = 0;
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"hybrid") == 0)
+      error->all(FLERR,"Atom style hybrid cannot have hybrid as an argument");
+    for (i = 0; i < nstyles; i++)
+      if (strcmp(arg[iarg],keywords[i]) == 0)
+        error->all(FLERR,"Atom style hybrid cannot use same atom style twice");
+    styles[nstyles] = atom->new_avec(arg[iarg],1,dummy);
+    keywords[nstyles] = new char[strlen(arg[iarg])+1]; // hybrid keeps its own copy of the name
+    strcpy(keywords[nstyles],arg[iarg]);
+    jarg = iarg + 1;
+    while (jarg < narg && !known_style(arg[jarg])) jarg++; // stops at next style name or at narg
+    styles[nstyles]->process_args(jarg-iarg-1,&arg[iarg+1]); // args between the two style names
+    iarg = jarg;
+    nstyles++;
+  }
+
+  // free allstyles created by build_styles()
+
+  for (int i = 0; i < nallstyles; i++) delete [] allstyles[i];
+  delete [] allstyles;
+
+  // hybrid settings are MAX or MIN of sub-style settings
+  // hybrid sizes are minimal values plus extra values for each sub-style
+
+  molecular = 0;
+  comm_x_only = comm_f_only = 1;
+
+  size_forward = 3;
+  size_reverse = 3;
+  size_border = 6;
+  size_data_atom = 5;
+  size_data_vel = 4;
+  xcol_data = 3;
+
+  for (int k = 0; k < nstyles; k++) {
+    if ((styles[k]->molecular == 1 && molecular == 2) ||
+        (styles[k]->molecular == 2 && molecular == 1))
+      error->all(FLERR,"Cannot mix molecular and molecule template "
+                 "atom styles");
+    molecular = MAX(molecular,styles[k]->molecular);
+
+    bonds_allow = MAX(bonds_allow,styles[k]->bonds_allow);
+    angles_allow = MAX(angles_allow,styles[k]->angles_allow);
+    dihedrals_allow = MAX(dihedrals_allow,styles[k]->dihedrals_allow);
+    impropers_allow = MAX(impropers_allow,styles[k]->impropers_allow);
+    mass_type = MAX(mass_type,styles[k]->mass_type);
+    dipole_type = MAX(dipole_type,styles[k]->dipole_type);
+    forceclearflag = MAX(forceclearflag,styles[k]->forceclearflag);
+
+    if (styles[k]->molecular == 2) onemols = styles[k]->onemols; // taken from the sub-style with molecular == 2
+
+    comm_x_only = MIN(comm_x_only,styles[k]->comm_x_only);
+    comm_f_only = MIN(comm_f_only,styles[k]->comm_f_only);
+    size_forward += styles[k]->size_forward - 3; // add only what exceeds the base value set above
+    size_reverse += styles[k]->size_reverse - 3;
+    size_border += styles[k]->size_border - 6;
+    size_data_atom += styles[k]->size_data_atom - 5;
+    size_data_vel += styles[k]->size_data_vel - 4;
+  }
+
+  size_velocity = 3;
+  if (atom->omega_flag) size_velocity += 3;
+  if (atom->angmom_flag) size_velocity += 3;
+}
+
+/* run AtomVec::init(), then init() of every sub-style in list order */
+
+void AtomVecHybridKokkos::init()
+{
+  AtomVec::init();
+  for (int istyle = 0; istyle < nstyles; ++istyle) styles[istyle]->init();
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays
+   n = 0 grows arrays by a chunk (via grow_nmax())
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::grow(int n)
+{
+  if (n != 0) nmax = n;
+  else grow_nmax();
+  atom->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)
+    error->one(FLERR,"Per-processor system is too big");
+
+  // every reallocation is done by the sub-styles
+  // nextra_grow is zeroed meanwhile so fix arrays are grown only once, below
+
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) styles[istyle]->grow(nmax);
+  atom->nextra_grow = nextra_saved;
+
+  // refresh hybrid and sub-style array pointers, since several
+  // sub-styles may have reallocated the same array
+
+  grow_reset();
+
+  // now each fix listed in atom->extra_grow grows its own arrays
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs
+   for each per-atom array, re-read the raw pointer and the DualView
+   device (d_) and host (h_) views from atomKK, then reset the sub-styles
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::grow_reset()
+{
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+
+  omega = atomKK->omega;
+  d_omega = atomKK->k_omega.d_view;
+  h_omega = atomKK->k_omega.h_view;
+
+  angmom = atomKK->angmom;
+  d_angmom = atomKK->k_angmom.d_view;
+  h_angmom = atomKK->k_angmom.h_view;
+
+  // let every sub-style run its own grow_reset() as well
+
+  for (int k = 0; k < nstyles; k++) styles[k]->grow_reset();
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J for all sub-styles
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::copy(int i, int j, int delflag)
+{
+  // nextra_grow is zeroed while sub-styles copy, then restored for the loop below
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) styles[istyle]->copy(i,j,delflag);
+  atom->nextra_grow = nextra_saved;
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
+}
+
+/* forward clear_bonus() to every sub-style in list order */
+
+void AtomVecHybridKokkos::clear_bonus()
+{
+  for (int istyle = 0; istyle < nstyles; ++istyle) styles[istyle]->clear_bonus();
+}
+
+/* forward force_clear() to the sub-styles whose forceclearflag is set */
+
+void AtomVecHybridKokkos::force_clear(int n, size_t nbytes)
+{
+  for (int istyle = 0; istyle < nstyles; ++istyle)
+    if (styles[istyle]->forceclearflag) styles[istyle]->force_clear(n,nbytes);
+}
+
+/* ----------------------------------------------------------------------
+   Kokkos (threaded) comm is not implemented for hybrid: always errors out
+------------------------------------------------------------------------- */
+int AtomVecHybridKokkos::pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+    const int & iswap, const DAT::tdual_xfloat_2d &buf, const int &pbc_flag, const int pbc[])
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0; // not reached if error->all() aborts; gives every path of this int function a value
+}
+// stub: hybrid has no Kokkos comm unpack yet, so this only raises an error
+void AtomVecHybridKokkos::unpack_comm_kokkos(const int &n, const int &nfirst, const DAT::tdual_xfloat_2d &buf)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+}
+int AtomVecHybridKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+    const int & iswap, const int nfirst, const int &pbc_flag, const int pbc[])
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0; // stub: not reached if error->all() aborts; gives every path of this int function a value
+}
+int AtomVecHybridKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+    DAT::tdual_xfloat_2d buf, int iswap, int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0; // stub: not reached if error->all() aborts; gives every path of this int function a value
+}
+// stub: hybrid has no Kokkos border unpack yet, so this only raises an error
+void AtomVecHybridKokkos::unpack_border_kokkos(const int &n, const int &nfirst,
+    const DAT::tdual_xfloat_2d &buf, ExecutionSpace space)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+}
+// stub: hybrid has no Kokkos exchange packing yet, so this only raises an error
+int AtomVecHybridKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+    DAT::tdual_int_1d k_sendlist, DAT::tdual_int_1d k_copylist,
+    ExecutionSpace space, int dim, X_FLOAT lo, X_FLOAT hi)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0; // not reached if error->all() aborts; gives every path of this int function a value
+}
+int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+    int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space)
+{
+  error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm");
+  return 0; // stub: not reached if error->all() aborts; gives every path of this int function a value
+}
+
+/* ----------------------------------------------------------------------
+   pack coords (PBC-shifted if pbc_flag) then sub-style data; return count
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_comm(int n, int *list, double *buf,
+                             int pbc_flag, int *pbc)
+{
+  sync(Host,X_MASK);
+
+  int m = 0;
+  if (pbc_flag) {
+    double xshift,yshift,zshift;
+    if (domain->triclinic) {
+      xshift = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      yshift = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      zshift = pbc[2]*domain->zprd;
+    } else {
+      xshift = pbc[0]*domain->xprd;
+      yshift = pbc[1]*domain->yprd;
+      zshift = pbc[2]*domain->zprd;
+    }
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0) + xshift;
+      buf[m++] = h_x(iatom,1) + yshift;
+      buf[m++] = h_x(iatom,2) + zshift;
+    }
+  } else {
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0);
+      buf[m++] = h_x(iatom,1);
+      buf[m++] = h_x(iatom,2);
+    }
+  }
+
+  // each sub-style appends its own data as one contiguous chunk
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_comm_hybrid(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   pack x, v and optional omega/angmom of listed atoms, then sub-style data
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_comm_vel(int n, int *list, double *buf,
+                                 int pbc_flag, int *pbc)
+{
+  sync(Host,X_MASK|V_MASK|OMEGA_MASK|MASK_MASK/*|ANGMOM_MASK*/); // h_mask is read below
+  int i,j,k,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  int omega_flag = atom->omega_flag;
+  int angmom_flag = atom->angmom_flag;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      if (omega_flag) {
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+      if (angmom_flag) {
+        buf[m++] = h_angmom(j,0);
+        buf[m++] = h_angmom(j,1);
+        buf[m++] = h_angmom(j,2);
+      }
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
+      dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
+      dz = pbc[2]*domain->zprd;
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; // velocity offset built from h_rate and pbc
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        if (h_mask[j] & deform_groupbit) { // test the packed atom j, not list position i
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    }
+  }
+
+  // pack sub-style contributions as contiguous chunks
+  for (k = 0; k < nstyles; k++)
+    m += styles[k]->pack_comm_hybrid(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack coords for n atoms starting at index first, then sub-style data
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_comm(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+  for (int iatom = first; iatom < last; iatom++) {
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+  }
+
+  modified(Host,X_MASK);
+
+  // the rest of buf is handed to the sub-styles, one chunk after another
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_comm_hybrid(n,first,&buf[m]);
+}
+
+/* ----------------------------------------------------------------------
+   unpack x, v and optional omega/angmom for n atoms, then sub-style data
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_comm_vel(int n, int first, double *buf)
+{
+  const int omega_flag = atom->omega_flag;
+  const int angmom_flag = atom->angmom_flag;
+  const int last = first + n;
+  int m = 0;
+  for (int iatom = first; iatom < last; iatom++) {
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+    h_v(iatom,0) = buf[m++];
+    h_v(iatom,1) = buf[m++];
+    h_v(iatom,2) = buf[m++];
+    if (omega_flag) {
+      h_omega(iatom,0) = buf[m++];
+      h_omega(iatom,1) = buf[m++];
+      h_omega(iatom,2) = buf[m++];
+    }
+    if (angmom_flag) {
+      h_angmom(iatom,0) = buf[m++];
+      h_angmom(iatom,1) = buf[m++];
+      h_angmom(iatom,2) = buf[m++];
+    }
+  }
+
+  modified(Host,X_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
+  // the rest of buf is handed to the sub-styles, one chunk after another
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_comm_hybrid(n,first,&buf[m]);
+}
+
+/* ----------------------------------------------------------------------
+   pack forces of n atoms starting at first, then sub-style data; return count
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_reverse(int n, int first, double *buf)
+{
+  sync(Host,F_MASK);
+
+  int m = 0;
+  const int last = first + n;
+  for (int iatom = first; iatom < last; iatom++) {
+    buf[m++] = h_f(iatom,0);
+    buf[m++] = h_f(iatom,1);
+    buf[m++] = h_f(iatom,2);
+  }
+
+  // each sub-style appends its own data as one contiguous chunk
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_reverse_hybrid(n,first,&buf[m]);
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   add received forces onto the n atoms in list, then unpack sub-style data
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_reverse(int n, int *list, double *buf)
+{
+  int m = 0;
+  for (int ilist = 0; ilist < n; ilist++) {
+    const int iatom = list[ilist];
+    h_f(iatom,0) += buf[m++];
+    h_f(iatom,1) += buf[m++];
+    h_f(iatom,2) += buf[m++];
+  }
+
+  modified(Host,F_MASK);
+
+  // the rest of buf is handed to the sub-styles, one chunk after another
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_reverse_hybrid(n,list,&buf[m]);
+}
+
+/* ----------------------------------------------------------------------
+   pack x, tag, type, mask of listed atoms, then sub-style and fix data
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+
+  int m = 0;
+  if (pbc_flag) {
+    double xshift,yshift,zshift;
+    if (domain->triclinic) {
+      xshift = pbc[0];
+      yshift = pbc[1];
+      zshift = pbc[2];
+    } else {
+      xshift = pbc[0]*domain->xprd;
+      yshift = pbc[1]*domain->yprd;
+      zshift = pbc[2]*domain->zprd;
+    }
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0) + xshift;
+      buf[m++] = h_x(iatom,1) + yshift;
+      buf[m++] = h_x(iatom,2) + zshift;
+      buf[m++] = ubuf(h_tag[iatom]).d;
+      buf[m++] = ubuf(h_type[iatom]).d;
+      buf[m++] = ubuf(h_mask[iatom]).d;
+    }
+  } else {
+    for (int ilist = 0; ilist < n; ilist++) {
+      const int iatom = list[ilist];
+      buf[m++] = h_x(iatom,0);
+      buf[m++] = h_x(iatom,1);
+      buf[m++] = h_x(iatom,2);
+      buf[m++] = ubuf(h_tag[iatom]).d;
+      buf[m++] = ubuf(h_type[iatom]).d;
+      buf[m++] = ubuf(h_mask[iatom]).d;
+    }
+  }
+
+  // each sub-style appends its own data as one contiguous chunk
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_border_hybrid(n,list,&buf[m]);
+
+  // then every fix listed in atom->extra_border appends its data
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   pack x, tag, type, mask, v, optional omega/angmom, sub-style + fix data
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+  int i,j,k,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+  int omega_flag = atom->omega_flag;
+  int angmom_flag = atom->angmom_flag;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag[j]).d;
+      buf[m++] = ubuf(h_type[j]).d;
+      buf[m++] = ubuf(h_mask[j]).d;
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+      if (omega_flag) {
+        buf[m++] = h_omega(j,0);
+        buf[m++] = h_omega(j,1);
+        buf[m++] = h_omega(j,2);
+      }
+      if (angmom_flag) {
+        buf[m++] = h_angmom(j,0);
+        buf[m++] = h_angmom(j,1);
+        buf[m++] = h_angmom(j,2);
+      }
+    }
+  } else {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (!deform_vremap) {
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag[j]).d;
+        buf[m++] = ubuf(h_type[j]).d;
+        buf[m++] = ubuf(h_mask[j]).d;
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    } else {
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4]; // velocity offset built from h_rate and pbc
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag[j]).d;
+        buf[m++] = ubuf(h_type[j]).d;
+        buf[m++] = ubuf(h_mask[j]).d;
+        if (h_mask[j] & deform_groupbit) { // test the packed atom j, not list position i
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+        if (omega_flag) {
+          buf[m++] = h_omega(j,0);
+          buf[m++] = h_omega(j,1);
+          buf[m++] = h_omega(j,2);
+        }
+        if (angmom_flag) {
+          buf[m++] = h_angmom(j,0);
+          buf[m++] = h_angmom(j,1);
+          buf[m++] = h_angmom(j,2);
+        }
+      }
+    }
+  }
+
+  // pack sub-style contributions as contiguous chunks
+  for (k = 0; k < nstyles; k++)
+    m += styles[k]->pack_border_hybrid(n,list,&buf[m]);
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack x, tag, type, mask for n atoms, then sub-style and fix data
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_border(int n, int first, double *buf)
+{
+  int m = 0;
+  const int last = first + n;
+
+  for (int iatom = first; iatom < last; iatom++) {
+    if (iatom == nmax) grow(0);
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+    h_tag[iatom] = (tagint) ubuf(buf[m++]).i;
+    h_type[iatom] = (int) ubuf(buf[m++]).i;
+    h_mask[iatom] = (int) ubuf(buf[m++]).i;
+  }
+
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK);
+
+  // the rest of buf is handed to the sub-styles, then to the extra_border fixes
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_border_hybrid(n,first,&buf[m]);
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[m]);
+}
+
+/* ----------------------------------------------------------------------
+   unpack x, tag, type, mask, v, optional omega/angmom, sub-style + fix data
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  const int omega_flag = atom->omega_flag;
+  const int angmom_flag = atom->angmom_flag;
+  const int last = first + n;
+  int m = 0;
+
+  for (int iatom = first; iatom < last; iatom++) {
+    if (iatom == nmax) grow(0);
+    h_x(iatom,0) = buf[m++];
+    h_x(iatom,1) = buf[m++];
+    h_x(iatom,2) = buf[m++];
+    h_tag[iatom] = (tagint) ubuf(buf[m++]).i;
+    h_type[iatom] = (int) ubuf(buf[m++]).i;
+    h_mask[iatom] = (int) ubuf(buf[m++]).i;
+    h_v(iatom,0) = buf[m++];
+    h_v(iatom,1) = buf[m++];
+    h_v(iatom,2) = buf[m++];
+    if (omega_flag) {
+      h_omega(iatom,0) = buf[m++];
+      h_omega(iatom,1) = buf[m++];
+      h_omega(iatom,2) = buf[m++];
+    }
+    if (angmom_flag) {
+      h_angmom(iatom,0) = buf[m++];
+      h_angmom(iatom,1) = buf[m++];
+      h_angmom(iatom,2) = buf[m++];
+    }
+  }
+
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
+  // the rest of buf is handed to the sub-styles, then to the extra_border fixes
+
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->unpack_border_hybrid(n,first,&buf[m]);
+
+  for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+    m += modify->fix[atom->extra_border[iextra]]->
+      unpack_border(n,first,&buf[m]);
+}
+
+/* ----------------------------------------------------------------------
+   pack data for atom I for sending to another proc
+   sub-styles pack one after the other, then the fixes in atom->extra_grow
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_exchange(int i, double *buf)
+{
+  // nextra_grow is held at 0 while the sub-styles pack; fixes are packed below
+
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_exchange(i,&buf[m]);
+
+  atom->nextra_grow = nextra_saved;
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
+
+  // first buf slot ends up holding the total number of values packed
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for single atom received from another proc
+   unpack each sub-style one after the other
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::unpack_exchange(double *buf)
+{
+  const int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  const int nextra_saved = atom->nextra_grow;
+  atom->nextra_grow = 0;
+
+  // nlocal is wound back after every sub-style call and advanced once at
+  // the end; presumably each sub-style unpack_exchange() increments it
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    m += styles[istyle]->unpack_exchange(&buf[m]);
+    atom->nlocal--;
+  }
+
+  atom->nextra_grow = nextra_saved;
+
+  for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+    m += modify->fix[atom->extra_grow[iextra]]->
+      unpack_exchange(nlocal,&buf[m]);
+
+  // single increment for the atom just unpacked
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   size of restart data for all atoms owned by this proc
+   include extra data stored by fixes
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::size_restart()
+{
+  // nextra_restart is held at 0 while the sub-style sizes are summed
+  const int nextra_saved = atom->nextra_restart;
+  atom->nextra_restart = 0;
+
+  int nvalues = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    nvalues += styles[istyle]->size_restart();
+
+  atom->nextra_restart = nextra_saved;
+
+  const int nlocal = atom->nlocal;
+  for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+    for (int iatom = 0; iatom < nlocal; iatom++)
+      nvalues += modify->fix[atom->extra_restart[iextra]]->size_restart(iatom);
+
+  return nvalues;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom I's data for restart file including extra quantities
+   xyz must be 1st 3 values, so that read_restart can test on them
+   pack each sub-style one after the other
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::pack_restart(int i, double *buf)
+{
+  const int nextra_saved = atom->nextra_restart;
+  atom->nextra_restart = 0;
+
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    m += styles[istyle]->pack_restart(i,&buf[m]);
+
+  atom->nextra_restart = nextra_saved;
+
+  for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+    m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
+
+  // first buf slot ends up holding the total number of values packed
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for one atom from restart file including extra quantities
+   unpack each sub-style one after the other
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::unpack_restart(double *buf)
+{
+  const int nlocal = atom->nlocal;
+  if (nlocal == nmax) {
+    grow(0);
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+
+  const int nstore_saved = atom->nextra_store;
+  atom->nextra_store = 0;
+
+  int m = 0;
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    m += styles[istyle]->unpack_restart(&buf[m]);
+    atom->nlocal--;
+  }
+  atom->nextra_store = nstore_saved;
+
+  // values beyond the sub-style data, up to the count in buf[0], go to atom->extra
+  if (atom->nextra_store) {
+    const int nleft = static_cast<int> (buf[0]) - m;
+    for (int iextra = 0; iextra < nleft; iextra++) atom->extra[nlocal][iextra] = buf[m++];
+  }
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   create each sub-style one after the other
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::create_atom(int itype, double *coord)
+{
+  if (atom->nlocal == nmax) grow(0);
+
+  // nlocal is wound back after every sub-style call and advanced once at the end
+  for (int istyle = 0; istyle < nstyles; istyle++) {
+    styles[istyle]->create_atom(itype,coord);
+    atom->nlocal--;
+  }
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   grow() occurs here so arrays for all sub-styles are grown
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::data_atom(double *coord, imageint imagetmp, char **values)
+{
+  sync(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
+  int nlocal = atom->nlocal;
+  if (nlocal == nmax) grow(0);
+
+  h_tag[nlocal] = ATOTAGINT(values[0]);
+  h_type[nlocal] = atoi(values[1]);
+  if (h_type[nlocal] <= 0 || h_type[nlocal] > atom->ntypes) // valid types are 1..ntypes
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+
+  h_x(nlocal,0) = coord[0];
+  h_x(nlocal,1) = coord[1];
+  h_x(nlocal,2) = coord[2];
+
+  h_image[nlocal] = imagetmp;
+  h_mask[nlocal] = 1;
+
+  h_v(nlocal,0) = 0.0; // velocities (and omega/angmom if enabled) start at zero
+  h_v(nlocal,1) = 0.0;
+  h_v(nlocal,2) = 0.0;
+  if (atom->omega_flag) {
+    h_omega(nlocal,0) = 0.0;
+    h_omega(nlocal,1) = 0.0;
+    h_omega(nlocal,2) = 0.0;
+  }
+  if (atom->angmom_flag) {
+    h_angmom(nlocal,0) = 0.0;
+    h_angmom(nlocal,1) = 0.0;
+    h_angmom(nlocal,2) = 0.0;
+  }
+
+  modified(Host,X_MASK|TAG_MASK|TYPE_MASK|IMAGE_MASK|MASK_MASK|V_MASK|OMEGA_MASK/*|ANGMOM_MASK*/);
+
+  // each sub-style parses sub-style specific values
+
+  int m = 5; // sub-style values start at values[5]
+  for (int k = 0; k < nstyles; k++)
+    m += styles[k]->data_atom_hybrid(nlocal,&values[m]);
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Velocities section of data file into atom m
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::data_vel(int m, char **values)
+{
+  sync(Host,V_MASK);
+
+  // first three values are the velocity components
+  for (int idim = 0; idim < 3; idim++)
+    h_v(m,idim) = atof(values[idim]);
+
+  modified(Host,V_MASK);
+
+  // remaining values are parsed by the sub-styles, one after the other
+
+  int ncol = 3;
+  for (int istyle = 0; istyle < nstyles; istyle++)
+    ncol += styles[istyle]->data_vel_hybrid(m,&values[ncol]);
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::pack_data(double **buf)
+{
+  sync(Host,TAG_MASK|TYPE_MASK|X_MASK|IMAGE_MASK); // h_image is read below as well
+
+  int k,m;
+
+  int nlocal = atom->nlocal;
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = ubuf(h_tag[i]).d;
+    buf[i][1] = ubuf(h_type[i]).d;
+    buf[i][2] = h_x(i,0);
+    buf[i][3] = h_x(i,1);
+    buf[i][4] = h_x(i,2);
+
+    m = 5; // sub-style columns follow the 5 values above
+    for (k = 0; k < nstyles; k++)
+      m += styles[k]->pack_data_hybrid(i,&buf[i][m]);
+
+    buf[i][m] = ubuf((h_image[i] & IMGMASK) - IMGMAX).d; // three fields of h_image, each offset by IMGMAX
+    buf[i][m+1] = ubuf((h_image[i] >> IMGBITS & IMGMASK) - IMGMAX).d;
+    buf[i][m+2] = ubuf((h_image[i] >> IMG2BITS) - IMGMAX).d;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   write atom info to data file including 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int irow = 0; irow < n; irow++) {
+    double *row = buf[irow];
+    fprintf(fp,TAGINT_FORMAT " %d %-1.16e %-1.16e %-1.16e",
+            (tagint) ubuf(row[0]).i,(int) ubuf(row[1]).i,
+            row[2],row[3],row[4]);
+
+    int m = 5;
+    for (int istyle = 0; istyle < nstyles; istyle++)
+      m += styles[istyle]->write_data_hybrid(fp,&row[m]);
+
+    // the three image flags follow the sub-style columns and end the line
+    fprintf(fp," %d %d %d\n",
+            (int) ubuf(row[m]).i,(int) ubuf(row[m+1]).i,
+            (int) ubuf(row[m+2]).i);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack velocity info for data file
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::pack_vel(double **buf)
+{
+  // h_tag is read below as well, so it is synced together with v
+  sync(Host,TAG_MASK|V_MASK);
+  int k,m;
+
+  int nlocal = atom->nlocal;
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = ubuf(h_tag[i]).d;
+    buf[i][1] = h_v(i,0);
+    buf[i][2] = h_v(i,1);
+    buf[i][3] = h_v(i,2);
+
+    m = 4;   // sub-style values follow tag, vx, vy, vz
+    for (k = 0; k < nstyles; k++)
+      m += styles[k]->pack_vel_hybrid(i,&buf[i][m]);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   write velocity info to data file
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::write_vel(FILE *fp, int n, double **buf)
+{
+  for (int i = 0; i < n; i++) {
+    double *row = buf[i];
+    // leading columns common to all styles: tag, vx, vy, vz
+    fprintf(fp,TAGINT_FORMAT " %g %g %g",
+            (tagint) ubuf(row[0]).i,row[1],row[2],row[3]);
+
+    int col = 4;
+    for (int istyle = 0; istyle < nstyles; istyle++)
+      col += styles[istyle]->write_vel_hybrid(fp,&row[col]);
+
+    fprintf(fp,"\n");
+  }
+}
+
+/* ----------------------------------------------------------------------
+   assign an index to named atom property and return index
+   returned value encodes which sub-style and index returned by sub-style
+   return -1 if name is unknown to any sub-styles
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::property_atom(char *name)
+{
+  for (int istyle = 0; istyle < nstyles; istyle++) {   // first sub-style that knows the name wins
+    const int sub = styles[istyle]->property_atom(name);
+    if (sub >= 0) return nstyles*sub + istyle;   // split apart again in pack_property_atom()
+  }
+  return -1;
+}
+
+/* ----------------------------------------------------------------------
+   pack per-atom data into buf for ComputePropertyAtom
+   index maps to data specific to this atom style
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::pack_property_atom(int multiindex, double *buf,
+                                       int nvalues, int groupbit)
+{
+  // invert the encoding used by property_atom(): multiindex = index*nstyles + k
+  AtomVec *sub = styles[multiindex % nstyles];
+  sub->pack_property_atom(multiindex/nstyles,buf,nvalues,groupbit);
+}
+
+/* ----------------------------------------------------------------------
+   allstyles = list of all atom styles in this LAMMPS executable
+------------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::build_styles()
+{
+  nallstyles = 0;   // pass 1: each AtomStyle() entry pulled in from style_atom.h bumps the count
+#define ATOM_CLASS
+#define AtomStyle(key,Class) nallstyles++;
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+
+  allstyles = new char*[nallstyles];   // one slot per entry counted above
+
+  int n;            // scratch string length used inside the AtomStyle() expansion below
+  nallstyles = 0;   // pass 2: reused as the write index; ends at the same count as pass 1
+#define ATOM_CLASS
+#define AtomStyle(key,Class)                \
+  n = strlen(#key) + 1;                     \
+  allstyles[nallstyles] = new char[n];      \
+  strcpy(allstyles[nallstyles],#key);       \
+  nallstyles++;
+#include "style_atom.h"
+#undef AtomStyle
+#undef ATOM_CLASS
+}
+
+/* ----------------------------------------------------------------------
+   return 1 if str matches an entry of allstyles (all known atom styles), else 0
+------------------------------------------------------------------------- */
+
+int AtomVecHybridKokkos::known_style(char *str)
+{
+  int i = 0;   // linear scan that stops at the first entry equal to str
+  while (i < nallstyles && strcmp(allstyles[i],str) != 0) i++;
+  return (i < nallstyles) ? 1 : 0;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory
+------------------------------------------------------------------------- */
+
+bigint AtomVecHybridKokkos::memory_usage()
+{
+  bigint total = 0;   // running sum of what each sub-style reports
+  for (int istyle = 0; istyle < nstyles; istyle++) total += styles[istyle]->memory_usage();
+  return total;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::sync(ExecutionSpace space, unsigned int h_mask)
+{
+  for (int istyle = 0; istyle < nstyles; istyle++) ((AtomVecKokkos*) styles[istyle])->sync(space,h_mask);   // fan out to every sub-style
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int h_mask)
+{
+  for (int istyle = 0; istyle < nstyles; istyle++) ((AtomVecKokkos*) styles[istyle])->sync_overlapping_device(space,h_mask);   // fan out to every sub-style
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecHybridKokkos::modified(ExecutionSpace space, unsigned int h_mask)
+{
+  for (int istyle = 0; istyle < nstyles; istyle++) ((AtomVecKokkos*) styles[istyle])->modified(space,h_mask);   // fan out to every sub-style
+}
diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.h b/src/KOKKOS/atom_vec_hybrid_kokkos.h
new file mode 100644
index 0000000000..fcf48f6c74
--- /dev/null
+++ b/src/KOKKOS/atom_vec_hybrid_kokkos.h
@@ -0,0 +1,161 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(hybrid/kk,AtomVecHybridKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_HYBRID_KOKKOS_H
+#define LMP_ATOM_VEC_HYBRID_KOKKOS_H
+
+#include <stdio.h>
+#include "atom_vec_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecHybridKokkos : public AtomVecKokkos {
+ public:
+  int nstyles;              // number of sub-styles held in styles[]
+  class AtomVec **styles;   // the sub-styles; per-atom work is delegated to each in turn
+  char **keywords;          // NOTE(review): presumably the sub-style names -- confirm in the .cpp
+
+  AtomVecHybridKokkos(class LAMMPS *);
+  ~AtomVecHybridKokkos();
+  void process_args(int, char **);
+  void init();
+  void grow(int);
+  void grow_reset();
+  void copy(int, int, int);
+  void clear_bonus();
+  void force_clear(int, size_t);
+  int pack_comm(int, int *, double *, int, int *);
+  int pack_comm_vel(int, int *, double *, int, int *);
+  void unpack_comm(int, int, double *);
+  void unpack_comm_vel(int, int, double *);
+  int pack_reverse(int, int, double *);
+  void unpack_reverse(int, int *, double *);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+  void create_atom(int, double *);
+  void data_atom(double *, imageint, char **);
+  int data_atom_hybrid(int, char **) {return 0;}   // inline stub: always returns 0
+  void data_vel(int, char **);
+  void pack_data(double **);
+  void write_data(FILE *, int, double **);
+  void pack_vel(double **);
+  void write_vel(FILE *, int, double **);
+  int property_atom(char *);                         // returns index*nstyles + k, or -1 if unknown
+  void pack_property_atom(int, double *, int, int);  // first argument is the value from property_atom()
+  bigint memory_usage();                             // sum over all sub-styles
+  // Kokkos-typed variants: buffers and lists are DAT dual views, some take an ExecutionSpace
+  int pack_comm_kokkos(const int &n, const DAT::tdual_int_2d &k_sendlist,
+                       const int & iswap,
+                       const DAT::tdual_xfloat_2d &buf,
+                       const int &pbc_flag, const int pbc[]);
+  void unpack_comm_kokkos(const int &n, const int &nfirst,
+                          const DAT::tdual_xfloat_2d &buf);
+  int pack_comm_self(const int &n, const DAT::tdual_int_2d &list,
+                     const int & iswap, const int nfirst,
+                     const int &pbc_flag, const int pbc[]);
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+  // each of the next three forwards the call to every sub-style, cast to AtomVecKokkos
+  void sync(ExecutionSpace space, unsigned int mask);
+  void modified(ExecutionSpace space, unsigned int mask);
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
+
+ private:
+  tagint *tag;                // NOTE(review): raw per-atom pointers; where they are assigned is not visible here
+  int *type,*mask;
+  imageint *image;
+  double **x,**v,**f;
+  double **omega,**angmom;
+
+  DAT::t_tagint_1d d_tag;     // d_* members: presumably the device-side counterparts of the h_* views
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_tagint_1d h_tag;     // h_* members: host views; pack_data()/pack_vel() read h_tag
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image; // packed image flags; pack_data() splits them with IMGMASK/IMGBITS/IMG2BITS
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+  HAT::t_x_array h_x;         // indexed as h_x(i,0..2) in pack_data()
+  HAT::t_v_array h_v;         // indexed as h_v(i,0..2) in data_vel()/pack_vel()
+  HAT::t_f_array h_f;
+
+  DAT::t_v_array d_omega, d_angmom;
+  HAT::t_v_array h_omega, h_angmom;
+
+  DAT::tdual_int_1d k_count;  // NOTE(review): use not visible in this part of the patch
+
+  int nallstyles;             // number of entries in allstyles
+  char **allstyles;           // style keys collected from style_atom.h by build_styles()
+
+  void build_styles();        // fills nallstyles/allstyles
+  int known_style(char *);    // 1 if the argument equals an entry of allstyles, else 0
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Atom style hybrid cannot have hybrid as an argument
+
+Self-explanatory.
+
+E: Atom style hybrid cannot use same atom style twice
+
+Self-explanatory.
+
+E: Cannot mix molecular and molecule template atom styles
+
+Self-explanatory.
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+*/
diff --git a/src/KOKKOS/atom_vec_molecular_kokkos.cpp b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
index 5c16ac1513..dbf6a857b2 100644
--- a/src/KOKKOS/atom_vec_molecular_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_molecular_kokkos.cpp
@@ -387,7 +387,6 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     if(pbc_flag) {
@@ -415,7 +414,6 @@ int AtomVecMolecularKokkos::pack_comm_kokkos(const int &n,
         Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 
 	return n*size_forward;
@@ -506,7 +504,6 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPHostType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
@@ -535,7 +532,6 @@ int AtomVecMolecularKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d
       Kokkos::parallel_for(n,f);
       }
     }
-    LMPDeviceType::fence();
   }
 	return n*3;
 }
@@ -573,13 +569,11 @@ void AtomVecMolecularKokkos::unpack_comm_kokkos(const int &n, const int &first,
     modified(Host,X_MASK);
     struct AtomVecMolecularKokkos_UnpackComm<LMPHostType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   } else {
     sync(Device,X_MASK);
     modified(Device,X_MASK);
     struct AtomVecMolecularKokkos_UnpackComm<LMPDeviceType> f(atomKK->k_x,buf,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -825,13 +819,11 @@ int AtomVecMolecularKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendli
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecMolecularKokkos_PackBorder<LMPDeviceType,1> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
 
   } else {
@@ -841,13 +833,11 @@ int AtomVecMolecularKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendli
         buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
         iswap,h_x,h_tag,h_type,h_mask,h_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPHostType::fence();
     } else {
       AtomVecMolecularKokkos_PackBorder<LMPDeviceType,0> f(
         buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
         iswap,d_x,d_tag,d_type,d_mask,d_molecule,dx,dy,dz);
       Kokkos::parallel_for(n,f);
-      LMPDeviceType::fence();
     }
   }
   return n*size_border;
@@ -1049,12 +1039,10 @@ void AtomVecMolecularKokkos::unpack_border_kokkos(const int &n, const int &first
     struct AtomVecMolecularKokkos_UnpackBorder<LMPHostType>
       f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPHostType::fence();
   } else {
     struct AtomVecMolecularKokkos_UnpackBorder<LMPDeviceType>
       f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_molecule,first);
     Kokkos::parallel_for(n,f);
-    LMPDeviceType::fence();
   }
 }
 
@@ -1389,13 +1377,11 @@ int AtomVecMolecularKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfl
     AtomVecMolecularKokkos_PackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPHostType::fence();
     return nsend*elements;
   } else {
     AtomVecMolecularKokkos_PackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
     Kokkos::parallel_for(nsend,f);
-    LMPDeviceType::fence();
     return nsend*elements;
   }
 }
@@ -1608,7 +1594,6 @@ int AtomVecMolecularKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,i
     AtomVecMolecularKokkos_UnpackExchangeFunctor<LMPHostType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPHostType::fence();
     return k_count.h_view(0);
   } else {
     k_count.h_view(0) = nlocal;
@@ -1617,7 +1602,6 @@ int AtomVecMolecularKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,i
     AtomVecMolecularKokkos_UnpackExchangeFunctor<LMPDeviceType>
       f(atomKK,k_buf,k_count,dim,lo,hi);
     Kokkos::parallel_for(nrecv/elements,f);
-    LMPDeviceType::fence();
     k_count.modify<LMPDeviceType>();
     k_count.sync<LMPHostType>();
 
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp
index 2b19908396..da1f4a89fe 100644
--- a/src/KOKKOS/comm_kokkos.cpp
+++ b/src/KOKKOS/comm_kokkos.cpp
@@ -499,7 +499,6 @@ void CommKokkos::exchange_device()
           f(atomKK->k_x,k_exchange_sendlist,k_count,k_sendflag,
             nlocal,dim,lo,hi);
         Kokkos::parallel_for(nlocal,f);
-        DeviceType::fence();
         k_exchange_sendlist.modify<DeviceType>();
         k_sendflag.modify<DeviceType>();
         k_count.modify<DeviceType>();
@@ -535,7 +534,6 @@ void CommKokkos::exchange_device()
                                    k_exchange_sendlist,k_exchange_copylist,
                                    ExecutionSpaceFromDevice<DeviceType>::
                                    space,dim,lo,hi);
-      DeviceType::fence();
 
     } else {
       while (i < nlocal) {
@@ -560,7 +558,6 @@ void CommKokkos::exchange_device()
         atom->nlocal=avec->
           unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi,
                                  ExecutionSpaceFromDevice<DeviceType>::space);
-        DeviceType::fence();
       }
     } else {
       MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0,
@@ -593,7 +590,6 @@ void CommKokkos::exchange_device()
         atom->nlocal = avec->
           unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi,
                                  ExecutionSpaceFromDevice<DeviceType>::space);
-        DeviceType::fence();
       }
     }
 
@@ -765,7 +761,6 @@ void CommKokkos::borders_device() {
                 total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
             Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
             Kokkos::parallel_for(config,f);
-            DeviceType::fence();
 
             total_send.template modify<DeviceType>();
             total_send.template sync<LMPHostType>();
@@ -782,7 +777,6 @@ void CommKokkos::borders_device() {
                   total_send,nfirst,nlast,dim,lo,hi,iswap,maxsendlist[iswap]);
               Kokkos::TeamPolicy<DeviceType> config((nlast-nfirst+127)/128,128);
               Kokkos::parallel_for(config,f);
-              DeviceType::fence();
               total_send.template modify<DeviceType>();
               total_send.template sync<LMPHostType>();
             }
@@ -911,7 +905,6 @@ void CommKokkos::borders_device() {
 
   if (exec_space == Host) k_sendlist.sync<LMPDeviceType>();
   atomKK->modified(exec_space,ALL_MASK);
-  DeviceType::fence();
   atomKK->sync(Host,TAG_MASK);
   if (map_style) atom->map_set();
 }
diff --git a/src/KOKKOS/domain_kokkos.cpp b/src/KOKKOS/domain_kokkos.cpp
index 4bf8dc9841..d9c1332778 100644
--- a/src/KOKKOS/domain_kokkos.cpp
+++ b/src/KOKKOS/domain_kokkos.cpp
@@ -99,7 +99,6 @@ void DomainKokkos::reset_box()
     DomainResetBoxFunctor<LMPDeviceType>
       f(atomKK->k_x);
     Kokkos::parallel_reduce(nlocal,f,result);
-    LMPDeviceType::fence();
 
     double (*extent)[2] = result.value;
     double all[3][2];
@@ -384,7 +383,6 @@ void DomainKokkos::pbc()
       Kokkos::parallel_for(nlocal,f);
     }
   }
-  LMPDeviceType::fence();
 
   atomKK->modified(Device,X_MASK|V_MASK|IMAGE_MASK);
 }
@@ -424,7 +422,6 @@ void DomainKokkos::remap_all()
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_remap_all>(0,nlocal),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,X_MASK | IMAGE_MASK);
@@ -528,7 +525,6 @@ void DomainKokkos::image_flip(int m_in, int n_in, int p_in)
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_image_flip>(0,nlocal),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,IMAGE_MASK);
@@ -561,7 +557,6 @@ void DomainKokkos::lamda2x(int n)
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_lamda2x>(0,n),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,X_MASK);
@@ -587,7 +582,6 @@ void DomainKokkos::x2lamda(int n)
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagDomain_x2lamda>(0,n),*this);
-  LMPDeviceType::fence();
   copymode = 0;
 
   atomKK->modified(Device,X_MASK);
diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.cpp b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
new file mode 100644
index 0000000000..e6878afed4
--- /dev/null
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.cpp
@@ -0,0 +1,96 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <string.h>
+#include "fix_dpd_energy_kokkos.h"
+#include "atom_masks.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "modify.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+FixDPDenergyKokkos<DeviceType>::FixDPDenergyKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixDPDenergy(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = datamask_modify = EMPTY_MASK;
+  // take_half_step() reads k_duCond/k_duMech through this pointer, so a failed cast is fatal
+  pairDPDEKK = dynamic_cast<PairDPDfdtEnergyKokkos<DeviceType>*>(pairDPDE);
+  if (pairDPDEKK == NULL)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix dpd/energy/kk");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixDPDenergyKokkos<DeviceType>::take_half_step()
+{
+  // loop bound: atom->nfirst when igroup is atom->firstgroup, otherwise atom->nlocal
+  const int natoms = (igroup == atom->firstgroup) ? atom->nfirst : atom->nlocal;
+
+  using AT = ArrayTypes<DeviceType>;
+  // per-atom energies: sync each to the execution space, then take its view
+  atomKK->sync(execution_space, UCOND_MASK);
+  typename AT::t_efloat_1d d_uCond = atomKK->k_uCond.view<DeviceType>();
+  atomKK->sync(execution_space, UMECH_MASK);
+  typename AT::t_efloat_1d d_uMech = atomKK->k_uMech.view<DeviceType>();
+
+  // matching rates held by the pair style (scaled by 0.5*dt in the kernel)
+  pairDPDEKK->k_duCond.template sync<DeviceType>();
+  typename AT::t_efloat_1d_const d_duCond = pairDPDEKK->k_duCond.template view<DeviceType>();
+  pairDPDEKK->k_duMech.template sync<DeviceType>();
+  typename AT::t_efloat_1d_const d_duMech = pairDPDEKK->k_duMech.template view<DeviceType>();
+
+  const auto dt = update->dt;
+  Kokkos::parallel_for(natoms, LAMMPS_LAMBDA(int i) {
+    d_uCond(i) += 0.5*dt*d_duCond(i);
+    d_uMech(i) += 0.5*dt*d_duMech(i);
+  });
+
+  atomKK->modified(execution_space, UCOND_MASK);
+  atomKK->modified(execution_space, UMECH_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixDPDenergyKokkos<DeviceType>::initial_integrate(int)
+{
+  this->take_half_step();   // same half update that final_integrate() applies
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixDPDenergyKokkos<DeviceType>::final_integrate()
+{
+  this->take_half_step();   // same half update that initial_integrate() applies
+}
+
+namespace LAMMPS_NS {
+template class FixDPDenergyKokkos<LMPDeviceType>;   // explicit instantiation for the device type
+#ifdef KOKKOS_HAVE_CUDA   // NOTE(review): presumably host and device types coincide without CUDA -- confirm
+template class FixDPDenergyKokkos<LMPHostType>;     // separate host instantiation, CUDA builds only
+#endif
+}
diff --git a/src/KOKKOS/fix_dpd_energy_kokkos.h b/src/KOKKOS/fix_dpd_energy_kokkos.h
new file mode 100644
index 0000000000..ebf3a796fe
--- /dev/null
+++ b/src/KOKKOS/fix_dpd_energy_kokkos.h
@@ -0,0 +1,54 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(dpd/energy/kk,FixDPDenergyKokkos<LMPDeviceType>)
+FixStyle(dpd/energy/kk/device,FixDPDenergyKokkos<LMPDeviceType>)
+FixStyle(dpd/energy/kk/host,FixDPDenergyKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_DPDE_KOKKOS_H
+#define LMP_FIX_DPDE_KOKKOS_H
+
+#include "fix_dpd_energy.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template <typename DeviceType>
+class FixDPDenergyKokkos : public FixDPDenergy {
+ public:
+  FixDPDenergyKokkos(class LAMMPS *, int, char **);   // errors out unless pairDPDE is the /kk pair style
+  virtual ~FixDPDenergyKokkos() {}
+  virtual void initial_integrate(int);   // calls take_half_step()
+  virtual void final_integrate();        // calls take_half_step()
+
+  void take_half_step();   // per atom: uCond += 0.5*dt*duCond, uMech += 0.5*dt*duMech
+ protected:
+  PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;   // pairDPDE downcast with dynamic_cast in the constructor
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Must use pair_style dpd/fdt/energy/kk with fix dpd/energy/kk
+
+Self-explanatory.
+
+*/
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.cpp b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
new file mode 100644
index 0000000000..552141ced2
--- /dev/null
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.cpp
@@ -0,0 +1,569 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include "fix_eos_table_rx_kokkos.h"
+#include "atom_kokkos.h"
+#include "error.h"
+#include "force.h"
+#include "memory.h"
+#include "comm.h"
+#include <math.h>
+#include "modify.h"
+#include "atom_masks.h"
+
+#define MAXLINE 1024
+
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixEOStableRXKokkos<DeviceType>::FixEOStableRXKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixEOStableRX(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = datamask_modify = EMPTY_MASK;
+
+  // update_table = 1 makes the first setup()/init()/... call create_kokkos_tables()
+  update_table = 1;
+  h_table = new TableHost();
+  d_table = new TableDevice();
+
+  k_error_flag = DAT::tdual_int_scalar("fix:error_flag");
+  k_warning_flag = DAT::tdual_int_scalar("fix:warning_flag");
+
+  // per-species coefficients: fill the host side, then modify/sync/view each dual view
+  k_dHf = DAT::tdual_float_1d("fix:dHf",nspecies);
+  k_energyCorr = DAT::tdual_float_1d("fix:energyCorr",nspecies);
+  k_tempCorrCoeff = DAT::tdual_float_1d("fix:tempCorrCoeff",nspecies);
+  k_moleculeCorrCoeff = DAT::tdual_float_1d("fix:moleculeCorrCoeff",nspecies);
+  for (int isp = 0; isp < nspecies; isp++) {
+    k_dHf.h_view(isp) = dHf[isp];
+    k_energyCorr.h_view(isp) = energyCorr[isp];
+    k_tempCorrCoeff.h_view(isp) = tempCorrCoeff[isp];
+    k_moleculeCorrCoeff.h_view(isp) = moleculeCorrCoeff[isp];
+  }
+
+  k_dHf.modify<LMPHostType>();
+  k_dHf.sync<DeviceType>();
+  d_dHf = k_dHf.view<DeviceType>();
+
+  k_energyCorr.modify<LMPHostType>();
+  k_energyCorr.sync<DeviceType>();
+  d_energyCorr = k_energyCorr.view<DeviceType>();
+
+  k_tempCorrCoeff.modify<LMPHostType>();
+  k_tempCorrCoeff.sync<DeviceType>();
+  d_tempCorrCoeff = k_tempCorrCoeff.view<DeviceType>();
+
+  k_moleculeCorrCoeff.modify<LMPHostType>();
+  k_moleculeCorrCoeff.sync<DeviceType>();
+  d_moleculeCorrCoeff = k_moleculeCorrCoeff.view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixEOStableRXKokkos<DeviceType>::~FixEOStableRXKokkos()
+{
+  if (!copymode) {   // copies of *this handed to parallel_for carry copymode != 0 and must not free the tables
+    delete h_table;
+    delete d_table;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::setup(int vflag)
+{
+  if (update_table)   // set to 1 in the constructor
+    create_kokkos_tables();
+
+  copymode = 1;   // *this is copied into the kernels below; the destructor skips its deletes while this is set
+
+  int nlocal = atom->nlocal;
+  boltz = force->boltz;
+  mask = atomKK->k_mask.view<DeviceType>();   // views for DeviceType, used by the operator() kernels
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+  if (!this->restart_reset) {   // Setup kernel: uChem += uCG - uCGnew, then zero uCG and uCGnew
+    atomKK->sync(execution_space,MASK_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
+    atomKK->modified(execution_space,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+  }
+
+  // Forward communication for this fix: uChem/uCG/uCGnew are synced to the host first and flagged as host-modified afterwards
+  atomKK->sync(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+  comm->forward_comm_fix(this);
+  atomKK->modified(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);   // temperature_lookup() per group atom
+  atomKK->modified(execution_space,DPDTHETA_MASK);
+
+  error_check();   // NOTE(review): presumably reports the error/warning flags -- body not visible here
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXSetup, const int &i) const {
+  if (!(mask[i] & groupbit)) return;   // atom i is outside this fix's group
+  // fold the uCG - uCGnew difference into uChem, then clear both terms
+  const double delta = uCG[i] - uCGnew[i];
+  uChem[i] += delta;
+  uCG[i] = 0.0;
+  uCGnew[i] = 0.0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLookup, const int &i) const {
+  if (!(mask[i] & groupbit)) return;   // atom i is outside this fix's group
+  temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);   // input is the sum of the three energies
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::init()
+{
+  if (update_table)   // set to 1 in the constructor
+    create_kokkos_tables();
+
+  copymode = 1;   // *this is copied into the kernels below; the destructor skips its deletes while this is set
+
+  int nlocal = atom->nlocal;
+  boltz = force->boltz;
+  mask = atomKK->k_mask.view<DeviceType>();   // views for DeviceType, used by the operator() kernels
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+  if (this->restart_reset) {   // energies are kept; only temperature_lookup() is run per group atom
+    atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup>(0,nlocal),*this);
+    atomKK->modified(execution_space,DPDTHETA_MASK);
+  } else {                     // Init kernel: uMech from energy_lookup(dpdTheta), uCond = uChem = 0
+    atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXInit>(0,nlocal),*this);
+    atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK | UCHEM_MASK);
+  }
+
+  error_check();   // NOTE(review): presumably reports k_error_flag set by the Init kernel -- body not visible here
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXInit, const int &i) const {
+  if (!(mask[i] & groupbit)) return;   // atom i is outside this fix's group
+  // a non-positive dpdTheta only raises the flag here; the lookup below still runs
+  if (dpdTheta[i] <= 0.0) k_error_flag.template view<DeviceType>()() = 1;
+
+  double energy;
+  energy_lookup(i,dpdTheta[i],energy);
+  uCond[i] = 0.0;
+  uMech[i] = energy;
+  uChem[i] = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::post_integrate()
+{
+  if (update_table)   // set to 1 in the constructor
+    create_kokkos_tables();
+
+  copymode = 1;   // *this is copied into the kernel below; the destructor skips its deletes while this is set
+
+  int nlocal = atom->nlocal;
+  boltz = force->boltz;
+  mask = atomKK->k_mask.view<DeviceType>();   // views for DeviceType, used by the operator() kernels
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);   // lookup, then flag dpdTheta <= 0
+  atomKK->modified(execution_space,DPDTHETA_MASK);
+
+  error_check();   // NOTE(review): presumably reports k_error_flag set by the kernel -- body not visible here
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::operator()(TagFixEOStableRXTemperatureLookup2, const int &i) const {
+  if (!(mask[i] & groupbit)) return;   // atom i is outside this fix's group
+
+  // same lookup as the TemperatureLookup tag, followed by a check of the result
+  temperature_lookup(i,uCond[i]+uMech[i]+uChem[i],dpdTheta[i]);
+  if (dpdTheta[i] <= 0.0) k_error_flag.template view<DeviceType>()() = 1;
+}
+
+/* ----------------------------------------------------------------------
+   end-of-step update, in this order:
+   1) reverse communication of uCG and uCGnew through the host views
+   2) TagFixEOStableRXSetup kernel over atoms 0..nlocal-1
+   3) forward communication of uChem, uCG and uCGnew through the host views
+   4) TagFixEOStableRXTemperatureLookup2 kernel, which recomputes dpdTheta
+   the Kokkos tables are rebuilt first when update_table is set
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::end_of_step()
+{
+  if (update_table)
+    create_kokkos_tables();
+
+  copymode = 1;  // NOTE(review): presumably marks *this as being copied into the kernel functor - confirm
+
+  int nlocal = atom->nlocal;
+  boltz = force->boltz;
+  mask = atomKK->k_mask.view<DeviceType>();  // refresh the member views used by the kernels
+  uCond = atomKK->k_uCond.view<DeviceType>();
+  uMech = atomKK->k_uMech.view<DeviceType>();
+  uChem = atomKK->k_uChem.view<DeviceType>();
+  dpdTheta= atomKK->k_dpdTheta.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+
+
+  // Reverse-communicate uCG and uCGnew; pack/unpack work on the host views and unpack adds the received values
+  atomKK->sync(Host,UCG_MASK | UCGNEW_MASK);
+  comm->reverse_comm_fix(this);
+  atomKK->modified(Host,UCG_MASK | UCGNEW_MASK);
+
+  atomKK->sync(execution_space,MASK_MASK | UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXSetup>(0,nlocal),*this);
+  atomKK->modified(execution_space,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  // Forward-communicate the updated uChem, uCG and uCGnew; pack/unpack work on the host views
+  atomKK->sync(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+  comm->forward_comm_fix(this);
+  atomKK->modified(Host,UCHEM_MASK | UCG_MASK | UCGNEW_MASK);
+
+  atomKK->sync(execution_space,MASK_MASK | UCOND_MASK | UMECH_MASK | UCHEM_MASK | DPDTHETA_MASK | DVECTOR_MASK);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixEOStableRXTemperatureLookup2>(0,nlocal),*this);
+  atomKK->modified(execution_space,DPDTHETA_MASK);  // dpdTheta is the only per-atom array this kernel writes
+
+  error_check();  // turns flags raised inside the kernels into errors/warnings on the host
+
+  copymode = 0;
+}
+
+/* ----------------------------------------------------------------------
+   calculate potential ui at temperature thetai
+   id     = atom index; selects the dvector column read when rx_flag is set
+   thetai = temperature (by value); clamped species by species to each
+            table's [lo,hi], the clamp carrying over to later species
+   ui     = output: sum over species of nMolecules*uTmp plus the molecule
+            correction, minus (nTotal+1.5)*boltz*thetai
+   species only contribute when tabstyle == LINEAR
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::energy_lookup(int id, double thetai, double &ui) const
+{
+  int itable, nPG;
+  double fraction, uTmp, nMolecules, nTotal, nTotalPG;
+  double tolerance = 1.0e-10;  // |moleculeCorrCoeff| above this counts the species into nPG
+
+  ui = 0.0;
+  nTotal = 0.0;
+  nTotalPG = 0.0;
+  nPG = 0;
+
+  if (rx_flag) {
+    for (int ispecies = 0; ispecies < nspecies; ispecies++ ) {
+      nTotal += dvector(ispecies,id);  // total over all species of this atom's dvector entries
+      if (fabs(d_moleculeCorrCoeff[ispecies]) > tolerance) {
+        nPG++;
+        nTotalPG += dvector(ispecies,id);  // same total, restricted to species with a molecule correction
+      }
+    }
+  } else {
+    nTotal = 1.0;
+  }
+
+  for(int ispecies=0;ispecies<nspecies;ispecies++){
+    // row ispecies of d_table_const holds this species' table data
+    // clamp thetai from below to the table's lo ...
+    thetai = MAX(thetai,d_table_const.lo(ispecies));
+    // ... and from above to the table's hi
+    thetai = MIN(thetai,d_table_const.hi(ispecies));
+
+    if (tabstyle == LINEAR) {
+      // table bin: offset of thetai from lo scaled by invdelta, truncated to int
+      itable = static_cast<int> ((thetai - d_table_const.lo(ispecies)) * d_table_const.invdelta(ispecies));
+      // offset of thetai from the bin's r entry, scaled by invdelta
+      fraction = (thetai - d_table_const.r(ispecies,itable)) * d_table_const.invdelta(ispecies);
+      // NOTE(review): de has one column fewer than e (see create_kokkos_tables); if itable can reach the last e column when thetai == hi, de(ispecies,itable) reads past the end - confirm
+      uTmp = d_table_const.e(ispecies,itable) + fraction*d_table_const.de(ispecies,itable);
+
+      uTmp += d_dHf[ispecies];
+      uTmp += d_tempCorrCoeff[ispecies]*thetai; // temperature correction
+      uTmp += d_energyCorr[ispecies]; // energy correction
+      if (nPG > 0) ui += d_moleculeCorrCoeff[ispecies]*nTotalPG/double(nPG); // molecule correction
+
+      if (rx_flag) nMolecules = dvector(ispecies,id);
+      else nMolecules = 1.0;
+      ui += nMolecules*uTmp;
+    }
+  }
+  ui = ui - double(nTotal+1.5)*boltz*thetai;  // uses thetai as clamped by the loop above
+}
+
+/* ----------------------------------------------------------------------
+   calculate temperature thetai at energy ui
+   id     = atom index, passed through to energy_lookup()
+   ui     = target energy
+   thetai = in: starting guess, out: result of the secant iteration
+   solves energy_lookup(id,t) - ui = 0 for t with the secant method
+   sets k_error_flag to 2 (NaN) or 3 (iteration cap reached) and
+   k_warning_flag to 1 when the secant denominator vanishes
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixEOStableRXKokkos<DeviceType>::temperature_lookup(int id, double ui, double &thetai) const
+{
+  // the bounds used below are those of table 0 only
+
+  int it;
+  double t1,t2,u1,u2,f1,f2;
+  double maxit = 100;  // iteration cap of the secant loop (stored as double)
+  double temp;
+  double delta = 0.001;  // relative offset used to build the second starting point t2
+  double tolerance = 1.0e-10;  // convergence threshold on |temp - t2|
+  int lo = d_table_const.lo(0);  // NOTE(review): bounds are held as int here
+  int hi = d_table_const.hi(0);
+
+  // Store the current thetai in t1
+  t1 = MAX(thetai,lo);
+  t1 = MIN(t1,hi);
+  if(t1==hi) delta = -delta;  // at the upper bound, step downwards for t2
+
+  // Compute u1 at thetai
+  energy_lookup(id,t1,u1);
+
+  // Compute f1
+  f1 = u1 - ui;
+
+  // Compute guess of t2
+  t2 = (1.0 + delta)*t1;
+
+  // Compute u2 at t2
+  energy_lookup(id,t2,u2);
+
+  // Compute f2
+  f2 = u2 - ui;
+
+  // Apply the Secant Method
+  for(it=0; it<maxit; it++){
+    if(fabs(f2-f1) < MY_EPSILON){  // secant denominator vanished: stop and return the clamped t1
+      if(isnan(f1) || isnan(f2)) k_error_flag.template view<DeviceType>()() = 2;
+      temp = t1;
+      temp = MAX(temp,lo);
+      temp = MIN(temp,hi);
+      k_warning_flag.template view<DeviceType>()() = 1;
+      break;
+    }
+    temp = t2 - f2*(t2-t1)/(f2-f1);  // secant update
+    if(fabs(temp-t2) < tolerance) break;  // converged
+    f1 = f2;
+    t1 = t2;
+    t2 = temp;
+    energy_lookup(id,t2,u2);
+    f2 = u2 - ui;
+  }
+  if(it==maxit){  // the loop ended without a break
+    if(isnan(f1) || isnan(f2) || isnan(ui) || isnan(thetai) || isnan(t1) || isnan(t2))
+      k_error_flag.template view<DeviceType>()() = 2;
+    else
+      k_error_flag.template view<DeviceType>()() = 3;
+  }
+  thetai = temp;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixEOStableRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // host-side views of the three per-atom quantities that are forwarded
+  HAT::t_efloat_1d h_uChem = atomKK->k_uChem.h_view;
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
+
+  // three consecutive buffer slots per listed atom: uChem, uCG, uCGnew
+  int nbuf = 0;
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+    buf[nbuf]   = h_uChem[iatom];
+    buf[nbuf+1] = h_uCG[iatom];
+    buf[nbuf+2] = h_uCGnew[iatom];
+    nbuf += 3;
+  }
+
+  // number of doubles written
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  // host-side views that receive the forwarded values
+  HAT::t_efloat_1d h_uChem = atomKK->k_uChem.h_view;
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
+
+  // atoms first..first+n-1 each consume three buffer slots: uChem, uCG, uCGnew
+  const double *src = buf;
+  const int iend = first + n;
+  for (int iatom = first; iatom < iend; iatom++, src += 3) {
+    h_uChem[iatom]  = src[0];
+    h_uCG[iatom]    = src[1];
+    h_uCGnew[iatom] = src[2];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixEOStableRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // host-side views of the two quantities sent back
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
+
+  // atoms first..first+n-1 each contribute two buffer slots: uCG, uCGnew
+  int nbuf = 0;
+  const int iend = first + n;
+  for (int iatom = first; iatom < iend; iatom++) {
+    buf[nbuf]   = h_uCG[iatom];
+    buf[nbuf+1] = h_uCGnew[iatom];
+    nbuf += 2;
+  }
+
+  // number of doubles written
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // host-side views that accumulate the received contributions
+  HAT::t_efloat_1d h_uCG = atomKK->k_uCG.h_view;
+  HAT::t_efloat_1d h_uCGnew = atomKK->k_uCGnew.h_view;
+
+  // two buffer slots per listed atom, added onto the existing values
+  const double *src = buf;
+  for (int k = 0; k < n; k++, src += 2) {
+    const int iatom = list[k];
+    h_uCG[iatom] += src[0];
+    h_uCGnew[iatom] += src[1];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::error_check()
+{
+  // bring the error code written by the kernels over to the host view
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+
+  // map the error code onto its message
+  switch (k_error_flag.h_view()) {
+  case 1:
+    error->one(FLERR,"Internal temperature <= zero");
+    break;
+  case 2:
+    error->one(FLERR,"NaN detected in secant solver.");
+    break;
+  case 3:
+    error->one(FLERR,"Maxit exceeded in secant solver.");
+    break;
+  default:
+    break;
+  }
+
+  // same transfer for the warning flag; nothing more to do when it is clear
+  k_warning_flag.template modify<DeviceType>();
+  k_warning_flag.template sync<LMPHostType>();
+  if (!k_warning_flag.h_view()) return;
+
+  // report once, then reset the flag on the host and push the reset back
+  error->warning(FLERR,"Secant solver did not converge because table bounds were exceeded.");
+  k_warning_flag.h_view() = 0;
+  k_warning_flag.template modify<LMPHostType>();
+  k_warning_flag.template sync<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   mirror the host Table structs in tables[] into Kokkos views:
+   create the d_table/h_table views, fill the host views from tables[i],
+   deep_copy them to the device views, point d_table_const at the device
+   views and clear update_table
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixEOStableRXKokkos<DeviceType>::create_kokkos_tables()
+{
+  const int tlm1 = tablength-1;  // number of columns of de, one fewer than r and e
+
+  memory->create_kokkos(d_table->lo,h_table->lo,ntables,"Table::lo");
+  memory->create_kokkos(d_table->hi,h_table->hi,ntables,"Table::hi");
+  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
+
+  if(tabstyle == LINEAR) {  // r, e and de are only created for LINEAR tables
+    memory->create_kokkos(d_table->r,h_table->r,ntables,tablength,"Table::r");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de");
+  }
+
+  for(int i=0; i < ntables; i++) {  // row i of every view holds table i
+    Table* tb = &tables[i];
+
+    h_table->lo[i] = tb->lo;
+    h_table->hi[i] = tb->hi;
+    h_table->invdelta[i] = tb->invdelta;
+
+    for(int j = 0; j<h_table->r.dimension_1(); j++)  // loop bounds come from the view extents
+      h_table->r(i,j) = tb->r[j];
+    for(int j = 0; j<h_table->e.dimension_1(); j++)
+      h_table->e(i,j) = tb->e[j];
+    for(int j = 0; j<h_table->de.dimension_1(); j++)
+      h_table->de(i,j) = tb->de[j];
+  }
+
+  Kokkos::deep_copy(d_table->lo,h_table->lo);  // host -> device
+  Kokkos::deep_copy(d_table->hi,h_table->hi);
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
+  Kokkos::deep_copy(d_table->r,h_table->r);
+  Kokkos::deep_copy(d_table->e,h_table->e);
+  Kokkos::deep_copy(d_table->de,h_table->de);
+
+  d_table_const.lo = d_table->lo;  // d_table_const is what energy_lookup()/temperature_lookup() read
+  d_table_const.hi = d_table->hi;
+  d_table_const.invdelta = d_table->invdelta;
+  d_table_const.r = d_table->r;
+  d_table_const.e = d_table->e;
+  d_table_const.de = d_table->de;
+
+  update_table = 0;  // tables are current until something sets the flag again
+}
+
+/* ----------------------------------------------------------------------
+   explicit instantiations of the fix: always for LMPDeviceType, and
+   additionally for LMPHostType when KOKKOS_HAVE_CUDA is defined
+------------------------------------------------------------------------- */
+
+namespace LAMMPS_NS {
+template class FixEOStableRXKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixEOStableRXKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_eos_table_rx_kokkos.h b/src/KOKKOS/fix_eos_table_rx_kokkos.h
new file mode 100644
index 0000000000..91d73f1036
--- /dev/null
+++ b/src/KOKKOS/fix_eos_table_rx_kokkos.h
@@ -0,0 +1,212 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(eos/table/rx/kk,FixEOStableRXKokkos<LMPDeviceType>)  // plain /kk suffix maps to the device instantiation
+FixStyle(eos/table/rx/kk/device,FixEOStableRXKokkos<LMPDeviceType>)
+FixStyle(eos/table/rx/kk/host,FixEOStableRXKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_EOS_TABLE_RX_KOKKOS_H
+#define LMP_FIX_EOS_TABLE_RX_KOKKOS_H
+
+#include "fix_eos_table_rx.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+struct TagFixEOStableRXInit{};  // kernel tag: set uCond/uMech/uChem from the tabulated energy at dpdTheta
+struct TagFixEOStableRXSetup{};  // kernel tag: launched from end_of_step() between the two communications
+struct TagFixEOStableRXTemperatureLookup{};  // kernel tag: launched instead of the Init kernel when restart_reset is set
+struct TagFixEOStableRXTemperatureLookup2{};  // kernel tag: recompute dpdTheta and flag values <= 0
+
+template<class DeviceType>
+class FixEOStableRXKokkos : public FixEOStableRX {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+
+  FixEOStableRXKokkos(class LAMMPS *, int, char **);
+  virtual ~FixEOStableRXKokkos();
+  void setup(int);
+  void init();
+  void post_integrate();  // recomputes dpdTheta from the current energies
+  void end_of_step();  // communicates uCG/uCGnew/uChem, then recomputes dpdTheta
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXInit, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXSetup, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXTemperatureLookup, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixEOStableRXTemperatureLookup2, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void energy_lookup(int, double, double &) const;  // (atom index, temperature, out: energy)
+
+  KOKKOS_INLINE_FUNCTION
+  void temperature_lookup(int, double, double &) const;  // (atom index, energy, in/out: temperature)
+
+ protected:
+  //struct Table {
+  //  int ninput;
+  //  double lo,hi;
+  //  double *rfile,*efile;
+  //  double *e2file;
+  //  double delta,invdelta,deltasq6;
+  //  double *r,*e,*de,*e2;
+  //};
+  //Table *tables, *tables2;
+
+  /*struct TableDeviceConst {
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread lo,hi;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d_randomread invdelta;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread r,e,de;
+  };*/
+ // It's faster not to use texture fetch if the number of tables is less than 32!
+  struct TableDeviceConst {  // the table views read inside the kernels
+    typename ArrayTypes<DeviceType>::t_int_1d lo,hi;  // NOTE(review): integer views, filled from the double Table::lo/hi - confirm intended
+    typename ArrayTypes<DeviceType>::t_ffloat_1d invdelta;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread r,e,de;
+  };
+
+  struct TableDevice {  // device views, target of the deep_copy in create_kokkos_tables()
+    typename ArrayTypes<DeviceType>::t_int_1d lo,hi;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d invdelta;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d r,e,de;
+  };
+
+  struct TableHost {  // host views, filled from tables[] in create_kokkos_tables()
+    typename ArrayTypes<LMPHostType>::t_int_1d lo,hi;
+    typename ArrayTypes<LMPHostType>::t_ffloat_1d invdelta;
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d r,e,de;
+  };
+
+  TableDeviceConst d_table_const;  // set by create_kokkos_tables() to alias the d_table views
+  TableDevice* d_table;
+  TableHost* h_table;
+
+  int **tabindex;
+
+  double boltz;  // copy of force->boltz, refreshed before the kernels are launched
+
+  void allocate();
+  void error_check();  // reports k_error_flag / k_warning_flag on the host
+  int update_table;  // nonzero: create_kokkos_tables() is called before the next kernel launch
+  void create_kokkos_tables();
+
+  DAT::tdual_float_1d k_dHf,k_energyCorr,k_tempCorrCoeff,k_moleculeCorrCoeff;
+  typename AT::t_float_1d d_dHf,d_energyCorr,d_tempCorrCoeff,d_moleculeCorrCoeff;  // per-species terms read by energy_lookup()
+
+  typename AT::t_int_1d mask;  // per-atom views, re-fetched from atomKK before each launch
+  typename AT::t_efloat_1d uCond,uMech,uChem,uCG,uCGnew,rho,dpdTheta,duChem;
+  typename AT::t_float_2d dvector;  // indexed (species, atom) in energy_lookup()
+
+  DAT::tdual_int_scalar k_error_flag;  // 1: temperature <= 0, 2: NaN in secant solver, 3: maxit exceeded
+  DAT::tdual_int_scalar k_warning_flag;  // set when the secant solver gives up; cleared by error_check()
+
+  int pack_reverse_comm(int, int, double *);  // packs uCG, uCGnew
+  void unpack_reverse_comm(int, int *, double *);  // adds received uCG, uCGnew
+  int pack_forward_comm(int , int *, double *, int, int *);  // packs uChem, uCG, uCGnew
+  void unpack_forward_comm(int , int , double *);  // overwrites uChem, uCG, uCGnew
+
+  };
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: FixEOStableRXKokkos requires a fix rx command.
+
+The fix rx command must come before the pair style command in the input file
+
+E:  There are no rx species specified
+
+There must be at least one species specified through the fix rx command
+
+E:  Invalid eos/table/rx length
+
+The eos/table/rx table must have more than one entry.
+
+E:  eos/table/rx values are not increasing
+
+The equation-of-state must be an increasing function.
+
+E:  FixEOStableRX requires atom_style with internal temperature and energies (e.g. dpd)
+
+Self-explanatory.
+
+E:  Internal temperature <= zero.
+
+Self-explanatory.
+
+E:  Cannot open eos table/rx potential file %s
+
+Self-explanatory.
+
+E:  Incorrect format in eos table/rx file
+
+Self-explanatory.
+
+E:  Cannot open file %s
+
+Self-explanatory.
+
+E:  Did not find keyword in table file
+
+Self-explanatory.
+
+E:  Illegal fix eos/table/rx command
+
+Incorrect number of arguments specified for the fix eos/table/rx command.
+
+E:  Invalid keyword in fix eos/table/rx parameters
+
+Self-explanatory.
+
+E:  The number of columns in fix eos/table/rx does not match the number of species.
+
+Self-explanatory.  Check format for fix eos/table/rx file.
+
+E:  fix eos/table/rx parameters did not set N
+
+The number of table entries was not set in the eos/table/rx file
+
+W:  Secant solver did not converge because table bounds were exceeded
+
+The secant solver failed to converge, so the lower or upper table bound temperature is returned.
+
+E: NaN detected in secant solver.
+
+Self-explanatory.
+
+E: Maxit exceeded in secant solver
+
+The maximum number of iterations was exceeded in the secant solver.
+
+*/
diff --git a/src/KOKKOS/fix_property_atom_kokkos.cpp b/src/KOKKOS/fix_property_atom_kokkos.cpp
new file mode 100644
index 0000000000..cb52988c31
--- /dev/null
+++ b/src/KOKKOS/fix_property_atom_kokkos.cpp
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include "fix_property_atom_kokkos.h"
+#include "atom_kokkos.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+enum{MOLECULE,CHARGE,RMASS,INTEGER,DOUBLE};  // values compared against style[m] in grow_arrays()
+
+/* ----------------------------------------------------------------------
+   forward all arguments to FixPropertyAtom and keep a Kokkos-typed
+   pointer to the atom object
+------------------------------------------------------------------------- */
+
+FixPropertyAtomKokkos::FixPropertyAtomKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixPropertyAtom(lmp, narg, arg)
+{
+  atomKK = (AtomKokkos *) atom;  // grow_arrays() uses it to reach k_dvector
+}
+
+/* ----------------------------------------------------------------------
+   grow the per-atom arrays managed by this fix to nmax entries
+   molecule, q, rmass and ivector entries in [nmax_old,nmax) are set to 0,
+   since AtomVec class won't do it as atoms are added,
+   e.g. in create_atom() or data_atom()
+   dvector is grown through its Kokkos dual view and is not zeroed here
+------------------------------------------------------------------------- */
+
+void FixPropertyAtomKokkos::grow_arrays(int nmax)
+{
+  // number of slots appended behind the previous capacity
+  const size_t nadded = nmax-nmax_old;
+
+  for (int m = 0; m < nvalue; m++) {
+    switch (style[m]) {
+    case MOLECULE:
+      memory->grow(atom->molecule,nmax,"atom:molecule");
+      memset(&atom->molecule[nmax_old],0,nadded*sizeof(tagint));
+      break;
+    case CHARGE:
+      memory->grow(atom->q,nmax,"atom:q");
+      memset(&atom->q[nmax_old],0,nadded*sizeof(double));
+      break;
+    case RMASS:
+      memory->grow(atom->rmass,nmax,"atom:rmass");
+      memset(&atom->rmass[nmax_old],0,nadded*sizeof(double));
+      break;
+    case INTEGER:
+      memory->grow(atom->ivector[index[m]],nmax,"atom:ivector");
+      memset(&atom->ivector[index[m]][nmax_old],0,nadded*sizeof(int));
+      break;
+    case DOUBLE:
+      // all dvector rows live in one dual view; keep its first extent, grow the second
+      memory->grow_kokkos(atomKK->k_dvector,atomKK->dvector,atomKK->k_dvector.dimension_0(),nmax,
+                          "atom:dvector");
+      break;
+    default:
+      break;
+    }
+  }
+
+  nmax_old = nmax;
+}
diff --git a/src/KOKKOS/fix_property_atom_kokkos.h b/src/KOKKOS/fix_property_atom_kokkos.h
new file mode 100644
index 0000000000..ed1e4d7cfb
--- /dev/null
+++ b/src/KOKKOS/fix_property_atom_kokkos.h
@@ -0,0 +1,90 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(property/atom/kk,FixPropertyAtomKokkos)  // registers the class under the style name property/atom/kk
+
+#else
+
+#ifndef LMP_FIX_PROPERTY_ATOM_KOKKOS_H
+#define LMP_FIX_PROPERTY_ATOM_KOKKOS_H
+
+#include "fix_property_atom.h"
+
+namespace LAMMPS_NS {
+
+class FixPropertyAtomKokkos : public FixPropertyAtom {  // only grow_arrays() differs from the base class
+ public:
+  FixPropertyAtomKokkos(class LAMMPS *, int, char **);
+  virtual ~FixPropertyAtomKokkos() {}
+
+  void grow_arrays(int);  // grows dvector through the Kokkos dual view atomKK->k_dvector
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Fix property/atom mol when atom_style already has molecule attribute
+
+Self-explanatory.
+
+E: Fix property/atom cannot specify mol twice
+
+Self-explanatory.
+
+E: Fix property/atom q when atom_style already has charge attribute
+
+Self-explanatory.
+
+E: Fix property/atom cannot specify q twice
+
+Self-explanatory.
+
+E: Fix property/atom vector name already exists
+
+The name for an integer or floating-point vector must be unique.
+
+W: Fix property/atom mol or charge w/out ghost communication
+
+A model typically needs these properties defined for ghost atoms.
+
+E: Atom style was redefined after using fix property/atom
+
+This is not allowed.
+
+E: Incorrect %s format in data file
+
+A section of the data file being read by fix property/atom does
+not have the correct number of values per line.
+
+E: Too few lines in %s section of data file
+
+Self-explanatory.
+
+E: Invalid atom ID in %s section of data file
+
+An atom in a section of the data file being read by fix property/atom
+has an invalid atom ID that is <= 0 or > the maximum existing atom ID.
+
+*/
diff --git a/src/KOKKOS/fix_rx_kokkos.cpp b/src/KOKKOS/fix_rx_kokkos.cpp
new file mode 100644
index 0000000000..b1cfd20be2
--- /dev/null
+++ b/src/KOKKOS/fix_rx_kokkos.cpp
@@ -0,0 +1,2279 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <string.h>
+#include "fix_rx_kokkos.h"
+#include "atom_masks.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "memory.h"
+#include "update.h"
+#include "respa.h"
+#include "modify.h"
+#include "neighbor.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
+#include "error.h"
+#include "math_special_kokkos.h"
+
+#include <float.h> // DBL_EPSILON
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+using namespace MathSpecialKokkos;
+
+#ifdef DBL_EPSILON  // normally provided by <float.h>, included above
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)  // fallback with a hard-coded epsilon value
+#endif
+
+#define SparseKinetics_enableIntegralReactions (true)
+#define SparseKinetics_invalidIndex (-1)
+
+// From fix_rx.cpp ... this should be lifted into fix_rx.h or fix_rx_kokkos.h?
+enum{NONE,HARMONIC};
+enum{LUCY};
+
+namespace /* anonymous */
+{
+
+// Wall-clock timing helpers built on MPI_Wtime().
+typedef double TimerType;
+
+// Current time stamp.
+TimerType getTimeStamp(void)
+{
+  return MPI_Wtime();
+}
+
+// Difference between two time stamps, t1 minus t0.
+double getElapsedTime( const TimerType &t0, const TimerType &t1)
+{
+  const double elapsed = t1-t0;
+  return elapsed;
+}
+
+} // end namespace
+
+/* ----------------------------------------------------------------------
+   forward the arguments to FixRX, then set up the Kokkos bookkeeping:
+   execution space derived from DeviceType, empty read/modify data masks
+   and the dual scalar used as error flag
+------------------------------------------------------------------------- */
+
+template <typename DeviceType>
+FixRxKokkos<DeviceType>::FixRxKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixRX(lmp, narg, arg),
+  pairDPDEKK(NULL),  // resolved in init()
+  update_kinetics_data(true)  // makes init() call create_kinetics_data()
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  k_error_flag = DAT::tdual_int_scalar("FixRxKokkos::k_error_flag");
+
+  //printf("Inside FixRxKokkos::FixRxKokkos\n");
+}
+
+template <typename DeviceType>
+FixRxKokkos<DeviceType>::~FixRxKokkos()
+{
+  // Release the Kokkos allocations of this fix; nothing is released while copymode is set.
+  if (copymode) return;  // NOTE(review): presumably this object is then a functor copy sharing the views - confirm
+
+  if (localTempFlag)  // dpdThetaLocal is only destroyed when localTempFlag is set
+    memory->destroy_kokkos(k_dpdThetaLocal, dpdThetaLocal);
+
+  memory->destroy_kokkos(k_sumWeights, sumWeights);
+  //memory->destroy_kokkos(k_sumWeights);
+
+  //delete [] scratchSpace;
+  memory->destroy_kokkos(d_scratchSpace);
+
+  memory->destroy_kokkos(k_cutsq);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::post_constructor()
+{
+  // Let the base class do its post-construction work first.
+  FixRX::post_constructor();
+
+  // Keep a copy of restart_reset from the fix that is last in modify's list at this point.
+  const int ilast = modify->nfix-1;
+  this->my_restartFlag = modify->fix[ilast]->restart_reset;
+}
+
+/* ----------------------------------------------------------------------
+   check prerequisites and request the neighbor list
+   - a dpd/fdt/energy pair style must be active, and it must be the
+     Kokkos variant (the dynamic_cast to pairDPDEKK's type must succeed)
+   - a fix whose style starts with "eos/table/rx" must be defined
+   - the kinetics data are built when update_kinetics_data is set
+   - the neighbor request is flagged for host or device according to
+     DeviceType, and full or half according to lmp->kokkos->neighflag
+   errors out through error->all() when a prerequisite is missing
+------------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::init()
+{
+  // FixRX::init() is deliberately not called; the parts that are needed
+  // are reproduced below together with the Kokkos-specific settings.
+
+  pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
+
+  if (pairDPDE == NULL)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix rx");
+
+  // the pair style found above must also be of the Kokkos type
+  pairDPDEKK = dynamic_cast<decltype(pairDPDEKK)>(pairDPDE);
+  if (pairDPDEKK == NULL)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix rx/kk");
+
+  // A fix eos/table/rx (with or without a suffix such as /kk) must exist.
+  // Compare the complete "eos/table/rx" prefix: matching only the first
+  // three characters would also accept any other style starting with "eos".
+  static const char eos_style[] = "eos/table/rx";
+  bool eos_flag = false;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strncmp(modify->fix[i]->style,eos_style,sizeof(eos_style)-1) == 0) eos_flag = true;
+  if(!eos_flag) error->all(FLERR,"fix rx requires fix eos/table/rx to be specified");
+
+  if (update_kinetics_data)
+    create_kinetics_data();
+
+  // From FixRX::init()
+  // need a half neighbor list
+  // built whenever re-neighboring occurs
+
+  int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->pair = 0;
+  neighbor->requests[irequest]->fix = 1;
+
+  // Update the neighbor data for Kokkos.
+  int neighflag = lmp->kokkos->neighflag;
+
+  // kokkos_host is only set when DeviceType is the host type and the host
+  // type differs from the device type; otherwise kokkos_device is used
+  neighbor->requests[irequest]->
+    kokkos_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value &&
+    !Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  neighbor->requests[irequest]->
+    kokkos_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+  } else { //if (neighflag == HALF || neighflag == HALFTHREAD)
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   store the neighbor list pointer handed in; the first argument is unused
+------------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::init_list(int, class NeighList* ptr)
+{
+  //printf("Inside FixRxKokkos::init_list\n");
+  this->list = ptr;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::rk4(const double t_stop, double *y, double *rwork, void* v_params) const
+{
+  // Fixed-step classical Runge-Kutta: advance y in place by minSteps steps
+  // of size t_stop/minSteps.  rwork supplies five consecutive work vectors
+  // of nspecies entries each (four slopes and one stage state).
+  double *const slope1 = rwork;
+  double *const slope2 = slope1 + nspecies;
+  double *const slope3 = slope2 + nspecies;
+  double *const slope4 = slope3 + nspecies;
+  double *const ystage = slope4 + nspecies;
+
+  const int nsub = minSteps;
+  const double h = t_stop / double(nsub);
+
+  for (int istep = 0; istep < nsub; istep++) {
+
+    // stage 1: slope at the current state
+    rhs(0.0,y,slope1,v_params);
+
+    // stage 2: slope at the half step reached with slope1
+    for (int s = 0; s < nspecies; s++)
+      ystage[s] = y[s] + 0.5*h*slope1[s];
+    rhs(0.0,ystage,slope2,v_params);
+
+    // stage 3: slope at the half step reached with slope2
+    for (int s = 0; s < nspecies; s++)
+      ystage[s] = y[s] + 0.5*h*slope2[s];
+    rhs(0.0,ystage,slope3,v_params);
+
+    // stage 4: slope at the full step reached with slope3
+    for (int s = 0; s < nspecies; s++)
+      ystage[s] = y[s] + h*slope3[s];
+    rhs(0.0,ystage,slope4,v_params);
+
+    // weighted combination of the four slopes
+    for (int s = 0; s < nspecies; s++)
+      y[s] += h*(slope1[s]/6.0 + slope2[s]/3.0 + slope3[s]/3.0 + slope4[s]/6.0);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   same fixed-step scheme as rk4(), written for a generic VectorType:
+   advances y in place by minSteps steps of size t_stop/minSteps and
+   calls k_rhs() four times per step
+   rwork supplies k1..k4 and yp, five consecutive pieces of nspecies
+   entries each; userData is passed through to k_rhs()
+------------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rk4(const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData) const
+{
+  VectorType k1( rwork );
+  VectorType k2( &k1[nspecies] );  // each work vector starts nspecies entries after the previous one
+  VectorType k3( &k2[nspecies] );
+  VectorType k4( &k3[nspecies] );
+  VectorType yp( &k4[nspecies] );
+
+  const int numSteps = minSteps;
+
+  const double h = t_stop / double(numSteps);
+
+  // Run the requested steps with h.
+  for (int step = 0; step < numSteps; step++)
+  {
+    // k1
+    k_rhs(0.0,y,k1, userData);
+
+    // k2
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k1[ispecies];
+
+    k_rhs(0.0,yp,k2, userData);
+
+    // k3
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + 0.5*h*k2[ispecies];
+
+    k_rhs(0.0,yp,k3, userData);
+
+    // k4
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      yp[ispecies] = y[ispecies] + h*k3[ispecies];
+
+    k_rhs(0.0,yp,k4, userData);
+
+    for (int ispecies = 0; ispecies < nspecies; ispecies++)
+      y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);
+
+  } // end for (int step...
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+// One Runge-Kutta-Fehlberg 4(5) trial step of size h (k_ / view variant).
+//     f1 = h*f(x)
+//     f2 = h*f(x + c21*f1)
+//     f3 = h*f(x + c31*f1 + c32*f2)
+//     f4 = h*f(x + c41*f1 + c42*f2 + c43*f3)
+//     f5 = h*f(x + c51*f1 + c52*f2 + c53*f3 + c54*f4)
+//     f6 = h*f(x + c61*f1 + c62*f2 + c63*f3 + c64*f4 + c65*f5)
+//     5th order: x5 = x + b1*f1 + b3*f3 + b4*f4 + b5*f5 + b6*f6 (error estimate only)
+//     4th order: x  = x + a1*f1 + a3*f3 + a4*f4 + a5*f5         (written to y_out)
+// rwk supplies the six neq-length stage vectors; on return rwk[0..neq) holds
+// |x5 - x| per component. Every k_rhs call passes t = 0.0; y is only read.
+// NOTE(review): assumes VectorType(ptr) makes a view starting at ptr - confirm.
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45_step (const int neq, const double h, VectorType& y, VectorType& y_out, VectorType& rwk, UserDataType& userData) const
+{
+   const double c21=0.25;
+   const double c31=0.09375;
+   const double c32=0.28125;
+   const double c41=0.87938097405553;
+   const double c42=-3.2771961766045;
+   const double c43=3.3208921256258;
+   const double c51=2.0324074074074;
+   const double c52=-8.0;
+   const double c53=7.1734892787524;
+   const double c54=-0.20589668615984;
+   const double c61=-0.2962962962963;
+   const double c62=2.0;
+   const double c63=-1.3816764132554;
+   const double c64=0.45297270955166;
+   const double c65=-0.275;
+   const double a1=0.11574074074074;
+   const double a3=0.54892787524366;
+   const double a4=0.5353313840156;
+   const double a5=-0.2;
+   const double b1=0.11851851851852;
+   const double b3=0.51898635477583;
+   const double b4=0.50613149034201;
+   const double b5=-0.18;
+   const double b6=0.036363636363636;
+
+   // stage vectors f1..f6 (6 total): consecutive neq-length slices of rwk
+   VectorType& f1 = rwk;
+   VectorType  f2( &rwk[  neq] );
+   VectorType  f3( &rwk[2*neq] );
+   VectorType  f4( &rwk[3*neq] );
+   VectorType  f5( &rwk[4*neq] );
+   VectorType  f6( &rwk[5*neq] );
+
+   // scratch for the intermediate stage arguments; it aliases y_out.
+   VectorType& ytmp = y_out;
+
+   // 1) first stage, evaluated at y itself
+   k_rhs (0.0, y, f1, userData);
+
+   for (int k = 0; k < neq; k++){
+      f1[k] *= h;
+      ytmp[k] = y[k] + c21 * f1[k];
+   }
+
+   // 2) second stage, evaluated at y + c21*f1
+   k_rhs(0.0, ytmp, f2, userData);
+
+   for (int k = 0; k < neq; k++){
+      f2[k] *= h;
+      ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
+   }
+
+   // 3) third stage
+   k_rhs(0.0, ytmp, f3, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f3[k] *= h;
+      ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
+   }
+
+   // 4) fourth stage
+   k_rhs(0.0, ytmp, f4, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f4[k] *= h;
+      ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
+   }
+
+   // 5) fifth stage
+   k_rhs(0.0, ytmp, f5, userData);
+
+   for (int k = 0; k < neq; k++) {
+      f5[k] *= h;
+      ytmp[k] = y[k] + c61*f1[k] + c62*f2[k] + c63*f3[k] + c64*f4[k] + c65*f5[k];
+   }
+
+   // 6) sixth stage; it only enters the 5th-order combination below
+   k_rhs(0.0, ytmp, f6, userData);
+
+   for (int k = 0; k < neq; k++)
+   {
+      // scale the last stage by h, like f1..f5 above
+      f6[k] *= h;
+
+      // 5th-order solution.
+      const double r5 = b1*f1[k] + b3*f3[k] + b4*f4[k] + b5*f5[k] + b6*f6[k];
+
+      // 4th-order solution.
+      const double r4 = a1*f1[k] + a3*f3[k] + a4*f4[k] + a5*f5[k];
+
+      // Truncation error: difference between 4th and 5th-order solutions.
+      rwk[k] = fabs(r5 - r4); // rwk is f1, so f1[k] is overwritten after its last use
+
+      // Update solution with the 4th-order increment; the 5th-order value
+      // (local extrapolation) is not used for y_out.
+      y_out[k] = y[k] + r4;
+   }
+
+   return;
+}
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rkf45_h0
+                    (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const
+{
+   // Estimate an initial step size h0 for the RKF45 integrator (view variant).
+   // Starting from the geometric mean of hmin and hmax, iterate on a
+   // finite-difference estimate of y'' weighted with relTol and absTol.
+   // Returns iter + 1; t_stop is not referenced in this routine.
+   double hg = sqrt(hmin*hmax);
+
+   // rwk is used as three consecutive neq-length vectors: ydot, y1, ydot1.
+   // h0 receives half of the accepted step, clamped to [hmin, hmax].
+   // NOTE(review): hmax < hmin is not special-cased here; with the clamps
+   // at the end of this routine that case yields h0 = hmax.
+   // NOTE(review): assumes VectorType(ptr) is a view starting at ptr - confirm.
+
+   // Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1
+
+   VectorType& ydot  = rwk;
+   VectorType  y1    ( &ydot[  neq] );
+   VectorType  ydot1 ( &ydot[2*neq] );
+
+   const int max_iters = 10;
+   bool hnew_is_ok = false;
+   double hnew = hg;
+   int iter = 0;
+
+   // compute ydot at t=t0
+   k_rhs (t, y, ydot, userData);
+
+   while(1)
+   {
+      // Estimate y'' with finite-difference ...
+
+      for (int k = 0; k < neq; k++)
+         y1[k] = y[k] + hg * ydot[k];
+
+      // compute y' at t1
+      k_rhs (t + hg, y1, ydot1, userData);
+
+      // Compute WRMS norm of y''
+      double yddnrm = 0.0;
+      for (int k = 0; k < neq; k++){
+         double ydd = (ydot1[k] - ydot[k]) / hg;
+         double wterr = ydd / (relTol * fabs( y[k] ) + absTol);
+         yddnrm += wterr * wterr;
+      }
+
+      yddnrm = sqrt( yddnrm / double(neq) );
+
+      // The acceptance test below runs after the RHS evaluation above, so the
+      // loop evaluates k_rhs once more for the hg that is finally accepted.
+
+      // should we accept this?
+      if (hnew_is_ok || iter == max_iters){
+         hnew = hg;
+         // Reaching max_iters is accepted silently in this variant
+         // (no diagnostic is printed).
+         break;
+      }
+
+      // Get the new value of h ...
+      hnew = (yddnrm*hmax*hmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);
+
+      // test the stopping conditions.
+      double hrat = hnew / hg;
+
+      // Accept this value ... the bias factor should bring it within range.
+      if ( (hrat > 0.5) && (hrat < 2.0) )
+         hnew_is_ok = true;
+
+      // If y'' is still bad after a few iterations, just accept h and give up.
+      if ( (iter > 1) && hrat > 2.0 ) {
+         hnew = hg;
+         hnew_is_ok = true;
+      }
+
+      // carry the candidate step into the next pass of the loop
+
+      hg = hnew;
+      iter ++;
+   }
+
+   // bound and bias estimate
+   h0 = hnew * 0.5;
+   h0 = fmax(h0, hmin);
+   h0 = fmin(h0, hmax);
+   // h0 is the only output besides the return value; y is only read here.
+
+   return (iter + 1);
+}
+
+// Adaptive RKF45 integration of y from t = 0 to t_stop (k_ / view variant).
+// neq: number of equations; y: state, overwritten on each accepted step.
+// rwork: scratch; rwork[0..neq) receives the trial solution, and the stage
+//        and error storage handed to k_rkf45_step starts at rwork[neq].
+// counter: nSteps, nIters and nFuncs are accumulated; nFails is incremented
+//          when more than maxIters iterations are needed (y then keeps the
+//          last accepted state and t_stop has not been reached).
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+void FixRxKokkos<DeviceType>::k_rkf45(const int neq, const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData, CounterType& counter) const
+{
+  // Rounding coefficient.
+  const double uround = DBL_EPSILON;
+
+  // Adaption limit (shrink or grow)
+  const double adaption_limit = 4.0;
+
+  // Safety factor on the adaption. very specific but not necessary .. 0.9 is common.
+  const double hsafe = 0.840896415;
+
+  // Time rounding factor.
+  const double tround = t_stop * uround;
+
+  // Counters for diagnostics.
+  int nst = 0; // # of steps (accepted)
+  int nit = 0; // # of iterations total
+  int nfe = 0; // # of RHS evaluations
+
+  // Min/Max step-size limits.
+  const double h_min = 100.0 * tround;
+  const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;
+
+  // Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
+  double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;
+
+  double t = 0.0;
+
+  if (h < h_min){ // no usable initial step: let k_rkf45_h0 estimate one
+    nfe = k_rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, userData);
+  }
+
+  // Integrate until we reach the end time.
+  while (fabs(t - t_stop) > tround)
+  {
+    VectorType& yout = rwork;
+    VectorType  eout ( &yout[neq] );
+
+    // Take a trial step.
+    k_rkf45_step (neq, h, y, yout, eout, userData);
+
+    // Estimate the solution error ...
+    // ... weighted RMS norm over the neq components that are summed.
+    double err2 = 0.0;
+    for (int k = 0; k < neq; k++){
+      const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
+      err2 += wterr * wterr;
+    }
+
+    double err = fmax( uround, sqrt( err2 / double(neq) ));
+
+    // Accept the solution?
+    if (err <= 1.0 || h <= h_min){
+      t += h;
+      nst++;
+
+      for (int k = 0; k < neq; k++)
+        y[k] = yout[k];
+    }
+
+    // Adjust h for the next step.
+    double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );
+
+    // Limit the adaption.
+    hfac = fmax( hfac, 1.0 / adaption_limit );
+    hfac = fmin( hfac,       adaption_limit );
+
+    // Apply the adaption factor...
+    h *= hfac;
+
+    // Limit h.
+    h = fmin( h, h_max );
+    h = fmax( h, h_min );
+
+    // Stretch h if we're within 5% ... and we didn't just fail.
+    if (err <= 1.0 && (t + 1.05*h) > t_stop)
+      h = t_stop - t;
+
+    // And don't overshoot the end.
+    if (t + h > t_stop)
+      h = t_stop - t;
+
+    nit++;
+    nfe += 6;
+
+    if (maxIters && nit > maxIters){
+      // NOTE: only nFails records this; an error should be set so that the solution is not used!
+      counter.nFails ++;
+      break;
+    }
+
+  } // end while
+
+  counter.nSteps += nst;
+  counter.nIters += nit;
+  counter.nFuncs += nfe;
+}
+/* ---------------------------------------------------------------------- */
+
+// One Runge-Kutta-Fehlberg 4(5) trial step of size h (raw-pointer variant).
+//     f1 = h*f(x)
+//     f2 = h*f(x + c21*f1)
+//     f3 = h*f(x + c31*f1 + c32*f2)
+//     f4 = h*f(x + c41*f1 + c42*f2 + c43*f3)
+//     f5 = h*f(x + c51*f1 + c52*f2 + c53*f3 + c54*f4)
+//     f6 = h*f(x + c61*f1 + c62*f2 + c63*f3 + c64*f4 + c65*f5)
+//     5th order: x5 = x + b1*f1 + b3*f3 + b4*f4 + b5*f5 + b6*f6 (error estimate only)
+//     4th order: x  = x + a1*f1 + a3*f3 + a4*f4 + a5*f5         (written to y_out)
+// rwk must hold 6*neq doubles (stages f1..f6); on return rwk[0..neq) holds
+// |x5 - x| per component. Every rhs call passes t = 0.0; y is only read.
+// v_param is forwarded unchanged to rhs.
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::rkf45_step (const int neq, const double h, double y[], double y_out[], double rwk[], void* v_param) const
+{
+   const double c21=0.25;
+   const double c31=0.09375;
+   const double c32=0.28125;
+   const double c41=0.87938097405553;
+   const double c42=-3.2771961766045;
+   const double c43=3.3208921256258;
+   const double c51=2.0324074074074;
+   const double c52=-8.0;
+   const double c53=7.1734892787524;
+   const double c54=-0.20589668615984;
+   const double c61=-0.2962962962963;
+   const double c62=2.0;
+   const double c63=-1.3816764132554;
+   const double c64=0.45297270955166;
+   const double c65=-0.275;
+   const double a1=0.11574074074074;
+   const double a3=0.54892787524366;
+   const double a4=0.5353313840156;
+   const double a5=-0.2;
+   const double b1=0.11851851851852;
+   const double b3=0.51898635477583;
+   const double b4=0.50613149034201;
+   const double b5=-0.18;
+   const double b6=0.036363636363636;
+
+   // stage vectors f1..f6 (6 total): consecutive neq-length slices of rwk
+   double* f1 = &rwk[    0];
+   double* f2 = &rwk[  neq];
+   double* f3 = &rwk[2*neq];
+   double* f4 = &rwk[3*neq];
+   double* f5 = &rwk[4*neq];
+   double* f6 = &rwk[5*neq];
+
+   // scratch for the intermediate stage arguments; it aliases y_out
+   // rather than taking a seventh slice of rwk.
+   double* ytmp = y_out;
+
+   // 1) first stage, evaluated at y itself
+   rhs (0.0, y, f1, v_param);
+
+   for (int k = 0; k < neq; k++){
+      f1[k] *= h;
+      ytmp[k] = y[k] + c21 * f1[k];
+   }
+
+   // 2) second stage, evaluated at y + c21*f1
+   rhs(0.0, ytmp, f2, v_param);
+
+   for (int k = 0; k < neq; k++){
+      f2[k] *= h;
+      ytmp[k] = y[k] + c31 * f1[k] + c32 * f2[k];
+   }
+
+   // 3) third stage
+   rhs(0.0, ytmp, f3, v_param);
+
+   for (int k = 0; k < neq; k++) {
+      f3[k] *= h;
+      ytmp[k] = y[k] + c41 * f1[k] + c42 * f2[k] + c43 * f3[k];
+   }
+
+   // 4) fourth stage
+   rhs(0.0, ytmp, f4, v_param);
+
+   for (int k = 0; k < neq; k++) {
+      f4[k] *= h;
+      ytmp[k] = y[k] + c51 * f1[k] + c52 * f2[k] + c53 * f3[k] + c54 * f4[k];
+   }
+
+   // 5) fifth stage
+   rhs(0.0, ytmp, f5, v_param);
+
+   for (int k = 0; k < neq; k++) {
+      f5[k] *= h;
+      ytmp[k] = y[k] + c61*f1[k] + c62*f2[k] + c63*f3[k] + c64*f4[k] + c65*f5[k];
+   }
+
+   // 6) sixth stage; it only enters the 5th-order combination below
+   rhs(0.0, ytmp, f6, v_param);
+
+   for (int k = 0; k < neq; k++)
+   {
+      // scale the last stage by h, like f1..f5 above
+      f6[k] *= h;
+
+      // 5th-order solution.
+      const double r5 = b1*f1[k] + b3*f3[k] + b4*f4[k] + b5*f5[k] + b6*f6[k];
+
+      // 4th-order solution.
+      const double r4 = a1*f1[k] + a3*f3[k] + a4*f4[k] + a5*f5[k];
+
+      // Truncation error: difference between 4th and 5th-order solutions.
+      rwk[k] = fabs(r5 - r4); // rwk is f1, so f1[k] is overwritten after its last use
+
+      // Update solution with the 4th-order increment; the 5th-order value
+      // (local extrapolation) is not used for y_out.
+      y_out[k] = y[k] + r4;
+   }
+
+   return;
+}
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rkf45_h0
+                    (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, double y[], double rwk[], void* v_params) const
+{
+   // Estimate an initial step size h0 for the RKF45 integrator (raw-pointer
+   // variant). Starting from the geometric mean of hmin and hmax, iterate on
+   // a finite-difference estimate of y'' weighted with relTol and absTol.
+   // Returns iter + 1; t_stop is not referenced in this routine.
+   double hg = sqrt(hmin*hmax);
+
+   // rwk is used as three consecutive neq-length vectors: ydot, y1, ydot1.
+   // h0 receives half of the accepted step, clamped to [hmin, hmax].
+   // NOTE(review): hmax < hmin is not special-cased here; with the clamps
+   // at the end of this routine that case yields h0 = hmax.
+   // v_params is forwarded unchanged to rhs.
+
+   // Start iteration to find solution to ... {WRMS norm of (h0^2 y'' / 2)} = 1
+
+   double *ydot  = rwk;
+   double *y1    = ydot + neq;
+   double *ydot1 = y1 + neq;
+
+   const int max_iters = 10;
+   bool hnew_is_ok = false;
+   double hnew = hg;
+   int iter = 0;
+
+   // compute ydot at t=t0
+   rhs (t, y, ydot, v_params);
+
+   while(1)
+   {
+      // Estimate y'' with finite-difference ...
+
+      for (int k = 0; k < neq; k++)
+         y1[k] = y[k] + hg * ydot[k];
+
+      // compute y' at t1
+      rhs (t + hg, y1, ydot1, v_params);
+
+      // Compute WRMS norm of y''
+      double yddnrm = 0.0;
+      for (int k = 0; k < neq; k++){
+         double ydd = (ydot1[k] - ydot[k]) / hg;
+         double wterr = ydd / (relTol * fabs( y[k] ) + absTol);
+         yddnrm += wterr * wterr;
+      }
+
+      yddnrm = sqrt( yddnrm / double(neq) );
+
+      // The acceptance test below runs after the RHS evaluation above, so the
+      // loop evaluates rhs once more for the hg that is finally accepted.
+
+      // should we accept this? (hitting max_iters is reported on stderr)
+      if (hnew_is_ok || iter == max_iters){
+         hnew = hg;
+         if (iter == max_iters)
+            fprintf(stderr, "ERROR_HIN_MAX_ITERS\n");
+         break;
+      }
+
+      // Get the new value of h ...
+      hnew = (yddnrm*hmax*hmax > 2.0) ? sqrt(2.0 / yddnrm) : sqrt(hg * hmax);
+
+      // test the stopping conditions.
+      double hrat = hnew / hg;
+
+      // Accept this value ... the bias factor should bring it within range.
+      if ( (hrat > 0.5) && (hrat < 2.0) )
+         hnew_is_ok = true;
+
+      // If y'' is still bad after a few iterations, just accept h and give up.
+      if ( (iter > 1) && hrat > 2.0 ) {
+         hnew = hg;
+         hnew_is_ok = true;
+      }
+
+      // carry the candidate step into the next pass of the loop
+
+      hg = hnew;
+      iter ++;
+   }
+
+   // bound and bias estimate
+   h0 = hnew * 0.5;
+   h0 = fmax(h0, hmin);
+   h0 = fmin(h0, hmax);
+   // h0 is the only output besides the return value; y is only read here.
+
+   return (iter + 1);
+}
+
+// Adaptive RKF45 integration of y from t = 0 to t_stop (raw-pointer variant).
+// neq: number of equations; y: state, overwritten on each accepted step.
+// rwork: scratch; rwork[0..neq) receives the trial solution, and the stage
+//        and error storage handed to rkf45_step starts at rwork + neq.
+// counter: nSteps, nIters and nFuncs are accumulated; nFails is incremented
+//          when more than maxIters iterations are needed (y then keeps the
+//          last accepted state and t_stop has not been reached).
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::rkf45(const int neq, const double t_stop, double *y, double *rwork, void *v_param, CounterType& counter) const
+{
+  // Rounding coefficient.
+  const double uround = DBL_EPSILON;
+
+  // Adaption limit (shrink or grow)
+  const double adaption_limit = 4.0;
+
+  // Safety factor on the adaption. very specific but not necessary .. 0.9 is common.
+  const double hsafe = 0.840896415;
+
+  // Time rounding factor.
+  const double tround = t_stop * uround;
+
+  // Counters for diagnostics.
+  int nst = 0; // # of steps (accepted)
+  int nit = 0; // # of iterations total
+  int nfe = 0; // # of RHS evaluations
+
+  // Min/Max step-size limits.
+  const double h_min = 100.0 * tround;
+  const double h_max = (minSteps > 0) ? t_stop / double(minSteps) : t_stop;
+
+  // Set the initial step-size. 0 forces an internal estimate ... stable Euler step size.
+  double h = (minSteps > 0) ? t_stop / double(minSteps) : 0.0;
+
+  double t = 0.0;
+
+  if (h < h_min){ // no usable initial step: let rkf45_h0 estimate one
+    nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, rwork, v_param);
+  }
+
+  // Integrate until we reach the end time.
+  while (fabs(t - t_stop) > tround){
+    double *yout = rwork;
+    double *eout = yout + neq;
+
+    // Take a trial step.
+    rkf45_step (neq, h, y, yout, eout, v_param);
+
+    // Estimate the solution error ...
+    // ... weighted RMS norm over the neq components that are summed.
+    double err2 = 0.0;
+    for (int k = 0; k < neq; k++){
+      const double wterr = eout[k] / (relTol * fabs( y[k] ) + absTol);
+      err2 += wterr * wterr;
+    }
+
+    double err = fmax( uround, sqrt( err2 / double(neq) ));
+
+    // Accept the solution?
+    if (err <= 1.0 || h <= h_min){
+      t += h;
+      nst++;
+
+      for (int k = 0; k < neq; k++)
+        y[k] = yout[k];
+    }
+
+    // Adjust h for the next step.
+    double hfac = hsafe * sqrt( sqrt( 1.0 / err ) );
+
+    // Limit the adaption.
+    hfac = fmax( hfac, 1.0 / adaption_limit );
+    hfac = fmin( hfac,       adaption_limit );
+
+    // Apply the adaption factor...
+    h *= hfac;
+
+    // Limit h.
+    h = fmin( h, h_max );
+    h = fmax( h, h_min );
+
+    // Stretch h if we're within 5% ... and we didn't just fail.
+    if (err <= 1.0 && (t + 1.05*h) > t_stop)
+      h = t_stop - t;
+
+    // And don't overshoot the end.
+    if (t + h > t_stop)
+      h = t_stop - t;
+
+    nit++;
+    nfe += 6;
+
+    if (maxIters && nit > maxIters){
+      // NOTE: only nFails records this; an error should be set so that the solution is not used!
+      counter.nFails ++;
+      break;
+    }
+
+  } // end while
+
+  counter.nSteps += nst;
+  counter.nIters += nit;
+  counter.nFuncs += nfe;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rhs(double t, const double *y, double *dydt, void *params) const
+{
+  // Forward to the sparse or the dense right-hand side, depending on
+  // useSparseKinetics, and hand back that routine's return value.
+  // All four arguments are passed through unchanged.
+  return useSparseKinetics ? this->rhs_sparse(t, y, dydt, params)
+                           : this->rhs_dense (t, y, dydt, params);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rhs_dense(double t, const double *y, double *dydt, void *params) const
+{
+  // Dense right-hand side: fills dydt[0..nspecies) from the amounts in y and
+  // the forward rate constants reached through params (cast to UserRHSData).
+  // The rate-law array in params is overwritten; t is unused; returns 0.
+  UserRHSData *const rhsData = static_cast<UserRHSData *>(params);
+  double *const rateConst = rhsData->kFor;
+  double *const rateLaw   = rhsData->rxnRateLaw;
+
+  // Clear the derivative of every species before accumulating.
+  for (int s = 0; s < nspecies; ++s) {
+    dydt[s] = 0.0;
+  }
+
+  // Rate law per reaction: kFor times the product over all species of
+  // (y / VDPD) raised to the matching stoichReactants entry.
+  for (int r = 0; r < nreactions; ++r) {
+    double forwardRate = rateConst[r];
+    for (int s = 0; s < nspecies; ++s) {
+      const double conc_s = y[s] / VDPD;
+      forwardRate *= pow(conc_s, d_kineticsData.stoichReactants(r, s));
+    }
+    rateLaw[r] = forwardRate;
+  }
+
+  // Species rates: sum over reactions of stoich * VDPD * rate law, added
+  // in increasing reaction index for each species.
+  for (int s = 0; s < nspecies; ++s) {
+    for (int r = 0; r < nreactions; ++r) {
+      dydt[s] += d_kineticsData.stoich(r, s) * VDPD * rateLaw[r];
+    }
+  }
+
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::rhs_sparse(double t, const double *y, double *dydt, void *v_params) const
+{
+   UserRHSData *userData = (UserRHSData *) v_params;
+   // Sparse right-hand side: fills dydt[0..nspecies) and returns 0; t is unused.
+   // The macros below are local shorthands (undefined again before returning);
+   // kRev expands to NULL and is not used in this routine.
+   #define kFor         (userData->kFor)
+   #define kRev         (NULL)
+   #define rxnRateLaw   (userData->rxnRateLaw)
+   #define conc         (dydt)
+   #define maxReactants (this->sparseKinetics_maxReactants)
+   #define maxSpecies   (this->sparseKinetics_maxSpecies)
+   #define nuk          (this->d_kineticsData.nuk)
+   #define nu           (this->d_kineticsData.nu)
+   #define inu          (this->d_kineticsData.inu)
+   #define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
+                             && this->d_kineticsData.isIntegral(idx) )
+   // Concentrations y/VDPD are staged in dydt itself (conc is dydt).
+   for (int k = 0; k < nspecies; ++k)
+      conc[k] = y[k] / VDPD;
+
+   // Construct the reaction rate laws
+   for (int i = 0; i < nreactions; ++i)
+   {
+      double rxnRateLawForward;
+      if (isIntegral(i)){
+         rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) ); // slot 0 is used without an invalid-index check
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            // valid reactant slot: multiply in its factor (exponent from inu)
+               rxnRateLawForward *= powint( conc[k], inu(i,kk) );
+         }
+      } else {
+         rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) ); // slot 0 is used without an invalid-index check
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            // valid reactant slot: multiply in its factor (exponent from nu)
+               rxnRateLawForward *= pow( conc[k], nu(i,kk) );
+         }
+      }
+
+      rxnRateLaw[i] = rxnRateLawForward;
+   }
+
+   // All rate laws are built, so dydt stops serving as conc: it is zeroed
+   // and then accumulates the species rates from nu and rxnRateLaw.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] = 0.0;
+
+   for (int i = 0; i < nreactions; ++i){
+      // Reactants ... slots [0, maxReactants), subtracted
+      dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
+      for (int kk = 1; kk < maxReactants; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         // valid reactant slot: it consumes species k
+            dydt[k] -= nu(i,kk) * rxnRateLaw[i];
+      }
+
+      // Products ... slots [maxReactants, maxSpecies), added
+      dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
+      for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         // valid product slot: it produces species k
+            dydt[k] += nu(i,kk) * rxnRateLaw[i];
+      }
+   }
+
+   // Add in the volume factor to convert to the proper units.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] *= VDPD;
+
+   #undef kFor
+   #undef kRev
+   #undef rxnRateLaw
+   #undef conc
+   #undef maxReactants
+   #undef maxSpecies
+   #undef nuk
+   #undef nu
+   #undef inu
+   #undef isIntegral
+   // NOTE(review): first reactant/product slots presumably always valid - confirm.
+
+   return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{
+  // Forward to the dense right-hand side unless useSparseKinetics selects
+  // the sparse one; the chosen routine's return value is handed back.
+  if (!useSparseKinetics)
+    return this->k_rhs_dense(t, y, dydt, userData);
+  return this->k_rhs_sparse(t, y, dydt, userData);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs_dense(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{
+  // Dense right-hand side (view variant): fills dydt[0..nspecies) from the
+  // amounts in y and the forward rate constants in userData.kFor.
+  // userData.rxnRateLaw is overwritten with the per-reaction rate laws.
+  // t is unused; the routine always returns 0.
+
+  // Clear the derivative of every species before accumulating.
+  for (int s = 0; s < nspecies; ++s) {
+    dydt[s] = 0.0;
+  }
+
+  // Rate law per reaction: kFor times the product over all species of
+  // (y / VDPD) raised to the matching stoichReactants entry.
+  for (int r = 0; r < nreactions; ++r) {
+    double forwardRate = userData.kFor[r];
+
+    // Every species contributes a factor, in increasing species index.
+    for (int s = 0; s < nspecies; ++s) {
+      const double conc_s = y[s] / VDPD;
+      forwardRate *= pow(conc_s, d_kineticsData.stoichReactants(r, s));
+    }
+    userData.rxnRateLaw[r] = forwardRate;
+  }
+
+  // Species rates: sum over reactions of stoich * VDPD * rate law, added
+  // in increasing reaction index for each species.
+  for (int s = 0; s < nspecies; ++s) {
+    for (int r = 0; r < nreactions; ++r) {
+      // same left-to-right product as written: (stoich * VDPD) * rate law
+      dydt[s] += d_kineticsData.stoich(r, s) * VDPD * userData.rxnRateLaw[r];
+    }
+  }
+
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+  template <typename VectorType, typename UserDataType>
+int FixRxKokkos<DeviceType>::k_rhs_sparse(double t, const VectorType& y, VectorType& dydt, UserDataType& userData) const
+{
+   #define kFor         (userData.kFor)
+   #define kRev         (NULL)
+   #define rxnRateLaw   (userData.rxnRateLaw)
+   #define conc         (dydt)
+   #define maxReactants (this->sparseKinetics_maxReactants)
+   #define maxSpecies   (this->sparseKinetics_maxSpecies)
+   #define nuk          (this->d_kineticsData.nuk)
+   #define nu           (this->d_kineticsData.nu)
+   #define inu          (this->d_kineticsData.inu)
+   #define isIntegral(idx) ( SparseKinetics_enableIntegralReactions \
+                             && this->d_kineticsData.isIntegral(idx) )
+   // Sparse right-hand side (view variant): fills dydt, returns 0; t and kRev are unused.
+   for (int k = 0; k < nspecies; ++k)
+      conc[k] = y[k] / VDPD;
+   // Note: conc is dydt, so the concentrations above are staged in dydt itself.
+   // Construct the reaction rate laws
+   for (int i = 0; i < nreactions; ++i)
+   {
+      double rxnRateLawForward;
+      if (isIntegral(i)){
+         rxnRateLawForward = kFor[i] * powint( conc[ nuk(i,0) ], inu(i,0) ); // slot 0 is used without an invalid-index check
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            // valid reactant slot: multiply in its factor (exponent from inu)
+               rxnRateLawForward *= powint( conc[k], inu(i,kk) );
+         }
+      } else {
+         rxnRateLawForward = kFor[i] * pow( conc[ nuk(i,0) ], nu(i,0) ); // slot 0 is used without an invalid-index check
+         for (int kk = 1; kk < maxReactants; ++kk){
+            const int k = nuk(i,kk);
+            if (k == SparseKinetics_invalidIndex) break;
+            // valid reactant slot: multiply in its factor (exponent from nu)
+               rxnRateLawForward *= pow( conc[k], nu(i,kk) );
+         }
+      }
+
+      rxnRateLaw[i] = rxnRateLawForward;
+   }
+
+   // All rate laws are built, so dydt stops serving as conc: it is zeroed
+   // and then accumulates the species rates from nu and rxnRateLaw.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] = 0.0;
+
+   for (int i = 0; i < nreactions; ++i){
+      // Reactants ... slots [0, maxReactants), subtracted
+      dydt[ nuk(i,0) ] -= nu(i,0) * rxnRateLaw[i];
+      for (int kk = 1; kk < maxReactants; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         // valid reactant slot: it consumes species k
+            dydt[k] -= nu(i,kk) * rxnRateLaw[i];
+      }
+
+      // Products ... slots [maxReactants, maxSpecies), added
+      dydt[ nuk(i,maxReactants) ] += nu(i,maxReactants) * rxnRateLaw[i];
+      for (int kk = maxReactants+1; kk < maxSpecies; ++kk){
+         const int k = nuk(i,kk);
+         if (k == SparseKinetics_invalidIndex) break;
+         // valid product slot: it produces species k
+            dydt[k] += nu(i,kk) * rxnRateLaw[i];
+      }
+   }
+
+   // Add in the volume factor to convert to the proper units.
+   for (int k = 0; k < nspecies; ++k)
+      dydt[k] *= VDPD;
+
+   #undef kFor
+   #undef kRev
+   #undef rxnRateLaw
+   #undef conc
+   #undef maxReactants
+   #undef maxSpecies
+   #undef nuk
+   #undef nu
+   #undef inu
+   #undef isIntegral
+   // NOTE(review): first reactant/product slots presumably always valid - confirm.
+
+   return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*template <typename DeviceType>
+  template <typename SolverType>
+    KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(SolverType, const int &i) const
+{
+  if (atom->mask[i] & groupbit)
+  {
+    double *rwork = new double[8*nspecies];
+
+    UserRHSData userData;
+    userData.kFor = new double[nreactions];
+    userData.rxnRateLaw = new double[nreactions];
+
+    int ode_counter[4] = { 0 };
+
+    const double theta = (localTempFlag) ? dpdThetaLocal[i] : atom->dpdTheta[i];
+
+    //Compute the reaction rate constants
+    for (int irxn = 0; irxn < nreactions; irxn++)
+    {
+      if (SolverType::setToZero)
+        userData.kFor[irxn] = 0.0;
+      else
+        userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/force->boltz/theta);
+    }
+
+    if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+      rk4(i, rwork, &userData);
+    else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+      rkf45(i, rwork, &userData, ode_counter);
+
+    delete [] rwork;
+    delete [] userData.kFor;
+    delete [] userData.rxnRateLaw;
+  }
+} */
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::create_kinetics_data(void)
+{
+  // Allocate the host (h_kineticsData) and device (d_kineticsData) kinetics
+  // tables, fill the host side from the member arrays, mirror each table to
+  // the device with deep_copy, and finally clear update_kinetics_data.
+
+  // Per-reaction coefficient arrays Arr, nArr and Ea.
+  memory->create_kokkos( d_kineticsData.Arr, h_kineticsData.Arr, nreactions, "KineticsType::Arr");
+  memory->create_kokkos( d_kineticsData.nArr, h_kineticsData.nArr, nreactions, "KineticsType::nArr");
+  memory->create_kokkos( d_kineticsData.Ea, h_kineticsData.Ea, nreactions, "KineticsType::Ea");
+
+  for (int rxn = 0; rxn < nreactions; ++rxn) {
+    h_kineticsData.Arr[rxn]  = Arr[rxn];
+    h_kineticsData.nArr[rxn] = nArr[rxn];
+    h_kineticsData.Ea[rxn]   = Ea[rxn];
+  }
+
+  // Mirror the three coefficient arrays to the device.
+  Kokkos::deep_copy( d_kineticsData.Arr, h_kineticsData.Arr );
+  Kokkos::deep_copy( d_kineticsData.nArr, h_kineticsData.nArr );
+  Kokkos::deep_copy( d_kineticsData.Ea, h_kineticsData.Ea );
+
+  if (useSparseKinetics) {
+    // Sparse tables nu and nuk: nreactions x sparseKinetics_maxSpecies.
+    memory->create_kokkos( d_kineticsData.nu , h_kineticsData.nu , nreactions, sparseKinetics_maxSpecies, "KineticsType::nu");
+    memory->create_kokkos( d_kineticsData.nuk, h_kineticsData.nuk, nreactions, sparseKinetics_maxSpecies, "KineticsType::nuk");
+
+    for (int rxn = 0; rxn < nreactions; ++rxn) {
+      for (int slot = 0; slot < sparseKinetics_maxSpecies; ++slot) {
+        h_kineticsData.nu (rxn,slot) = sparseKinetics_nu [rxn][slot];
+        h_kineticsData.nuk(rxn,slot) = sparseKinetics_nuk[rxn][slot];
+      }
+    }
+
+    // Mirror the sparse tables to the device.
+    Kokkos::deep_copy( d_kineticsData.nu, h_kineticsData.nu );
+    Kokkos::deep_copy( d_kineticsData.nuk, h_kineticsData.nuk );
+
+    if (SparseKinetics_enableIntegralReactions) {
+      // Table inu plus the per-reaction isIntegral flag.
+      memory->create_kokkos( d_kineticsData.inu, h_kineticsData.inu, nreactions, sparseKinetics_maxSpecies, "KineticsType::inu");
+      memory->create_kokkos( d_kineticsData.isIntegral, h_kineticsData.isIntegral, nreactions, "KineticsType::isIntegral");
+
+      for (int rxn = 0; rxn < nreactions; ++rxn) {
+        h_kineticsData.isIntegral(rxn) = sparseKinetics_isIntegralReaction[rxn];
+        for (int slot = 0; slot < sparseKinetics_maxSpecies; ++slot)
+          h_kineticsData.inu(rxn,slot) = sparseKinetics_inu[rxn][slot];
+      }
+
+      // Mirror inu and isIntegral to the device.
+      Kokkos::deep_copy( d_kineticsData.inu, h_kineticsData.inu );
+      Kokkos::deep_copy( d_kineticsData.isIntegral, h_kineticsData.isIntegral );
+    }
+  }
+
+  // Dense tables (stoich, stoichReactants, stoichProducts) are built
+  // unconditionally, i.e. also when the sparse tables above exist.
+  memory->create_kokkos( d_kineticsData.stoich, h_kineticsData.stoich, nreactions, nspecies, "KineticsType::stoich");
+  memory->create_kokkos( d_kineticsData.stoichReactants, h_kineticsData.stoichReactants, nreactions, nspecies, "KineticsType::stoichReactants");
+  memory->create_kokkos( d_kineticsData.stoichProducts, h_kineticsData.stoichProducts, nreactions, nspecies, "KineticsType::stoichProducts");
+
+  for (int rxn = 0; rxn < nreactions; ++rxn) {
+    for (int sp = 0; sp < nspecies; ++sp) {
+      h_kineticsData.stoich(rxn,sp) = stoich[rxn][sp];
+      h_kineticsData.stoichReactants(rxn,sp) = stoichReactants[rxn][sp];
+      h_kineticsData.stoichProducts(rxn,sp) = stoichProducts[rxn][sp];
+    }
+  }
+
+  // Mirror the dense tables to the device.
+  Kokkos::deep_copy( d_kineticsData.stoich, h_kineticsData.stoich );
+  Kokkos::deep_copy( d_kineticsData.stoichReactants, h_kineticsData.stoichReactants );
+  Kokkos::deep_copy( d_kineticsData.stoichProducts, h_kineticsData.stoichProducts );
+
+  // Host and device tables now match the member arrays.
+  update_kinetics_data = false;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::setup_pre_force(int vflag)
+{
+  // If my_restartFlag is set, clear it and skip the solve; else solve with false.
+  if (my_restartFlag) {
+    my_restartFlag = 0;
+    return;
+  }
+  this->solve_reactions(vflag, false);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::pre_force(int vflag)
+{
+  // Always runs the reaction solve, forwarding vflag and passing true as
+  // the second argument.
+  this->solve_reactions(vflag, true);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kokkos parallel_for body: resets the per-ODE diagnostic counters of entry i.
+template <typename DeviceType>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_zeroCounterViews, const int& i) const
+{
+  // The two views are independent; each entry is simply cleared.
+  d_diagnosticCounterPerODEnFuncs(i) = 0;
+  d_diagnosticCounterPerODEnSteps(i) = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kokkos parallel_reduce body: integrates the reaction ODE system of atom i
+// over the interval t_stop and writes the result back into d_dvector.
+//
+// ZERO_RATES  when true, every forward rate constant is set to 0.0 instead of
+//             being evaluated from Arr * theta^nArr * exp(-Ea / boltz / theta).
+// i           atom index; atoms whose mask does not match groupbit are skipped.
+// counter     reduction value; this atom's solver counters are added to it.
+template <typename DeviceType>
+  template <bool ZERO_RATES>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES>, const int& i, CounterType& counter) const
+{
+  // Atoms outside the fix group are left untouched.
+  if (!(d_mask(i) & groupbit))
+    return;
+
+  // Carve this atom's slice out of the shared scratch buffer:
+  //   y      -> first nspecies entries (the ODE solution vector)
+  //   rwork  -> starts right after y
+  //   kFor / rxnRateLaw -> placed 7*nspecies entries into rwork, nreactions apart
+  StridedArrayType<double,1> y( d_scratchSpace.ptr_on_device() + scratchSpaceSize * i );
+  StridedArrayType<double,1> rwork( &y[nspecies] );
+
+  UserRHSDataKokkos<1> userData;
+  userData.kFor.m_data = &( rwork[7*nspecies] );
+  userData.rxnRateLaw.m_data = &( userData.kFor[ nreactions ] );
+
+  // Solver statistics for this atom only; folded into the reduction at the end.
+  CounterType atomCounters;
+
+  // Temperature used in the rate expression: local average if enabled.
+  const double theta = (localTempFlag) ? d_dpdThetaLocal(i) : d_dpdTheta(i);
+
+  // Forward rate constants. ZERO_RATES is a compile-time constant, so the
+  // branch is taken once outside the loop.
+  if (ZERO_RATES)
+  {
+    for (int rxn = 0; rxn < nreactions; rxn++)
+      userData.kFor[rxn] = 0.0;
+  }
+  else
+  {
+    for (int rxn = 0; rxn < nreactions; rxn++)
+    {
+      const double preExponential = d_kineticsData.Arr(rxn) *
+                                    pow(theta, d_kineticsData.nArr(rxn));
+      userData.kFor[rxn] = preExponential *
+                           exp(-d_kineticsData.Ea(rxn) / boltz / theta);
+    }
+  }
+
+  // Save the current concentrations into the second half of dvector
+  // (rows nspecies..2*nspecies-1) and seed the solution vector with them.
+  for (int s = 0; s < nspecies; s++)
+  {
+    const double conc = d_dvector(s, i);
+    d_dvector(s+nspecies, i) = conc;
+    y[s] = conc;
+  }
+
+  // Integrate the ODE system with the selected scheme.
+  if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+  {
+    k_rk4(t_stop, y, rwork, userData);
+  }
+  else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+  {
+    k_rkf45(nspecies, t_stop, y, rwork, userData, atomCounters);
+
+    // Per-ODE counters are only recorded when diagnostics run every step.
+    if (diagnosticFrequency == 1)
+    {
+      d_diagnosticCounterPerODEnSteps(i) = atomCounters.nSteps;
+      d_diagnosticCounterPerODEnFuncs(i) = atomCounters.nFuncs;
+    }
+  }
+
+  // Copy the solution back. Clearly negative values raise the error flag
+  // (checked on the host after the kernel); tiny values are clamped to zero.
+  for (int s = 0; s < nspecies; s++)
+  {
+    if (y[s] < -1.0e-10)
+    {
+      // Plain (non-atomic) store; every writer stores the same value.
+      k_error_flag.template view<DeviceType>()() = 2;
+    }
+    else if (y[s] < MY_EPSILON)
+      y[s] = 0.0;
+
+    d_dvector(s,i) = y[s];
+  }
+
+  // Contribute this atom's statistics to the reduction value.
+  counter += atomCounters;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Advance the reaction ODE system of every local atom in the fix group by one
+// DPD time step (t_stop = update->dt) and forward-communicate the result.
+//
+// Steps performed:
+//   1. (Re)build the device kinetics tables if update_kinetics_data is set.
+//   2. If localTempFlag is set, compute the locally averaged temperature.
+//   3. Run one ODE solve per atom via a Kokkos parallel_reduce over
+//      Tag_FixRxKokkos_solveSystems, accumulating solver counters.
+//   4. Abort if any atom reported a concentration below -1.0e-10.
+//   5. Sync dvector to the host and forward-communicate it.
+//   6. Accumulate and optionally report RKF45 diagnostics.
+//
+// vflag       not referenced by this routine.
+// isPreForce  false -> forward rate constants are forced to zero (setup pass);
+//             true  -> rate constants are evaluated from the kinetics data.
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::solve_reactions(const int vflag, const bool isPreForce)
+{
+  // *this is copied into the Kokkos kernels launched below.
+  copymode = 1;
+
+  if (update_kinetics_data)
+    create_kinetics_data();
+
+  // timer_start / timer_stop bracket the whole routine; they are only useful
+  // for ad-hoc timing printouts and are otherwise unused.
+  TimerType timer_start = getTimeStamp();
+
+  this->nlocal = atom->nlocal;
+  const int nghost = atom->nghost;
+  const int newton_pair = force->newton_pair;
+
+  // Set the forward rates to zero if acting as setup_pre_force.
+  const bool setRatesToZero = (isPreForce == false);
+
+  if (localTempFlag)
+  {
+    // Ghost entries are needed only when contributions are accumulated on
+    // ghosts and reverse-communicated (newton_pair on).
+    const int count = nlocal + (newton_pair ? nghost : 0);
+
+    // Grow-only storage for the local temperature.
+    if (count > k_dpdThetaLocal.template view<DeviceType>().dimension_0()) {
+      memory->destroy_kokkos (k_dpdThetaLocal, dpdThetaLocal);
+      memory->create_kokkos (k_dpdThetaLocal, dpdThetaLocal, count, "FixRxKokkos::dpdThetaLocal");
+      this->d_dpdThetaLocal = k_dpdThetaLocal.template view<DeviceType>();
+      this->h_dpdThetaLocal = k_dpdThetaLocal.h_view;
+    }
+
+    const int neighflag = lmp->kokkos->neighflag;
+
+    // Map the run-time (neighflag, newton_pair) pair onto the compile-time
+    // template arguments of computeLocalTemperature. Every branch is braced
+    // so the if/else pairing is explicit, and an unrecognized neighbor-list
+    // style is reported instead of silently skipping the computation (which
+    // would leave d_dpdThetaLocal uncomputed while the solver still reads it).
+#define FIXRX_TEMPLATE_SWITCH(WTFLAG, TEMPFLAG) { \
+       if (neighflag == HALF) { \
+          if (newton_pair) { \
+             computeLocalTemperature<WTFLAG, TEMPFLAG, true , HALF> (); \
+          } else { \
+             computeLocalTemperature<WTFLAG, TEMPFLAG, false, HALF> (); \
+          } \
+       } else if (neighflag == HALFTHREAD) { \
+          if (newton_pair) { \
+             computeLocalTemperature<WTFLAG, TEMPFLAG, true , HALFTHREAD> (); \
+          } else { \
+             computeLocalTemperature<WTFLAG, TEMPFLAG, false, HALFTHREAD> (); \
+          } \
+       } else if (neighflag == FULL) { \
+          if (newton_pair) { \
+             computeLocalTemperature<WTFLAG, TEMPFLAG, true , FULL> (); \
+          } else { \
+             computeLocalTemperature<WTFLAG, TEMPFLAG, false, FULL> (); \
+          } \
+       } else { \
+          error->one(FLERR,"Unsupported neighbor list style for local temperature in fix rx/kk"); \
+       } \
+    }
+
+    // Only the LUCY weight function is dispatched here; the averaging mode is
+    // HARMONIC when requested and NONE otherwise.
+    if (localTempFlag == HARMONIC) {
+       FIXRX_TEMPLATE_SWITCH(LUCY, HARMONIC)
+    }
+    else {
+       FIXRX_TEMPLATE_SWITCH(LUCY, NONE)
+    }
+#undef FIXRX_TEMPLATE_SWITCH
+  }
+
+  TimerType timer_localTemperature = getTimeStamp();
+
+  // Total counters from the ODE solvers.
+  CounterType TotalCounters;
+
+  // Cache the atom views as members so the tagged operators can reach them.
+  this->d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+  this->d_dvector  = atomKK->k_dvector.view<DeviceType>();
+  this->d_mask     = atomKK->k_mask.view<DeviceType>();
+
+  // Get up-to-date data.
+  atomKK->sync( execution_space, MASK_MASK | DVECTOR_MASK | DPDTHETA_MASK );
+
+  // Constants read inside the kernels.
+  this->boltz = force->boltz;
+  this->t_stop = update->dt; // DPD time-step and integration length.
+
+  // Average DPD volume. Used in the RHS function.
+  this->VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+
+  // Per-ODE counters are only kept when diagnostics are reported every step;
+  // odeDiagnostics() releases them again.
+  if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
+  {
+    memory->create_kokkos (k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps, nlocal, "FixRxKokkos::diagnosticCounterPerODEnSteps");
+    memory->create_kokkos (k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs, nlocal, "FixRxKokkos::diagnosticCounterPerODEnFuncs");
+
+    d_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.template view<DeviceType>();
+    d_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.template view<DeviceType>();
+
+    Kokkos::parallel_for ( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroCounterViews>(0,nlocal), *this);
+  }
+
+  // Initialize and sync the device error flag.
+  k_error_flag.h_view() = 0;
+  k_error_flag.template modify<LMPHostType>();
+  k_error_flag.template sync<DeviceType>();
+
+  // Per-atom scratch: y (nspecies) + integrator work space (7*nspecies)
+  // + kFor and rxnRateLaw (nreactions each). Storage is grow-only.
+  this->scratchSpaceSize = (8*nspecies + 2*nreactions);
+
+  if (nlocal*scratchSpaceSize > d_scratchSpace.dimension_0()) {
+    memory->destroy_kokkos (d_scratchSpace);
+    memory->create_kokkos (d_scratchSpace, nlocal*scratchSpaceSize, "FixRxKokkos::d_scratchSpace");
+  }
+
+  // Solve one ODE system per atom; the ZERO_RATES choice is made at compile
+  // time through the tag's template argument.
+  if (setRatesToZero)
+    Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<true > >(0,nlocal), *this, TotalCounters);
+  else
+    Kokkos::parallel_reduce( Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_solveSystems<false> >(0,nlocal), *this, TotalCounters);
+
+  TimerType timer_ODE = getTimeStamp();
+
+  // Check the error flag for any failures.
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view() == 2)
+    error->one(FLERR,"Computed concentration in RK solver is < -1.0e-10");
+
+  // Signal that dvector has been modified on this execution space.
+  atomKK->modified( execution_space, DVECTOR_MASK );
+
+  // Communicate the updated species data to all nodes. The pack/unpack
+  // callbacks work on the host view, hence the host sync/modified pair.
+  atomKK->sync ( Host, DVECTOR_MASK );
+
+  comm->forward_comm_fix(this);
+
+  atomKK->modified ( Host, DVECTOR_MASK );
+
+  TimerType timer_stop = getTimeStamp();
+
+  double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
+
+  // Warn the user if a failure was detected in the ODE solver.
+  // The message is a fixed string plus one %d, so it fits in sbuf.
+  if (TotalCounters.nFails > 0){
+    char sbuf[128];
+    sprintf(sbuf,"in FixRX::pre_force, ODE solver failed for %d atoms.", TotalCounters.nFails);
+    error->warning(FLERR, sbuf);
+  }
+
+  // Compute and report ODE diagnostics, if requested.
+  if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency != 0)
+  {
+    // Update the counters; the last slot counts the accumulated calls.
+    diagnosticCounter[StepSum] += TotalCounters.nSteps;
+    diagnosticCounter[FuncSum] += TotalCounters.nFuncs;
+    diagnosticCounter[TimeSum] += time_ODE;
+    diagnosticCounter[AtomSum] += nlocal;
+    diagnosticCounter[numDiagnosticCounters-1] ++;
+
+    // Positive frequency: report every diagnosticFrequency steps.
+    // Negative frequency: report once, on the last step of the run.
+    if ( (diagnosticFrequency > 0 &&
+               ((update->ntimestep - update->firststep) % diagnosticFrequency) == 0) ||
+         (diagnosticFrequency < 0 && update->ntimestep == update->laststep) )
+      this->odeDiagnostics();
+  }
+
+  copymode = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reduce and print the accumulated ODE-solver diagnostics, then reset them.
+//
+// Reads diagnosticCounter[] (accumulated by solve_reactions), averages it over
+// the number of accumulated calls, combines the values across MPI ranks and
+// prints a summary to screen and logfile on rank 0. When diagnosticFrequency
+// is 1, the per-ODE step/function-evaluation counters are also copied from the
+// device, reduced (RMS / max / min) and then released.
+//
+// This routine issues MPI collectives on `world`, so every rank must call it.
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::odeDiagnostics(void)
+{
+  TimerType timer_start = getTimeStamp();
+
+  // Compute:
+  // 1) Average # of ODE integrator steps and RHS evaluations per atom globally.
+  // 2) RMS     # of  ...
+  // 3) Average # of ODE steps and RHS evaluations per MPI task.
+  // 4) RMS     # of ODE steps and RHS evaluations per MPI task.
+  // 5) MAX     # of ODE steps and RHS evaluations per MPI task.
+  //
+  // ... 1,2 are for ODE control diagnostics.
+  // ... 3-5 are for load balancing diagnostics.
+  //
+  // To do this, we'll need to
+  // a) Allreduce (sum) the sum of nSteps / nFuncs. Dividing by atom->natoms
+  //    gives the avg # of steps/funcs per atom globally.
+  // b) Reduce (sum) to root the sum of squares of the differences.
+  //    i) Sum_i (steps_i - avg_steps_global)^2
+  //   ii) Sum_i (funcs_i - avg_funcs_global)^2
+  //  iii) (avg_steps_local - avg_steps_global)^2
+  //   iv) (avg_funcs_local - avg_funcs_global)^2
+
+  // All slots except the last one are data counters; the last slot holds the
+  // number of solve_reactions calls accumulated since the previous reset.
+  const int numCounters = numDiagnosticCounters-1;
+
+  // # of time-steps for averaging.
+  const int nTimes = this->diagnosticCounter[numDiagnosticCounters-1];
+
+  // Global sums (valid on all ranks) and per-rank extrema (valid on rank 0 only).
+  double sums[numCounters];
+  double my_vals[numCounters];
+  double max_per_proc[numCounters];
+  double min_per_proc[numCounters];
+
+  // Compute counters per dpd time-step.
+  for (int i = 0; i < numCounters; ++i){
+    my_vals[i] = this->diagnosticCounter[i] / nTimes;
+  }
+
+  MPI_Allreduce (my_vals, sums, numCounters, MPI_DOUBLE, MPI_SUM, world);
+
+  MPI_Reduce (my_vals, max_per_proc, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
+  MPI_Reduce (my_vals, min_per_proc, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
+
+  // NOTE(review): this takes the last averaged counter as the global number of
+  // ODEs per step; presumably that slot is AtomSum -- confirm against the enum.
+  const double nODEs = sums[numCounters-1];
+
+  double avg_per_atom[numCounters], avg_per_proc[numCounters];
+
+  // Averages per-ODE and per-proc per time-step.
+  for (int i = 0; i < numCounters; ++i){
+    avg_per_atom[i] = sums[i] / nODEs;
+    avg_per_proc[i] = sums[i] / comm->nprocs;
+  }
+
+  // Sum up the differences from each task.
+  // Layout: [0, numCounters)              squared deviation of this rank from the per-proc mean
+  //         [numCounters, 2*numCounters)  per-ODE squared deviations (only filled when diagnosticFrequency == 1)
+  double sum_sq[2*numCounters];
+  double my_sum_sq[2*numCounters];
+  for (int i = 0; i < numCounters; ++i){
+    double diff_i = my_vals[i] - avg_per_proc[i];
+    my_sum_sq[i] = diff_i * diff_i;
+  }
+
+  // Only written (on rank 0) and read when diagnosticFrequency == 1.
+  double max_per_ODE[numCounters], min_per_ODE[numCounters];
+
+  // Process the per-ODE RMS of the # of steps/funcs
+  if (diagnosticFrequency == 1)
+  {
+    h_diagnosticCounterPerODEnSteps = k_diagnosticCounterPerODEnSteps.h_view;
+    h_diagnosticCounterPerODEnFuncs = k_diagnosticCounterPerODEnFuncs.h_view;
+
+    // Bring the per-ODE counters written by the solve kernel to the host.
+    Kokkos::deep_copy( h_diagnosticCounterPerODEnSteps, d_diagnosticCounterPerODEnSteps );
+    Kokkos::deep_copy( h_diagnosticCounterPerODEnFuncs, d_diagnosticCounterPerODEnFuncs );
+
+    double my_max[numCounters], my_min[numCounters];
+
+    // Note: this assigns the nlocal member, not a local variable.
+    nlocal = atom->nlocal;
+    HAT::t_int_1d h_mask = atomKK->k_mask.h_view;
+
+    for (int i = 0; i < numCounters; ++i)
+    {
+      my_sum_sq[i+numCounters] = 0;
+      my_max[i] = 0;
+      my_min[i] = DBL_MAX;
+    }
+
+    // Only the StepSum and FuncSum slots are updated per atom; the remaining
+    // slots keep the initial values set above.
+    for (int j = 0; j < nlocal; ++j)
+      if (h_mask(j) & groupbit)
+      {
+        int nSteps = h_diagnosticCounterPerODEnSteps(j);
+        double diff_nSteps = double( nSteps ) - avg_per_atom[StepSum];
+        my_sum_sq[StepSum+numCounters] += diff_nSteps*diff_nSteps;
+        my_max[StepSum] = std::max( my_max[StepSum], (double)nSteps );
+        my_min[StepSum] = std::min( my_min[StepSum], (double)nSteps );
+
+        int nFuncs = h_diagnosticCounterPerODEnFuncs(j);
+        double diff_nFuncs = double( nFuncs ) - avg_per_atom[FuncSum];
+        my_sum_sq[FuncSum+numCounters] += diff_nFuncs*diff_nFuncs;
+
+        my_max[FuncSum] = std::max( my_max[FuncSum], (double)nFuncs );
+        my_min[FuncSum] = std::min( my_min[FuncSum], (double)nFuncs );
+      }
+
+    // The per-ODE counters are re-created by solve_reactions on the next call.
+    memory->destroy_kokkos( k_diagnosticCounterPerODEnSteps, diagnosticCounterPerODEnSteps );
+    memory->destroy_kokkos( k_diagnosticCounterPerODEnFuncs, diagnosticCounterPerODEnFuncs );
+
+    MPI_Reduce (my_sum_sq, sum_sq, 2*numCounters, MPI_DOUBLE, MPI_SUM, 0, world);
+
+    MPI_Reduce (my_max, max_per_ODE, numCounters, MPI_DOUBLE, MPI_MAX, 0, world);
+    MPI_Reduce (my_min, min_per_ODE, numCounters, MPI_DOUBLE, MPI_MIN, 0, world);
+  }
+  else
+    // Without per-ODE data only the per-proc half of the array is reduced.
+    MPI_Reduce (my_sum_sq, sum_sq, numCounters, MPI_DOUBLE, MPI_SUM, 0, world);
+
+  TimerType timer_stop = getTimeStamp();
+  double time_local = getElapsedTime( timer_start, timer_stop );
+
+  // The MPI_Reduce results above are only defined on the root rank.
+  if (comm->me == 0){
+    char smesg[128];
+
+    // Write one line to both the screen and the log file, when they are open.
+#define print_mesg(smesg) {\
+    if (screen)  fprintf(screen,"%s\n", smesg); \
+    if (logfile) fprintf(logfile,"%s\n", smesg); }
+
+    sprintf(smesg, "FixRX::ODE Diagnostics:  # of iters  |# of rhs evals| run-time (sec) | # atoms");
+    print_mesg(smesg);
+
+    sprintf(smesg, "         AVG per ODE  : %-12.5g | %-12.5g | %-12.5g", avg_per_atom[0], avg_per_atom[1], avg_per_atom[2]);
+    print_mesg(smesg);
+
+    // only valid for single time-step!
+    if (diagnosticFrequency == 1){
+      double rms_per_ODE[numCounters];
+      for (int i = 0; i < numCounters; ++i)
+        rms_per_ODE[i] = sqrt( sum_sq[i+numCounters] / nODEs );
+
+      sprintf(smesg, "         RMS per ODE  : %-12.5g | %-12.5g ", rms_per_ODE[0], rms_per_ODE[1]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MAX per ODE  : %-12.5g | %-12.5g ", max_per_ODE[0], max_per_ODE[1]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MIN per ODE  : %-12.5g | %-12.5g ", min_per_ODE[0], min_per_ODE[1]);
+      print_mesg(smesg);
+    }
+
+    sprintf(smesg, "         AVG per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", avg_per_proc[StepSum], avg_per_proc[FuncSum], avg_per_proc[TimeSum], avg_per_proc[AtomSum]);
+    print_mesg(smesg);
+
+    // Spread across ranks is only reported when there is more than one rank.
+    if (comm->nprocs > 1){
+      double rms_per_proc[numCounters];
+      for (int i = 0; i < numCounters; ++i)
+        rms_per_proc[i] = sqrt( sum_sq[i] / comm->nprocs );
+
+      sprintf(smesg, "         RMS per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2], rms_per_proc[AtomSum]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MAX per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2], max_per_proc[AtomSum]);
+      print_mesg(smesg);
+
+      sprintf(smesg, "         MIN per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2], min_per_proc[AtomSum]);
+      print_mesg(smesg);
+    }
+
+    sprintf(smesg, "  AVG'd over %d time-steps", nTimes);
+    print_mesg(smesg);
+    sprintf(smesg, "  AVG'ing took %g sec", time_local);
+    print_mesg(smesg);
+
+#undef print_mesg
+
+  }
+
+  // Reset the counters (including the call count in the last slot).
+  for (int i = 0; i < numDiagnosticCounters; ++i)
+    diagnosticCounter[i] = 0;
+
+  return;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kokkos parallel_for body: clears the local-temperature accumulator and its
+// weight sum for entry i before the pair loop adds contributions.
+template <typename DeviceType>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_zeroTemperatureViews, const int& i) const
+{
+  // Independent stores; both accumulators start from zero.
+  d_dpdThetaLocal(i) = 0.0;
+  d_sumWeights(i) = 0.0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kokkos parallel_for body: pair-loop contribution to the local temperature.
+//
+// For list entry ii, walks the neighbors of atom i = d_ilist(ii) and, for each
+// neighbor j within the type-pair cutoff, accumulates the weight wij and
+// wij / theta_j for atom i. For half neighbor lists the mirrored contribution
+// is also added to atom j (always when NEWTON_PAIR, otherwise only for
+// j < nlocal).
+//
+// WT_FLAG      weight function selector; only LUCY yields a non-zero weight.
+// NEWTON_PAIR  whether contributions are also stored on ghost neighbors.
+// NEIGHFLAG    neighbor list style; also selects the memory traits of the
+//              accumulation views through AtomicF<NEIGHFLAG>.
+template <typename DeviceType>
+  template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_firstPairOperator<WT_FLAG,NEWTON_PAIR,NEIGHFLAG>, const int& ii) const
+{
+  // Views of the accumulators whose memory traits depend on the list style.
+  typedef Kokkos::View< E_FLOAT*, typename DAT::t_efloat_1d::array_layout, DeviceType, Kokkos::MemoryTraits< AtomicF< NEIGHFLAG >::value> > AtomicViewType;
+
+  AtomicViewType a_dpdThetaLocal = d_dpdThetaLocal;
+  AtomicViewType a_sumWeights    = d_sumWeights;
+
+  // With a half list each pair is visited once, so j must be updated as well.
+  const bool halfList = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD);
+
+  const int i = d_ilist(ii);
+
+  const double xi = d_x(i,0);
+  const double yi = d_x(i,1);
+  const double zi = d_x(i,2);
+  const int itype = d_type(i);
+
+  const int jnum = d_numneigh(i);
+
+  // Contributions to atom i are gathered locally and stored once at the end.
+  double thetaAccum  = 0.0;
+  double weightAccum = 0.0;
+
+  for (int jj = 0; jj < jnum; jj++)
+  {
+    const int j = (d_neighbors(i,jj) & NEIGHMASK);
+    const int jtype = d_type(j);
+
+    const double dx = xi - d_x(j,0);
+    const double dy = yi - d_x(j,1);
+    const double dz = zi - d_x(j,2);
+    const double rsq = dx*dx + dy*dy + dz*dz;
+
+    const double cutsq_ij = d_cutsq(itype,jtype);
+
+    // Skip pairs outside the cutoff.
+    if (!(rsq < cutsq_ij))
+      continue;
+
+    const double rcut = sqrt( cutsq_ij );
+    const double rij = sqrt(rsq);
+    const double ratio = rij/rcut;
+
+    const bool updateJ = halfList && (NEWTON_PAIR || j < nlocal);
+
+    double wij = 0.0;
+
+    // Lucy's Weight Function
+    if (WT_FLAG == LUCY)
+    {
+      const double oneMinusRatio = 1.0-ratio;
+      wij = (1.0+3.0*ratio) * oneMinusRatio*oneMinusRatio*oneMinusRatio;
+      thetaAccum += wij / d_dpdTheta(j);
+      if (updateJ)
+        a_dpdThetaLocal(j) += wij / d_dpdTheta(i);
+    }
+
+    weightAccum += wij;
+    if (updateJ)
+      a_sumWeights(j) += wij;
+  }
+
+  // Add (not assign): other iterations may already have contributed to i.
+  a_dpdThetaLocal(i) += thetaAccum;
+  a_sumWeights(i) += weightAccum;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Kokkos parallel_for body: finishes the local temperature of atom i.
+//
+// Adds the atom's self-interaction (weight 1.0 for LUCY, 0.0 otherwise),
+// normalizes the accumulated value by the weight sum and, for
+// LOCAL_TEMP_FLAG == HARMONIC, replaces it by its reciprocal.
+template <typename DeviceType>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG>
+  KOKKOS_INLINE_FUNCTION
+void FixRxKokkos<DeviceType>::operator()(Tag_FixRxKokkos_2ndPairOperator<WT_FLAG,LOCAL_TEMP_FLAG>, const int& i) const
+{
+  // Self weight of the Lucy function; zero for any other weight flag.
+  const double selfWeight = (WT_FLAG == LUCY) ? 1.0 : 0.0;
+
+  if (WT_FLAG == LUCY)
+    d_dpdThetaLocal(i) += selfWeight / d_dpdTheta(i);
+
+  d_sumWeights(i) += selfWeight;
+
+  // Normalized local temperature
+  d_dpdThetaLocal(i) = d_dpdThetaLocal(i) / d_sumWeights(i);
+
+  // The harmonic mean is the reciprocal of the weighted mean of 1/theta.
+  if (LOCAL_TEMP_FLAG == HARMONIC)
+    d_dpdThetaLocal(i) = 1.0 / d_dpdThetaLocal(i);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Compute the locally averaged temperature d_dpdThetaLocal for all local atoms.
+//
+// Steps performed:
+//   1. Sync positions, types and dpdTheta to the execution space.
+//   2. Copy pairDPDE->cutsq into a symmetric DualView and sync it to the device.
+//   3. Zero d_sumWeights / d_dpdThetaLocal (local atoms, plus ghosts when
+//      NEWTON_PAIR) and accumulate pair contributions with
+//      Tag_FixRxKokkos_firstPairOperator.
+//   4. When NEWTON_PAIR, reverse-communicate the ghost contributions (packed
+//      and unpacked on the host views).
+//   5. Add the self term and normalize with Tag_FixRxKokkos_2ndPairOperator.
+//
+// WT_FLAG          weight function selector passed to the pair operators.
+// LOCAL_TEMP_FLAG  averaging mode passed to the second operator.
+// NEWTON_PAIR      whether ghost atoms accumulate contributions.
+// NEIGHFLAG        neighbor list style passed to the first operator.
+template <typename DeviceType>
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+void FixRxKokkos<DeviceType>::computeLocalTemperature()
+{
+  // Cache the atom views as members so the tagged operators can reach them.
+  d_x        = atomKK->k_x.view<DeviceType>();
+  d_type     = atomKK->k_type.view<DeviceType>();
+  d_dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+
+  atomKK->sync(execution_space, X_MASK | TYPE_MASK | DPDTHETA_MASK );
+
+  nlocal = atom->nlocal;
+  const int nghost = atom->nghost;
+
+  // Keep a private device copy of the pair cutoffs, refreshed on every call
+  // from pairDPDE->cutsq. Storage is grow-only.
+  {
+    const int ntypes = atom->ntypes;
+
+    if (ntypes+1 > k_cutsq.dimension_0()) {
+      memory->destroy_kokkos (k_cutsq);
+      memory->create_kokkos (k_cutsq, ntypes+1, ntypes+1, "FixRxKokkos::k_cutsq");
+      d_cutsq = k_cutsq.template view<DeviceType>();
+    }
+
+    // Only the upper triangle of the source is read; the copy is symmetrized.
+    for (int i = 1; i <= ntypes; ++i)
+      for (int j = i; j <= ntypes; ++j)
+      {
+        k_cutsq.h_view(i,j) = pairDPDE->cutsq[i][j];
+        k_cutsq.h_view(j,i) = k_cutsq.h_view(i,j);
+      }
+
+    k_cutsq.template modify<LMPHostType>();
+    k_cutsq.template sync<DeviceType>();
+  }
+
+  // Ghost entries are only needed when contributions are stored on ghosts.
+  int sumWeightsCt = nlocal + (NEWTON_PAIR ? nghost : 0);
+
+  // Grow-only storage for the weight sums.
+  if (sumWeightsCt > k_sumWeights.template view<DeviceType>().dimension_0()) {
+    memory->destroy_kokkos(k_sumWeights, sumWeights);
+    memory->create_kokkos (k_sumWeights, sumWeightsCt, "FixRxKokkos::sumWeights");
+    d_sumWeights = k_sumWeights.template view<DeviceType>();
+    h_sumWeights = k_sumWeights.h_view;
+  }
+
+  // Initialize the accumulators to zero.
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_zeroTemperatureViews>(0, sumWeightsCt), *this);
+
+  // The neighbor list must be a Kokkos list before its device views are used.
+  if (!list->kokkos)
+     error->one(FLERR,"list is not a Kokkos list\n");
+
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+
+  d_neighbors = k_list->d_neighbors;
+  d_ilist     = k_list->d_ilist;
+  d_numneigh  = k_list->d_numneigh;
+
+  const int inum = list->inum;
+
+  // Loop over the neighbors of my atoms.
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_firstPairOperator<WT_FLAG, NEWTON_PAIR, NEIGHFLAG> >(0, inum), *this);
+
+  // Signal that dpdThetaLocal and sumWeights have been modified.
+  k_dpdThetaLocal.template modify<DeviceType>();
+  k_sumWeights.   template modify<DeviceType>();
+
+  // Communicate the sum dpdTheta and the weights on the host.
+  if (NEWTON_PAIR) comm->reverse_comm_fix(this);
+
+  // Update the device view in case they got changed.
+  k_dpdThetaLocal.template sync<DeviceType>();
+  k_sumWeights.   template sync<DeviceType>();
+
+  // Self-interaction and normalization for the local temperature.
+  Kokkos::parallel_for (Kokkos::RangePolicy<DeviceType, Tag_FixRxKokkos_2ndPairOperator<WT_FLAG, LOCAL_TEMP_FLAG> >(0, nlocal), *this);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Pack the species data of the listed atoms for forward communication.
+//
+// For each atom list[0..n) and each species, two values are written to buf:
+// row `ispecies` and row `ispecies + nspecies` of the host dvector view.
+// pbc_flag and pbc are not referenced.
+//
+// Returns the number of values written to buf.
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  HAT::t_float_2d h_dvector = atomKK->k_dvector.h_view;
+
+  // Write cursor into the outgoing buffer.
+  double *out = buf;
+
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+    for (int s = 0; s < nspecies; s++) {
+      *out++ = h_dvector(s,iatom);
+      *out++ = h_dvector(s+nspecies,iatom);
+    }
+  }
+
+  return static_cast<int>(out - buf);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Unpack forward-communicated species data into the host dvector view.
+//
+// Fills atoms first..first+n-1; for each species two consecutive buffer values
+// go to row `ispecies` and row `ispecies + nspecies`, mirroring the layout
+// produced by pack_forward_comm().
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  HAT::t_float_2d h_dvector = atomKK->k_dvector.h_view;
+
+  // Read cursor into the incoming buffer.
+  const double *in = buf;
+
+  for (int k = 0; k < n; k++) {
+    const int iatom = first + k;
+    for (int s = 0; s < nspecies; s++) {
+      h_dvector(s,iatom) = *in++;
+      h_dvector(s+nspecies,iatom) = *in++;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Pack the local-temperature accumulators of atoms first..first+n-1 for
+// reverse communication: (dpdThetaLocal, sumWeights) pairs, read from the
+// host views after syncing them.
+//
+// Returns the number of values written to buf.
+template <typename DeviceType>
+int FixRxKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // Make sure the host copies reflect the latest device-side accumulation.
+  k_dpdThetaLocal.template sync<LMPHostType>();
+  k_sumWeights.   template sync<LMPHostType>();
+
+  int m = 0;
+  for (int k = 0; k < n; ++k)
+  {
+    const int iatom = first + k;
+    buf[m]   = h_dpdThetaLocal(iatom);
+    buf[m+1] = h_sumWeights(iatom);
+    m += 2;
+  }
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Add reverse-communicated (dpdThetaLocal, sumWeights) pairs onto the host
+// accumulators of the atoms in list[0..n), then mark the host views modified
+// so a later sync carries the sums back to the device.
+template <typename DeviceType>
+void FixRxKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // Read cursor into the incoming buffer.
+  const double *in = buf;
+
+  for (int k = 0; k < n; k++) {
+    const int iatom = list[k];
+
+    h_dpdThetaLocal(iatom) += *in++;
+    h_sumWeights(iatom) += *in++;
+  }
+
+  // Signal that the host view has been modified.
+  k_dpdThetaLocal.template modify<LMPHostType>();
+  k_sumWeights.   template modify<LMPHostType>();
+}
+
+// Explicit instantiations of FixRxKokkos for the execution spaces in use.
+// NOTE(review): the host instantiation is only emitted under KOKKOS_HAVE_CUDA,
+// presumably because LMPHostType and LMPDeviceType are the same type in
+// non-CUDA builds and a second instantiation would be a duplicate -- confirm.
+namespace LAMMPS_NS {
+template class FixRxKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class FixRxKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/fix_rx_kokkos.h b/src/KOKKOS/fix_rx_kokkos.h
new file mode 100644
index 0000000000..92b715f34d
--- /dev/null
+++ b/src/KOKKOS/fix_rx_kokkos.h
@@ -0,0 +1,282 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// Style names registered here: rx/kk and rx/kk/device use FixRxKokkos<LMPDeviceType>, rx/kk/host uses FixRxKokkos<LMPHostType>.
+FixStyle(rx/kk,FixRxKokkos<LMPDeviceType>)
+FixStyle(rx/kk/device,FixRxKokkos<LMPDeviceType>)
+FixStyle(rx/kk/host,FixRxKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_RX_KOKKOS_H
+#define LMP_FIX_RX_KOKKOS_H
+
+#include "fix_rx.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "kokkos_type.h"
+#include "neigh_list.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+
+struct Tag_FixRxKokkos_zeroTemperatureViews {};  // empty tag types: each one names a FixRxKokkos::operator() overload
+struct Tag_FixRxKokkos_zeroCounterViews {};
+// Tag of the first pair operator; the same template flags appear on the matching operator().
+template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+struct Tag_FixRxKokkos_firstPairOperator {};
+// Tag of the second pair operator.
+template <int WT_FLAG, int LOCAL_TEMP_FLAG>
+struct Tag_FixRxKokkos_2ndPairOperator {};
+// Tag of the solveSystems operator, the only overload that also takes a CounterType&.
+template <bool ZERO_RATES>
+struct Tag_FixRxKokkos_solveSystems {};
+
+struct s_CounterType
+{
+  int nSteps, nIters, nFuncs, nFails;
+  // Every tally starts at zero.
+  KOKKOS_INLINE_FUNCTION
+  s_CounterType() { nSteps = nIters = nFuncs = nFails = 0; }
+
+  // Field-by-field accumulation of another counter set into this one.
+  KOKKOS_INLINE_FUNCTION
+  s_CounterType& operator+=(const s_CounterType &other) {
+    nSteps += other.nSteps;
+    nIters += other.nIters;
+    nFuncs += other.nFuncs;
+    nFails += other.nFails;
+    return *this;
+  }
+
+  // Same accumulation for volatile-qualified operands.
+  KOKKOS_INLINE_FUNCTION
+  volatile s_CounterType& operator+=(const volatile s_CounterType &other) volatile {
+    nSteps += other.nSteps;
+    nIters += other.nIters;
+    nFuncs += other.nFuncs;
+    nFails += other.nFails;
+    return *this;
+  }
+};
+typedef struct s_CounterType CounterType;
+
+template <typename DeviceType>
+class FixRxKokkos : public FixRX {  // FixRX subclass templated on the Kokkos device type
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+
+  FixRxKokkos(class LAMMPS *, int, char **);
+  virtual ~FixRxKokkos();
+  virtual void init();
+  void init_list(int, class NeighList *);
+  void post_constructor();
+  virtual void setup_pre_force(int);
+  virtual void pre_force(int);
+
+  // Define a value_type here for the reduction operator on CounterType.
+  typedef CounterType value_type;
+  // Functor entry points: the leading tag argument picks the overload.
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_zeroCounterViews, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_zeroTemperatureViews, const int&) const;
+
+  template <int WT_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_firstPairOperator<WT_FLAG,NEWTON_PAIR,NEIGHFLAG>, const int&) const;
+
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_2ndPairOperator<WT_FLAG,LOCAL_TEMP_FLAG>, const int&) const;
+
+  template <bool ZERO_RATES>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(Tag_FixRxKokkos_solveSystems<ZERO_RATES>, const int&, CounterType&) const;
+
+  // The 'protected:' specifier is commented out, so everything below stays public.
+  PairDPDfdtEnergyKokkos<DeviceType>* pairDPDEKK;
+  double VDPD;
+
+  double boltz;
+  double t_stop;
+  // Pointer wrapper with a compile-time stride: element idx is m_data[Stride*idx].
+  template <typename T, int stride = 1>
+  struct StridedArrayType
+  {
+    typedef T value_type;
+    enum { Stride = stride };
+
+    value_type *m_data;
+
+    KOKKOS_INLINE_FUNCTION
+    StridedArrayType() : m_data(NULL) {}
+    KOKKOS_INLINE_FUNCTION
+    StridedArrayType(value_type *ptr) : m_data(ptr) {}
+
+    KOKKOS_INLINE_FUNCTION       value_type& operator()(const int idx)       { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION const value_type& operator()(const int idx) const { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION       value_type& operator[](const int idx)       { return m_data[Stride*idx]; }
+    KOKKOS_INLINE_FUNCTION const value_type& operator[](const int idx) const { return m_data[Stride*idx]; }
+  };
+  // NOTE(review): 'stride' is not used below; both members are StridedArrayType<double,1>.
+  template <int stride = 1>
+  struct UserRHSDataKokkos
+  {
+    StridedArrayType<double,1> kFor;
+    StridedArrayType<double,1> rxnRateLaw;
+  };
+
+  void solve_reactions(const int vflag, const bool isPreForce);
+  // rhs* take raw double arrays and a void* user pointer; k_rhs* take templated vector and user-data types.
+  int rhs       (double, const double *, double *, void *) const;
+  int rhs_dense (double, const double *, double *, void *) const;
+  int rhs_sparse(double, const double *, double *, void *) const;
+
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  int k_rhs       (double, const VectorType&, VectorType&, UserDataType& ) const;
+
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  int k_rhs_dense (double, const VectorType&, VectorType&, UserDataType& ) const;
+
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  int k_rhs_sparse(double, const VectorType&, VectorType&, UserDataType& ) const;
+
+  //! Classic Runge-Kutta 4th-order stepper.
+  void rk4(const double t_stop, double *y, double *rwork, void *v_params) const;
+
+  //! Runge-Kutta-Fehlberg ODE Solver.
+  void rkf45(const int neq, const double t_stop, double *y, double *rwork, void *v_params, CounterType& counter) const;
+
+  //! Runge-Kutta-Fehlberg ODE stepper function.
+  void rkf45_step (const int neq, const double h, double y[], double y_out[],
+                   double rwk[], void *) const;
+
+  //! Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
+  int rkf45_h0 (const int neq, const double t, const double t_stop,
+                     const double hmin, const double hmax,
+                     double& h0, double y[], double rwk[], void *v_params) const;
+
+  //! Classic Runge-Kutta 4th-order stepper.
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  void k_rk4(const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData) const;
+
+  //! Runge-Kutta-Fehlberg ODE Solver.
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  void k_rkf45(const int neq, const double t_stop, VectorType& y, VectorType& rwork, UserDataType& userData, CounterType& counter) const;
+
+  //! Runge-Kutta-Fehlberg ODE stepper function.
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  void k_rkf45_step (const int neq, const double h, VectorType& y, VectorType& y_out,
+                     VectorType& rwk, UserDataType& userData) const;
+
+  //! Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
+  template <typename VectorType, typename UserDataType>
+    KOKKOS_INLINE_FUNCTION
+  int k_rkf45_h0 (const int neq, const double t, const double t_stop,
+                  const double hmin, const double hmax,
+                  double& h0, VectorType& y, VectorType& rwk, UserDataType& userData) const;
+
+  //! ODE Solver diagnostics.
+  void odeDiagnostics(void);
+
+  //! Special counters per-ode.
+  int *diagnosticCounterPerODEnSteps;
+  int *diagnosticCounterPerODEnFuncs;
+  DAT::tdual_int_1d k_diagnosticCounterPerODEnSteps;
+  DAT::tdual_int_1d k_diagnosticCounterPerODEnFuncs;
+  // Device-typed (AT) and host-typed (HAT) views named after the two dual views above
+  // (presumably their two sides -- the assignments are not part of this header).
+  typename AT::t_int_1d d_diagnosticCounterPerODEnSteps;
+  typename AT::t_int_1d d_diagnosticCounterPerODEnFuncs;
+  HAT::t_int_1d h_diagnosticCounterPerODEnSteps;
+  HAT::t_int_1d h_diagnosticCounterPerODEnFuncs;
+  // Kinetics arrays as Kokkos views, typed through ArrayTypes<KokkosDeviceType>.
+  template <typename KokkosDeviceType>
+  struct KineticsType
+  {
+    // Arrhenius rate coefficients.
+    typename ArrayTypes<KokkosDeviceType>::t_float_1d Arr, nArr, Ea;
+
+    // Dense versions.
+    typename ArrayTypes<KokkosDeviceType>::t_float_2d stoich, stoichReactants, stoichProducts;
+
+    // Sparse versions.
+    typename ArrayTypes<KokkosDeviceType>::t_int_2d   nuk, inu;
+    typename ArrayTypes<KokkosDeviceType>::t_float_2d nu;
+    typename ArrayTypes<KokkosDeviceType>::t_int_1d   isIntegral;
+  };
+
+  //! Kokkos versions of the kinetics data.
+  KineticsType<LMPHostType> h_kineticsData;
+  KineticsType<DeviceType>  d_kineticsData;
+
+  bool update_kinetics_data;
+
+  void create_kinetics_data(void);
+
+  // Need a dual-view and device-view for dpdThetaLocal and sumWeights since they're used in several callbacks.
+  DAT::tdual_efloat_1d k_dpdThetaLocal, k_sumWeights;
+  // d_*: device-typed (AT) views, h_*: host-typed (HAT) views; the h_* ones are what the reverse-comm pack/unpack code indexes.
+  typename AT::t_efloat_1d d_dpdThetaLocal, d_sumWeights;
+  HAT::t_efloat_1d h_dpdThetaLocal, h_sumWeights;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread d_x       ;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread  d_type    ;
+  typename ArrayTypes<DeviceType>::t_efloat_1d          d_dpdTheta;
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d     d_cutsq;
+  // k_cutsq / d_cutsq: a 2-d ffloat dual view and a 2-d ffloat device-typed view.
+
+  typename ArrayTypes<DeviceType>::t_neighbors_2d d_neighbors;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_ilist    ;
+  typename ArrayTypes<DeviceType>::t_int_1d       d_numneigh ;
+
+  typename ArrayTypes<DeviceType>::t_float_2d  d_dvector;
+  typename ArrayTypes<DeviceType>::t_int_1d    d_mask   ;
+
+  typename ArrayTypes<DeviceType>::t_double_1d d_scratchSpace;
+  size_t scratchSpaceSize;
+
+  // Error flag for any failures.
+  DAT::tdual_int_scalar k_error_flag;
+
+  template <int WT_FLAG, int LOCAL_TEMP_FLAG, bool NEWTON_PAIR, int NEIGHFLAG>
+  void computeLocalTemperature();
+
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  int pack_forward_comm(int , int *, double *, int, int *);
+  void unpack_forward_comm(int , int , double *);
+
+  // 'private:' is commented out as well; per the original note these replicate a few members from FixRX.
+  int my_restartFlag;
+  int nlocal;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp
new file mode 100644
index 0000000000..98bbb02714
--- /dev/null
+++ b/src/KOKKOS/fix_shardlow_kokkos.cpp
@@ -0,0 +1,856 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos (U.S. Army Research Laboratory)
+   and Timothy I. Mattox (Engility Corporation)
+
+   Martin Lisal (Institute of Chemical Process Fundamentals
+   of the Czech Academy of Sciences and J. E. Purkinje University)
+
+   John Brennan, Joshua Moore and William Mattson (Army Research Lab)
+
+   Please cite the related publications:
+   J. P. Larentzos, J. K. Brennan, J. D. Moore, M. Lisal, W. D. Mattson,
+   "Parallel implementation of isothermal and isoenergetic Dissipative
+   Particle Dynamics using Shardlow-like splitting algorithms",
+   Computer Physics Communications, 2014, 185, pp 1987--1998.
+
+   M. Lisal, J. K. Brennan, J. Bonet Avalos, "Dissipative particle dynamics
+   at isothermal, isobaric, isoenergetic, and isoenthalpic conditions using
+   Shardlow-like splitting algorithms", Journal of Chemical Physics, 2011,
+   135, 204105.
+------------------------------------------------------------------------- */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fix_shardlow_kokkos.h"
+#include "atom.h"
+#include "atom_masks.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "update.h"
+#include "respa.h"
+#include "error.h"
+#include <math.h>
+#include "atom_vec.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list_kokkos.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "domain.h"
+#include "modify.h"
+// #include "pair_dpd_fdt.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "pair.h"
+#include "npair_ssa_kokkos.h"
+#include "citeme.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+#define EPSILON 1.0e-10  // init() only inverts a pair cutoff that is larger than this
+#define EPSILON_SQUARED ((EPSILON) * (EPSILON))  // smallest rsq the SSA pair loops still process
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixShardlowKokkos<DeviceType>::FixShardlowKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixShardlow(lmp, narg, arg), k_pairDPDE(NULL), ghostmax(0), nlocal(0) , nghost(0)
+{
+  // Kokkos bookkeeping: mark the fix as Kokkos-capable and record the
+  // execution space that corresponds to DeviceType.
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  if (narg != 3) error->all(FLERR,"Illegal fix shardlow command");
+
+  // Look up the pair style by name; the dynamic_cast leaves k_pairDPDE
+  // NULL unless the match is the Kokkos energy pair style instantiated
+  // for this DeviceType.
+  k_pairDPDE = dynamic_cast<PairDPDfdtEnergyKokkos<DeviceType> *>(
+    force->pair_match("dpd/fdt/energy",0));
+
+  // Communication sizes and RNG bookkeeping.
+  comm_forward = 3;
+  comm_reverse = 5;
+  maxRNG = 0;
+#ifdef DPD_USE_RAN_MARS
+  pp_random = NULL;
+#endif
+
+  if (!k_pairDPDE)
+    error->all(FLERR,"Must use pair_style dpd/fdt/energy/kk with fix shardlow/kk");
+
+#ifdef DEBUG_SSA_PAIR_CT
+  // Debug-only pair counters (2x3) and a 32-bin histogram, each with a
+  // host-side handle: the device view itself under UVM, a mirror otherwise.
+  d_counters = typename AT::t_int_2d("FixShardlowKokkos::d_counters", 2, 3);
+  d_hist = typename AT::t_int_1d("FixShardlowKokkos::d_hist", 32);
+#ifdef KOKKOS_USE_CUDA_UVM
+  h_counters = d_counters;
+  h_hist = d_hist;
+#else
+  h_counters = Kokkos::create_mirror_view(d_counters);
+  h_hist = Kokkos::create_mirror_view(d_hist);
+#endif
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixShardlowKokkos<DeviceType>::~FixShardlowKokkos()
+{
+  ghostmax = 0;
+#ifdef DPD_USE_RAN_MARS
+  if (pp_random != NULL) {  // slot 0 is never deleted here: initial_integrate points it at k_pairDPDE->random
+    for (int slot = maxRNG - 1; slot > 0; --slot) delete pp_random[slot];
+    delete[] pp_random;
+    pp_random = NULL;
+  }
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::setmask()
+{
+  // The mask is the bitwise OR of the INITIAL_INTEGRATE and
+  // PRE_NEIGHBOR flags.
+  return INITIAL_INTEGRATE | PRE_NEIGHBOR;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::init()
+{
+  FixShardlow::init();
+
+  // Tag the neighbor request at index nrequest-1 with the Kokkos side it
+  // is for: device whenever DeviceType is the device type, host only when
+  // DeviceType is the host type and not also the device type.
+  const bool onDevice = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  const bool onHost = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value;
+  auto request = neighbor->requests[neighbor->nrequest - 1];
+  request->kokkos_host = onHost && !onDevice;
+  request->kokkos_device = onDevice;
+
+  // Allocate the per-type-pair parameter table and take the device view of
+  // the pair style's squared cutoffs.
+  const int ntypes = atom->ntypes;
+  k_params = Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType>
+    ("FixShardlowKokkos::params",ntypes+1,ntypes+1);
+  params = k_params.template view<DeviceType>();
+  k_pairDPDE->k_cutsq.template sync<DeviceType>();
+  d_cutsq = k_pairDPDE->k_cutsq.template view<DeviceType>();
+
+  // Fill the host side for every type pair with i <= j and mirror it to (j,i).
+  const double boltz2 = 2.0*force->boltz;
+  for (int i = 1; i <= ntypes; i++) {
+    for (int j = i; j <= ntypes; j++) {
+      params_ssa &entry = k_params.h_view(i,j);
+      const F_FLOAT cutone = k_pairDPDE->cut[i][j];
+      // a cutoff that is not above EPSILON gets FLT_MAX instead of 1/cut
+      entry.cutinv = (cutone > EPSILON) ? 1.0/cutone : FLT_MAX;
+      entry.halfsigma = 0.5*k_pairDPDE->sigma[i][j];
+      entry.kappa = k_pairDPDE->kappa[i][j];
+      entry.alpha = sqrt(boltz2*k_pairDPDE->kappa[i][j]);
+      k_params.h_view(j,i) = entry;
+
+      // type indices up to MAX_TYPES_STACKPARAMS also get plain-array copies
+      if (i <= MAX_TYPES_STACKPARAMS && j <= MAX_TYPES_STACKPARAMS) {
+        m_params[i][j] = m_params[j][i] = entry;
+        m_cutsq[j][i] = m_cutsq[i][j] = k_pairDPDE->k_cutsq.h_view(i,j);
+      }
+    }
+  }
+
+  k_params.template modify<LMPHostType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::init_list(int id, NeighList *ptr)
+{
+  k_list = static_cast<NeighListKokkos<DeviceType>*>(ptr);  // same list, seen through its Kokkos type
+  FixShardlow::init_list(id, ptr);                          // forward to the base implementation
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::pre_neighbor()
+{
+  // Checks the sub-domain size against the neighbor cutoff, records the
+  // current local/ghost atom counts, grows the ghost velocity buffer when
+  // needed and refreshes the cached per-atom views.
+  // NOTE: this logic is specific to orthogonal boxes, not triclinic
+
+  // Enforce the constraint that ghosts must be contained in the nearest sub-domains
+  double bbx = domain->subhi[0] - domain->sublo[0];
+  double bby = domain->subhi[1] - domain->sublo[1];
+  double bbz = domain->subhi[2] - domain->sublo[2];
+
+  double rcut = 2.0*neighbor->cutneighmax;
+
+  if (domain->triclinic)
+    error->all(FLERR,"Fix shardlow does not yet support triclinic geometries");
+
+  if(rcut >= bbx || rcut >= bby || rcut>= bbz )
+  {
+    // Stack buffer + snprintf: nothing to allocate, check or free; a %e field is at most 14 characters wide.
+    static const char fmt[] = "Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin: rcut= %e bbx= %e bby= %e bbz= %e\n";
+    char msg[sizeof(fmt) + 4*15];
+    snprintf(msg, sizeof(msg), fmt, rcut, bbx, bby, bbz);
+    error->one(FLERR, msg);
+  }
+
+  nlocal = atomKK->nlocal;
+  nghost = atomKK->nghost;
+
+  // Allocate memory for h_v_t0 to hold the initial velocities for the ghosts
+  if (nghost > ghostmax) {
+    ghostmax = nghost;
+    k_v_t0 = DAT::tdual_v_array("FixShardlowKokkos:v_t0", ghostmax);
+    h_v_t0 = k_v_t0.h_view;
+  }
+
+  // Setup views of relevant data
+  x = atomKK->k_x.template view<DeviceType>();
+  v = atomKK->k_v.template view<DeviceType>();
+  h_v = atomKK->k_v.h_view;
+  uCond = atomKK->k_uCond.template view<DeviceType>();
+  h_uCond = atomKK->k_uCond.h_view;
+  uMech = atomKK->k_uMech.template view<DeviceType>();
+  h_uMech = atomKK->k_uMech.h_view;
+  type = atomKK->k_type.view<DeviceType>();
+  // masses is the rmass view when rmass is set (massPerI = true), the mass view otherwise
+  if (atomKK->rmass) {
+    massPerI = true;
+    masses = atomKK->k_rmass.view<DeviceType>();
+  } else {
+    massPerI = false;
+    masses = atomKK->k_mass.view<DeviceType>();
+  }
+  dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+}
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::setup_pre_neighbor()
+{
+  this->pre_neighbor();  // the setup step just runs the regular pre_neighbor work
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+#error "FixShardlowKokkos::ssa_update_dpd() is not functional yet - TIM 20170830"
+/* ----------------------------------------------------------------------
+   Perform the stochastic integration and Shardlow update for constant temperature
+   Allow for both per-type and per-atom mass
+   Processes `count` neighbor-list rows starting at row start_ii, drawing from RNG slot `id`
+   NOTE: only implemented for orthogonal boxes, not triclinic
+------------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS>
+void FixShardlowKokkos<DeviceType>::ssa_update_dpd(
+  int start_ii, int count, int id
+)
+{
+#ifdef DPD_USE_RAN_MARS
+  class RanMars *pRNG = pp_random[id];
+#else
+  rand_type rand_gen = rand_pool.get_state(id);
+#endif
+
+  int ct = count;
+  int ii = start_ii;
+  while (ct-- > 0) {
+    const int i = d_ilist(ii);
+    const int jlen = d_numneigh(ii);
+
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+
+    // load velocity for i from memory
+    double vxi = v(i, 0);
+    double vyi = v(i, 1);
+    double vzi = v(i, 2);
+
+    const int itype = type(i);
+
+    const double mass_i = masses(massPerI ? i : itype);
+    const double massinv_i = 1.0 / mass_i;
+
+    // Loop over Directional Neighbors only
+    for (int jj = 0; jj < jlen; jj++) {
+      const int j = d_neighbors(ii,jj) & NEIGHMASK;
+      const int jtype = type(j);
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_SSA_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
+      else Kokkos::atomic_increment(&(d_counters(0, 1)));
+      Kokkos::atomic_increment(&(d_counters(0, 2)));
+      int rsqi = rsq / 8;
+      if (rsqi < 0) rsqi = 0;
+      else if (rsqi > 31) rsqi = 31;
+      Kokkos::atomic_increment(&(d_hist(rsqi)));
+#endif
+
+      // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
+      if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
+        && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_SSA_PAIR_CT
+        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
+        else Kokkos::atomic_increment(&(d_counters(1, 1)));
+        Kokkos::atomic_increment(&(d_counters(1, 2)));
+#endif
+        double r = sqrt(rsq);
+        double rinv = 1.0/r;
+        double delx_rinv = delx*rinv;
+        double dely_rinv = dely*rinv;
+        double delz_rinv = delz*rinv;
+
+        double wr = 1.0 - r*(STACKPARAMS?m_params[itype][jtype].cutinv:params(itype,jtype).cutinv);
+        double wdt = wr*wr*dt;
+        // NOTE(review): theta_ij_inv is not defined in this function and its only visible assignment (in initial_integrate) is commented out.
+        double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
+        double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
+
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v *
+#ifdef DPD_USE_RAN_MARS
+            pRNG->gaussian();
+#else
+            rand_gen.normal();
+#endif
+
+        const double mass_j = masses(massPerI ? j : jtype);
+        double massinv_j = 1.0 / mass_j;
+
+        double gammaFactor = halfgamma_ij*wdt*ftm2v;
+        double inv_1p_mu_gammaFactor = 1.0/(1.0 + (massinv_i + massinv_j)*gammaFactor);
+
+        double vxj = v(j, 0);
+        double vyj = v(j, 1);
+        double vzj = v(j, 2);
+
+        // Compute the initial velocity difference between atom i and atom j
+        double delvx = vxi - vxj;
+        double delvy = vyi - vyj;
+        double delvz = vzi - vzj;
+        double dot_rinv = (delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz);
+
+        // Compute momentum change between t and t+dt
+        double factorA = sigmaRand - gammaFactor*dot_rinv;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorA*massinv_i;
+        vyi += dely_rinv*factorA*massinv_i;
+        vzi += delz_rinv*factorA*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorA*massinv_j;
+        vyj -= dely_rinv*factorA*massinv_j;
+        vzj -= delz_rinv*factorA*massinv_j;
+
+        //ii.   Compute the new velocity diff
+        delvx = vxi - vxj;
+        delvy = vyi - vyj;
+        delvz = vzi - vzj;
+        dot_rinv = delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz;
+
+        // Compute the new momentum change between t and t+dt
+        double factorB = (sigmaRand - gammaFactor*dot_rinv)*inv_1p_mu_gammaFactor;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorB*massinv_i;
+        vyi += dely_rinv*factorB*massinv_i;
+        vzi += delz_rinv*factorB*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorB*massinv_j;
+        vyj -= dely_rinv*factorB*massinv_j;
+        vzj -= delz_rinv*factorB*massinv_j;
+
+        // Store updated velocity for j
+        v(j, 0) = vxj;
+        v(j, 1) = vyj;
+        v(j, 2) = vzj;
+      }
+    }
+    // store updated velocity for i
+    v(i, 0) = vxi;
+    v(i, 1) = vyi;
+    v(i, 2) = vzi;
+    ii++;  // advance to the next neighbor-list row; without this every pass reprocessed row start_ii
+  }
+
+#ifndef DPD_USE_RAN_MARS
+  rand_pool.free_state(rand_gen);
+#endif
+}
+#endif
+
+/* ----------------------------------------------------------------------
+   Perform the stochastic integration and Shardlow update for constant energy
+   Allow for both per-type and per-atom mass
+   Processes `count` neighbor-list rows starting at row start_ii, drawing from RNG slot `id`
+   NOTE: only implemented for orthogonal boxes, not triclinic
+------------------------------------------------------------------------- */
+template<class DeviceType>
+template<bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void FixShardlowKokkos<DeviceType>::ssa_update_dpde(
+  int start_ii, int count, int id
+) const
+{
+#ifdef DPD_USE_RAN_MARS
+  class RanMars *pRNG = pp_random[id];
+#else
+  rand_type rand_gen = rand_pool.get_state(id);
+#endif
+  // ct counts the rows still to do; ii is the current neighbor-list row.
+  int ct = count;
+  int ii = start_ii;
+
+  while (ct-- > 0) {
+    const int i = d_ilist(ii);
+    const int jlen = d_numneigh(ii);
+
+    const double xtmp = x(i, 0);
+    const double ytmp = x(i, 1);
+    const double ztmp = x(i, 2);
+
+    // load velocity for i from memory
+    double vxi = v(i, 0);
+    double vyi = v(i, 1);
+    double vzi = v(i, 2);
+    // running values for atom i, written back once after the neighbor loop
+    double uMech_i = uMech(i);
+    double uCond_i = uCond(i);
+    const int itype = type(i);
+
+    const double theta_i_inv = 1.0/dpdTheta(i);
+    const double mass_i = masses(massPerI ? i : itype);
+    const double massinv_i = 1.0 / mass_i;
+    const double mass_i_div_neg4_ftm2v = mass_i*(-0.25)/ftm2v;
+
+    // Loop over Directional Neighbors only
+    for (int jj = 0; jj < jlen; jj++) {
+      const int j = d_neighbors(ii,jj) & NEIGHMASK;
+      const int jtype = type(j);
+
+      const X_FLOAT delx = xtmp - x(j, 0);
+      const X_FLOAT dely = ytmp - x(j, 1);
+      const X_FLOAT delz = ztmp - x(j, 2);
+      const F_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_SSA_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(0, 0)));
+      else Kokkos::atomic_increment(&(d_counters(0, 1)));
+      Kokkos::atomic_increment(&(d_counters(0, 2)));
+      int rsqi = rsq / 8;
+      if (rsqi < 0) rsqi = 0;
+      else if (rsqi > 31) rsqi = 31;
+      Kokkos::atomic_increment(&(d_hist(rsqi)));
+#endif
+
+      // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
+      if ((rsq < (STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype)))
+        && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_SSA_PAIR_CT
+        if ((i < nlocal) && (j < nlocal)) Kokkos::atomic_increment(&(d_counters(1, 0)));
+        else Kokkos::atomic_increment(&(d_counters(1, 1)));
+        Kokkos::atomic_increment(&(d_counters(1, 2)));
+#endif
+        // del*_rinv are the components of the unit vector pointing from j to i
+        double r = sqrt(rsq);
+        double rinv = 1.0/r;
+        double delx_rinv = delx*rinv;
+        double dely_rinv = dely*rinv;
+        double delz_rinv = delz*rinv;
+        // weight wr = 1 - r*cutinv for this type pair
+        double wr = 1.0 - r*(STACKPARAMS?m_params[itype][jtype].cutinv:params(itype,jtype).cutinv);
+        double wdt = wr*wr*dt;
+
+        // Compute the current temperature
+        double theta_j_inv = 1.0/dpdTheta(j);
+        double theta_ij_inv = 0.5*(theta_i_inv + theta_j_inv);
+
+        double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma;
+        double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv;
+        // random term: one Gaussian draw scaled by halfsigma_ij*wr*dtsqrt*ftm2v
+        double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v *
+#ifdef DPD_USE_RAN_MARS
+            pRNG->gaussian();
+#else
+            rand_gen.normal();
+#endif
+
+        const double mass_j = masses(massPerI ? j : jtype);
+        double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v;
+        double massinv_j = 1.0 / mass_j;
+
+        // Compute uCond
+        double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
+        double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha;
+        double del_uCond = alpha_ij*wr*dtsqrt *
+#ifdef DPD_USE_RAN_MARS
+            pRNG->gaussian();
+#else
+            rand_gen.normal();
+#endif
+        // add the kappa term, then apply -del_uCond to j and +del_uCond to i
+        del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt;
+        uCond[j] -= del_uCond;
+        uCond_i += del_uCond;
+
+        double gammaFactor = halfgamma_ij*wdt*ftm2v;
+        double inv_1p_mu_gammaFactor = 1.0/(1.0 + (massinv_i + massinv_j)*gammaFactor);
+        // j's velocity plus both squared speeds before this pair's update (dot3 for i, dot4 for j)
+        double vxj = v(j, 0);
+        double vyj = v(j, 1);
+        double vzj = v(j, 2);
+        double dot4 = vxj*vxj + vyj*vyj + vzj*vzj;
+        double dot3 = vxi*vxi + vyi*vyi + vzi*vzi;
+
+        // Compute the initial velocity difference between atom i and atom j
+        double delvx = vxi - vxj;
+        double delvy = vyi - vyj;
+        double delvz = vzi - vzj;
+        double dot_rinv = (delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz);
+
+        // Compute momentum change between t and t+dt
+        double factorA = sigmaRand - gammaFactor*dot_rinv;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorA*massinv_i;
+        vyi += dely_rinv*factorA*massinv_i;
+        vzi += delz_rinv*factorA*massinv_i;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorA*massinv_j;
+        vyj -= dely_rinv*factorA*massinv_j;
+        vzj -= delz_rinv*factorA*massinv_j;
+
+        //ii.   Compute the new velocity diff
+        delvx = vxi - vxj;
+        delvy = vyi - vyj;
+        delvz = vzi - vzj;
+        dot_rinv = delx_rinv*delvx + dely_rinv*delvy + delz_rinv*delvz;
+
+        // Compute the new momentum change between t and t+dt
+        double factorB = (sigmaRand - gammaFactor*dot_rinv)*inv_1p_mu_gammaFactor;
+
+        // Update the velocity on i
+        vxi += delx_rinv*factorB*massinv_i;
+        vyi += dely_rinv*factorB*massinv_i;
+        vzi += delz_rinv*factorB*massinv_i;
+        double partial_uMech = (vxi*vxi + vyi*vyi + vzi*vzi - dot3)*massinv_j;
+
+        // Update the velocity on j
+        vxj -= delx_rinv*factorB*massinv_j;
+        vyj -= dely_rinv*factorB*massinv_j;
+        vzj -= delz_rinv*factorB*massinv_j;
+        partial_uMech += (vxj*vxj + vyj*vyj + vzj*vzj - dot4)*massinv_i;
+
+        // Store updated velocity for j
+        v(j, 0) = vxj;
+        v(j, 1) = vyj;
+        v(j, 2) = vzj;
+
+        // Compute uMech
+        double del_uMech = partial_uMech*mass_ij_div_neg4_ftm2v;
+        uMech_i += del_uMech;
+        uMech(j) += del_uMech;
+      }
+    }
+    // store updated velocity for i
+    v(i, 0) = vxi;
+    v(i, 1) = vyi;
+    v(i, 2) = vzi;
+    // store updated uMech and uCond for i
+    uMech(i) = uMech_i;
+    uCond(i) = uCond_i;
+    ii++;  // advance to the next neighbor-list row
+  }
+  // without RanMars, hand the generator state obtained at the top back to the pool
+#ifndef DPD_USE_RAN_MARS
+  rand_pool.free_state(rand_gen);
+#endif
+}
+
+// Perform the Shardlow (SSA) update: all local work phases first, then each ghost phase with comm.
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::initial_integrate(int vflag)
+{
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+  // copymode stays 1 while the kernels below are launched and is reset to 0 before returning
+  copymode = 1;
+
+  dtsqrt = sqrt(update->dt);
+  // the phase and work-item tables are read from the NPairSSAKokkos attached to the neighbor list
+  NPairSSAKokkos<DeviceType> *np_ssa = dynamic_cast<NPairSSAKokkos<DeviceType>*>(list->np);
+  if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairSSAKokkos object");
+  ssa_phaseCt = np_ssa->ssa_phaseCt;
+  ssa_phaseLen = np_ssa->ssa_phaseLen;
+  ssa_itemLoc = np_ssa->ssa_itemLoc;
+  ssa_itemLen = np_ssa->ssa_itemLen;
+  ssa_gphaseCt = np_ssa->ssa_gphaseCt;
+  ssa_gphaseLen = np_ssa->ssa_gphaseLen;
+  ssa_gitemLoc = np_ssa->ssa_gitemLoc;
+  ssa_gitemLen = np_ssa->ssa_gitemLen;
+  // item tables are indexed inside the kernels (DeviceType); phase lengths are read on the host below
+  np_ssa->k_ssa_itemLoc.template sync<DeviceType>();
+  np_ssa->k_ssa_itemLen.template sync<DeviceType>();
+  np_ssa->k_ssa_gitemLoc.template sync<DeviceType>();
+  np_ssa->k_ssa_gitemLen.template sync<DeviceType>();
+
+  np_ssa->k_ssa_phaseLen.template sync<LMPHostType>();
+  np_ssa->k_ssa_gphaseLen.template sync<LMPHostType>();
+  auto h_ssa_phaseLen = np_ssa->k_ssa_phaseLen.h_view;
+  auto h_ssa_gphaseLen = np_ssa->k_ssa_gphaseLen.h_view;
+  // RNG capacity follows the larger second extent of the local and ghost item tables; it only grows
+  int maxWorkItemCt = (int) ssa_itemLoc.dimension_1();
+  if (maxWorkItemCt < (int) ssa_gitemLoc.dimension_1()) {
+    maxWorkItemCt = (int) ssa_gitemLoc.dimension_1();
+  }
+  if (maxWorkItemCt > maxRNG) {
+#ifdef DPD_USE_RAN_MARS
+    if (pp_random) {
+      for (int i = 1; i < maxRNG; ++i) delete pp_random[i];
+      delete[] pp_random;
+      pp_random = NULL;
+    }
+    pp_random = new RanMars*[maxWorkItemCt];
+    for (int i = 1; i < maxWorkItemCt; ++i) {
+      pp_random[i] = new RanMars(lmp, k_pairDPDE->seed + comm->me + comm->nprocs*i);
+    }
+    pp_random[0] = k_pairDPDE->random;
+#else
+    rand_pool.init(k_pairDPDE->seed + comm->me, maxWorkItemCt);
+#endif
+    maxRNG = maxWorkItemCt;
+  }
+
+#ifdef DEBUG_SSA_PAIR_CT
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 3; ++j)
+      h_counters(i,j) = 0;
+  for (int i = 0; i < 32; ++i) h_hist[i] = 0;
+  deep_copy(d_counters, h_counters);
+  deep_copy(d_hist, h_hist);
+#endif
+
+  //theta_ij_inv = 1.0/k_pairDPD->temperature; // independent of i,j
+  boltz_inv = 1.0/force->boltz;
+  ftm2v = force->ftm2v;
+  dt     = update->dt;
+
+  k_params.template sync<DeviceType>();
+
+  // process neighbors in the local AIR: one kernel launch per local work phase
+  atomKK->sync(execution_space,X_MASK | V_MASK | TYPE_MASK | RMASS_MASK | UCOND_MASK | UMECH_MASK | DPDTHETA_MASK);
+  for (workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int workItemCt = h_ssa_phaseLen[workPhase];
+    // the STACKPARAMS template flag is true only when ntypes <= MAX_TYPES_STACKPARAMS
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS)
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDE<false> >(0,workItemCt),*this);
+    else
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDE<true> >(0,workItemCt),*this);
+  }
+  atomKK->modified(execution_space,V_MASK | UCOND_MASK | UMECH_MASK);
+
+  //Loop over all 13 outward directions (7 stages)
+  for (workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) {
+    // int airnum = workPhase + 1;
+    int workItemCt = h_ssa_gphaseLen[workPhase];
+
+    // Communicate the updated velocities to all nodes
+    atomKK->sync(Host,V_MASK);
+    comm->forward_comm_fix(this);
+    atomKK->modified(Host,V_MASK);
+
+    if(k_pairDPDE){
+      // Zero out the ghosts' uCond & uMech to be used as delta accumulators
+//      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
+//      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
+
+      // must capture local variables, not class variables
+      atomKK->sync(execution_space,UCOND_MASK | UMECH_MASK);
+      auto l_uCond = uCond;
+      auto l_uMech = uMech;
+      Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nlocal+nghost), LAMMPS_LAMBDA (const int i) {
+        l_uCond(i) = 0.0;
+        l_uMech(i) = 0.0;
+      });
+      atomKK->modified(execution_space,UCOND_MASK | UMECH_MASK);
+    }
+
+    // process neighbors in this AIR
+    atomKK->sync(execution_space,X_MASK | V_MASK | TYPE_MASK | RMASS_MASK | UCOND_MASK | UMECH_MASK | DPDTHETA_MASK);
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS)
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDEGhost<false> >(0,workItemCt),*this);
+    else
+      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType,TagFixShardlowSSAUpdateDPDEGhost<true> >(0,workItemCt),*this);
+    atomKK->modified(execution_space,V_MASK | UCOND_MASK | UMECH_MASK);
+
+    // Communicate the ghost deltas to the atom owners
+    atomKK->sync(Host,V_MASK | UCOND_MASK | UMECH_MASK);
+    comm->reverse_comm_fix(this);
+    atomKK->modified(Host,V_MASK | UCOND_MASK | UMECH_MASK);
+
+  }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
+
+#ifdef DEBUG_SSA_PAIR_CT
+deep_copy(h_counters, d_counters);
+deep_copy(h_hist, d_hist);
+for (int i = 0; i < 32; ++i) fprintf(stdout, "%8d", h_hist[i]);
+fprintf(stdout, "\n%6d %6d,%6d %6d: "
+  ,h_counters(0, 2)
+  ,h_counters(1, 2)
+  ,h_counters(0, 1)
+  ,h_counters(1, 1)
+);
+#endif
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+template<bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void FixShardlowKokkos<DeviceType>::operator()(TagFixShardlowSSAUpdateDPDE<STACKPARAMS>, const int &workItem) const {
+  // hand this work item's start offset and length in the current local phase to the updater
+  ssa_update_dpde<STACKPARAMS>(ssa_itemLoc(workPhase, workItem),
+                               ssa_itemLen(workPhase, workItem), workItem);
+}
+
+template<class DeviceType>
+template<bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void FixShardlowKokkos<DeviceType>::operator()(TagFixShardlowSSAUpdateDPDEGhost<STACKPARAMS>, const int &workItem) const {
+  // hand this work item's start offset and length in the current ghost phase to the updater
+  ssa_update_dpde<STACKPARAMS>(ssa_gitemLoc(workPhase, workItem),
+                               ssa_gitemLen(workPhase, workItem), workItem);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // pack the host-side velocity of each listed atom, 3 values per atom;
+  // pbc_flag and pbc are not used; returns the number of values written
+  int nbuf = 0;
+  for (int k = 0; k < n; k++) {
+    const int atomIdx = list[k];
+    for (int d = 0; d < 3; d++) {
+      buf[nbuf++] = h_v(atomIdx, d);
+    }
+  }
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  // store each received velocity in h_v and keep a copy in h_v_t0,
+  // whose rows are indexed relative to nlocal
+  const double *src = buf;
+  const int last = first + n;
+  for (int g = first; g < last; g++) {
+    for (int d = 0; d < 3; d++) {
+      h_v_t0(g - nlocal, d) = h_v(g, d) = *src++;
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int FixShardlowKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // for each atom in [first, first+n): pack its velocity minus the h_v_t0 copy,
+  // followed by its uCond and uMech when k_pairDPDE is set
+  int nbuf = 0;
+  const int last = first + n;
+  for (int g = first; g < last; g++) {
+    const int row = g - nlocal; // row of this atom in h_v_t0
+    for (int d = 0; d < 3; d++) {
+      buf[nbuf++] = h_v(g, d) - h_v_t0(row, d);
+    }
+    if (!k_pairDPDE) continue;
+    buf[nbuf++] = h_uCond(g); // for ghosts, this is an accumulated delta
+    buf[nbuf++] = h_uMech(g); // for ghosts, this is an accumulated delta
+  }
+  return nbuf;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixShardlowKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // add the received deltas onto the atoms named in list; per atom the buffer
+  // holds 3 velocity deltas, then uCond and uMech deltas when k_pairDPDE is set
+  const double *src = buf;
+
+  for (int k = 0; k < n; k++) {
+    const int owner = list[k];
+    for (int d = 0; d < 3; d++) {
+      h_v(owner, d) += *src++;
+    }
+    if (!k_pairDPDE) continue;
+    h_uCond(owner) += *src++; // add in the accumulated delta
+    h_uMech(owner) += *src++; // add in the accumulated delta
+  }
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double FixShardlowKokkos<DeviceType>::memory_usage()
+{
+  // only v_t0[] is reported: 3 doubles for each of ghostmax entries
+  const double v_t0_bytes = sizeof(double)*3*ghostmax;
+  return v_t0_bytes;
+}
+
+namespace LAMMPS_NS {
+template class FixShardlowKokkos<LMPDeviceType>; // explicit instantiation for the device type
+#ifdef KOKKOS_HAVE_CUDA
+template class FixShardlowKokkos<LMPHostType>; // host type is instantiated only when KOKKOS_HAVE_CUDA is defined
+#endif
+}
diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h
new file mode 100644
index 0000000000..70dccf2e2d
--- /dev/null
+++ b/src/KOKKOS/fix_shardlow_kokkos.h
@@ -0,0 +1,196 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(shardlow/kk,FixShardlowKokkos<LMPDeviceType>)
+FixStyle(shardlow/kk/device,FixShardlowKokkos<LMPDeviceType>)
+FixStyle(shardlow/kk/host,FixShardlowKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_SHARDLOW_KOKKOS_H
+#define LMP_FIX_SHARDLOW_KOKKOS_H
+
+#include "float.h"
+#include "fix_shardlow.h"
+#include "kokkos_type.h"
+#include "neigh_list_kokkos.h"
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+#include "pair_dpd_fdt_kokkos.h"
+#endif
+#include "pair_dpd_fdt_energy_kokkos.h"
+
+namespace LAMMPS_NS {
+
+template<bool STACKPARAMS>
+struct TagFixShardlowSSAUpdateDPDE{}; // empty tag selecting the operator() overload for local work phases
+// empty tag selecting the operator() overload for ghost work phases
+template<bool STACKPARAMS>
+struct TagFixShardlowSSAUpdateDPDEGhost{};
+// Kokkos variant of FixShardlow, templated on the device type its views and kernels use.
+template<class DeviceType>
+class FixShardlowKokkos : public FixShardlow {
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+  NeighListKokkos<DeviceType> *k_list; // The SSA specific neighbor list
+
+  FixShardlowKokkos(class LAMMPS *, int, char **);
+  ~FixShardlowKokkos();
+  int setmask();
+  virtual void init();
+  virtual void init_list(int, class NeighList *);
+  virtual void initial_integrate(int);
+  void setup_pre_neighbor();
+  void pre_neighbor();
+
+  double memory_usage(); // reports the v_t0[] storage only
+  // comm callbacks: forward carries velocities, reverse carries velocity (and uCond/uMech) deltas
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  int pack_forward_comm(int , int *, double *, int, int *);
+  void unpack_forward_comm(int , int , double *);
+  // per type-pair coefficients; both constructors set cutinv=FLT_MAX and the other fields to 0
+  struct params_ssa {
+    KOKKOS_INLINE_FUNCTION
+    params_ssa(){cutinv=FLT_MAX;halfsigma=0;kappa=0;alpha=0;};
+    KOKKOS_INLINE_FUNCTION
+    params_ssa(int i){cutinv=FLT_MAX;halfsigma=0;kappa=0;alpha=0;};
+    F_FLOAT cutinv,halfsigma,kappa,alpha;
+  };
+  // kernel entry points, dispatched by tag type; the int argument is the work item index
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixShardlowSSAUpdateDPDE<STACKPARAMS>, const int&) const;
+
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixShardlowSSAUpdateDPDEGhost<STACKPARAMS>, const int&) const;
+
+#ifdef DEBUG_SSA_PAIR_CT
+  typename AT::t_int_2d d_counters;
+  typename HAT::t_int_2d h_counters;
+  typename AT::t_int_1d d_hist;
+  typename HAT::t_int_1d h_hist;
+#endif
+
+ protected:
+  int workPhase; // index of the phase currently being launched; read by the operator() overloads
+  double theta_ij_inv,boltz_inv,ftm2v,dt;
+
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+//  class PairDPDfdt *pairDPD; FIXME as per k_pairDPDE below
+#endif
+  PairDPDfdtEnergyKokkos<DeviceType> *k_pairDPDE;
+  // maxRNG: number of work items the random number generators are currently sized for
+  int maxRNG;
+#ifdef DPD_USE_RAN_MARS
+  class RanMars **pp_random;
+#elif defined(DPD_USE_Random_XorShift1024)
+  Kokkos::Random_XorShift1024_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift1024_Pool<DeviceType>::generator_type rand_type;
+#else
+  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+#endif
+
+  Kokkos::DualView<params_ssa**,Kokkos::LayoutRight,DeviceType> k_params;
+  typename Kokkos::DualView<params_ssa**,
+    Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  // hardwired to space for MAX_TYPES_STACKPARAMS (12) atom types
+  params_ssa m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  typename DAT::tdual_v_array k_v_t0;
+  // typename AT::t_v_array d_v_t0; v_t0 only used in comm routines (on host)
+  typename HAT::t_v_array h_v_t0;
+  // per-atom views; the h_ prefixed ones are the host views used by the comm routines
+  typename AT::t_x_array x;
+  typename AT::t_v_array v;
+  typename HAT::t_v_array h_v;
+  typename AT::t_efloat_1d uCond, uMech;
+  typename HAT::t_efloat_1d h_uCond, h_uMech;
+  typename AT::t_int_1d type;
+  bool massPerI;
+  typename AT::t_float_1d_randomread masses;
+  typename AT::t_efloat_1d dpdTheta;
+
+  double dtsqrt; // = sqrt(update->dt);
+  int ghostmax;
+  int nlocal, nghost;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist, d_numneigh;
+  // local work phases: phase count, per-phase item count, per-item start offset and length
+  int ssa_phaseCt;
+  typename AT::t_int_1d ssa_phaseLen;
+  typename AT::t_int_2d ssa_itemLoc, ssa_itemLen;
+  // ghost work phases: same layout as the local tables above
+  int ssa_gphaseCt;
+  typename AT::t_int_1d ssa_gphaseLen;
+  typename AT::t_int_2d ssa_gitemLoc, ssa_gitemLen;
+
+
+#ifdef ENABLE_KOKKOS_DPD_CONSTANT_TEMPERATURE
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void ssa_update_dpd(int, int, int) const;  // Constant Temperature
+#endif
+  template<bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void ssa_update_dpde(int, int, int) const; // Constant Energy
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Must use dpd/fdt pair_style with fix shardlow
+
+Self-explanatory.
+
+E: Must use pair_style dpd/fdt or dpd/fdt/energy with fix shardlow
+
+E: A deterministic integrator must be specified after fix shardlow in input
+file (e.g. fix nve or fix nph).
+
+Self-explanatory.
+
+E: Cannot use constant temperature integration routines with DPD
+
+Self-explanatory.  Must use deterministic integrators such as nve or nph
+
+E: Fix shardlow does not yet support triclinic geometries
+
+Self-explanatory.
+
+E:  Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either
+reduce the number of processors requested, or change the cutoff/skin
+
+The Shardlow splitting algorithm requires the sub-domain lengths
+to be larger than twice the cutoff+skin.  Generally, the domain decomposition
+is dependent on the number of processors requested.
+
+*/
diff --git a/src/KOKKOS/fix_wall_lj93_kokkos.cpp b/src/KOKKOS/fix_wall_lj93_kokkos.cpp
new file mode 100644
index 0000000000..b0f7e0bda4
--- /dev/null
+++ b/src/KOKKOS/fix_wall_lj93_kokkos.cpp
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include "fix_wall_lj93_kokkos.h"
+#include "atom_kokkos.h"
+#include "error.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template <class DeviceType>
+FixWallLJ93Kokkos<DeviceType>::FixWallLJ93Kokkos(LAMMPS *lmp, int narg, char **arg)
+  : FixWallLJ93(lmp, narg, arg) // all arguments are forwarded to the base class
+{
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  kokkosable = 1;
+  datamask_modify = EMPTY_MASK;
+  datamask_read = EMPTY_MASK;
+}
+
+/* ----------------------------------------------------------------------
+   interaction of all particles in group with a wall
+   m = index of wall coeffs
+   which = xlo,xhi,ylo,yhi,zlo,zhi
+   error if any particle is on or behind wall
+------------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixWallLJ93Kokkos<DeviceType>::wall_particle(int m_in, int which, double coord_in)
+{
+  m = m_in;
+  coord = coord_in;
+  // bring positions, forces and group masks up to date in this execution space
+  atomKK->sync(execution_space, X_MASK|F_MASK|MASK_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  DAT::tdual_int_scalar k_oneflag = DAT::tdual_int_scalar("fix:oneflag");
+  d_oneflag = k_oneflag.view<DeviceType>();
+  int nlocal = atom->nlocal;
+  // which = xlo,xhi,ylo,yhi,zlo,zhi: dim is the axis, side is -1 for lo and +1 for hi
+  dim = which / 2;
+  side = which % 2;
+  if (side == 0) side = -1;
+  // parallel_reduce overwrites its result, so reduce into scratch and add this wall's slots to ewall
+  double result[7] = {0.0}; // energy slot + one force slot for each of the 6 possible walls
+  copymode = 1;
+  FixWallLJ93KokkosFunctor<DeviceType> wp_functor(this);
+  Kokkos::parallel_reduce(nlocal,wp_functor,result);
+  copymode = 0;
+  ewall[0] += result[0];
+  ewall[m+1] += result[m+1];
+  atomKK->modified(execution_space, F_MASK);
+  k_oneflag.template modify<DeviceType>();
+  k_oneflag.template sync<LMPHostType>();
+  if (k_oneflag.h_view()) error->one(FLERR,"Particle on or inside fix wall surface");
+}
+
+template <class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixWallLJ93Kokkos<DeviceType>::wall_particle_item(int i, value_type ewall) const {
+  // atoms outside the fix group are skipped
+  if (!(mask(i) & groupbit)) return;
+
+  // signed distance of atom i from the wall along dim
+  const double dist = (side < 0) ? x(i,dim) - coord : coord - x(i,dim);
+  if (dist >= cutoff[m]) return;
+  if (dist <= 0.0) {
+    d_oneflag() = 1; // atom is on or behind the wall; wall_particle checks this flag
+    return;
+  }
+
+  const double inv1 = 1.0/dist;
+  const double inv2 = inv1*inv1;
+  const double inv4 = inv2*inv2;
+  const double inv10 = inv4*inv4*inv2;
+  const double wallForce = side * (coeff1[m]*inv10 - coeff2[m]*inv4);
+  f(i,dim) -= wallForce;
+  ewall[0] += coeff3[m]*inv4*inv4*inv1 - coeff4[m]*inv2*inv1 - offset[m];
+  ewall[m+1] += wallForce;
+}
+
+namespace LAMMPS_NS {
+template class FixWallLJ93Kokkos<LMPDeviceType>; // explicit instantiation for the device type
+#ifdef KOKKOS_HAVE_CUDA
+template class FixWallLJ93Kokkos<LMPHostType>; // host type is instantiated only when KOKKOS_HAVE_CUDA is defined
+#endif
+}
diff --git a/src/KOKKOS/fix_wall_lj93_kokkos.h b/src/KOKKOS/fix_wall_lj93_kokkos.h
new file mode 100644
index 0000000000..64f3c59a62
--- /dev/null
+++ b/src/KOKKOS/fix_wall_lj93_kokkos.h
@@ -0,0 +1,83 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+
+FixStyle(wall/lj93/kk,FixWallLJ93Kokkos<LMPDeviceType>)
+FixStyle(wall/lj93/kk/device,FixWallLJ93Kokkos<LMPDeviceType>)
+FixStyle(wall/lj93/kk/host,FixWallLJ93Kokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_FIX_WALL_LJ93_KOKKOS_H
+#define LMP_FIX_WALL_LJ93_KOKKOS_H
+
+#include "fix_wall_lj93.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+// Kokkos variant of FixWallLJ93; wall_particle launches a reduction over the local atoms.
+template <class DeviceType>
+class FixWallLJ93Kokkos : public FixWallLJ93 {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef double value_type[];
+
+  FixWallLJ93Kokkos(class LAMMPS *, int, char **);
+  void wall_particle(int, int, double);
+  // index of wall coeffs, set by wall_particle; public because the functor reads it
+  int m;
+  // per-atom body of the reduction; adds into the value_type array it is given
+  KOKKOS_INLINE_FUNCTION
+  void wall_particle_item(int, value_type) const;
+
+ private:
+  int dim,side;
+  double coord;
+  // views refreshed by wall_particle on every call
+  typename AT::t_x_array x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d mask;
+  typename AT::t_int_scalar d_oneflag;
+};
+
+template <class DeviceType>
+struct FixWallLJ93KokkosFunctor  {
+  typedef DeviceType device_type ;
+  typedef double value_type[];
+  const int value_count; // length of the array handed to operator() by the reduction
+  // holds a copy of the fix so the kernel can call wall_particle_item on it
+  FixWallLJ93Kokkos<DeviceType> c;
+  FixWallLJ93KokkosFunctor(FixWallLJ93Kokkos<DeviceType>* c_ptr):
+    value_count(c_ptr->m+2), // wall_particle_item writes slots 0 and m+1, so m+2 slots are needed
+    c(*c_ptr) {}             // initializers are listed in member declaration order
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int i, value_type ewall) const {
+    c.wall_particle_item(i,ewall);
+  }
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Particle on or inside fix wall surface
+
+Particles must be "exterior" to the wall in order for energy/force to
+be calculated.
+
+*/
diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp
index b8be74ac1e..072a802b54 100644
--- a/src/KOKKOS/kokkos.cpp
+++ b/src/KOKKOS/kokkos.cpp
@@ -293,4 +293,4 @@ void KokkosLMP::my_signal_handler(int sig)
   if (sig == SIGSEGV) {
     kill(getpid(),SIGABRT);
   }
-}
\ No newline at end of file
+}
diff --git a/src/KOKKOS/nbin_kokkos.cpp b/src/KOKKOS/nbin_kokkos.cpp
index 5e41787247..c7e815928a 100644
--- a/src/KOKKOS/nbin_kokkos.cpp
+++ b/src/KOKKOS/nbin_kokkos.cpp
@@ -95,7 +95,6 @@ void NBinKokkos<DeviceType>::bin_atoms()
     MemsetZeroFunctor<DeviceType> f_zero;
     f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
     Kokkos::parallel_for(mbins, f_zero);
-    DeviceType::fence();
 
     atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
     x = atomKK->k_x.view<DeviceType>();
@@ -106,7 +105,6 @@ void NBinKokkos<DeviceType>::bin_atoms()
     NPairKokkosBinAtomsFunctor<DeviceType> f(*this);
 
     Kokkos::parallel_for(atom->nlocal+atom->nghost, f);
-    DeviceType::fence();
 
     deep_copy(h_resize, d_resize);
     if(h_resize()) {
diff --git a/src/KOKKOS/nbin_ssa_kokkos.cpp b/src/KOKKOS/nbin_ssa_kokkos.cpp
new file mode 100644
index 0000000000..ab97cb5848
--- /dev/null
+++ b/src/KOKKOS/nbin_ssa_kokkos.cpp
@@ -0,0 +1,307 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos (ARL) and Timothy I. Mattox (Engility Corporation)
+------------------------------------------------------------------------- */
+
+#include "nbin_ssa_kokkos.h"
+#include "neighbor.h"
+#include "atom_kokkos.h"
+#include "group.h"
+#include "domain.h"
+#include "comm.h"
+#include "update.h"
+#include "error.h"
+#include "atom_masks.h"
+
+// #include "memory.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+NBinSSAKokkos<DeviceType>::NBinSSAKokkos(LAMMPS *lmp) : NBinStandard(lmp)
+{
+  atoms_per_bin = ghosts_per_gbin = 16; // starting capacities; bin_atoms recomputes both
+  // scalar device views: the resize flag and the local-bin bounding box limits
+  d_resize = typename AT::t_int_scalar("NBinSSAKokkos::d_resize");
+  d_lbinxlo = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinxlo");
+  d_lbinylo = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinylo");
+  d_lbinzlo = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinzlo");
+  d_lbinxhi = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinxhi");
+  d_lbinyhi = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinyhi");
+  d_lbinzhi = typename AT::t_int_scalar("NBinSSAKokkos::d_lbinzhi");
+#ifndef KOKKOS_USE_CUDA_UVM // without UVM: separate host mirrors of the device scalars
+  h_resize = Kokkos::create_mirror_view(d_resize);
+  h_lbinxlo = Kokkos::create_mirror_view(d_lbinxlo);
+  h_lbinylo = Kokkos::create_mirror_view(d_lbinylo);
+  h_lbinzlo = Kokkos::create_mirror_view(d_lbinzlo);
+  h_lbinxhi = Kokkos::create_mirror_view(d_lbinxhi);
+  h_lbinyhi = Kokkos::create_mirror_view(d_lbinyhi);
+  h_lbinzhi = Kokkos::create_mirror_view(d_lbinzhi);
+#else // with UVM: the host handles alias the device views directly
+  h_resize = d_resize;
+  h_lbinxlo = d_lbinxlo;
+  h_lbinylo = d_lbinylo;
+  h_lbinzlo = d_lbinzlo;
+  h_lbinxhi = d_lbinxhi;
+  h_lbinyhi = d_lbinyhi;
+  h_lbinzhi = d_lbinzhi;
+#endif
+  h_resize() = 1;
+  // ghost bin counters: 8 entries, matching the 8 rows allocated for gbins
+  k_gbincount = DAT::tdual_int_1d("NBinSSAKokkos::gbincount",8);
+  gbincount = k_gbincount.view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NBinSSAKokkos<DeviceType>::bin_atoms_setup(int nall)
+{
+  if (mbins > (int) k_bins.h_view.dimension_0()) {
+    k_bins = DAT::tdual_int_2d("NBinSSAKokkos::bins",mbins,atoms_per_bin);
+    bins = k_bins.view<DeviceType>();
+
+    k_bincount = DAT::tdual_int_1d("NBinSSAKokkos::bincount",mbins);
+    bincount = k_bincount.view<DeviceType>();
+  }
+  // nall is not used here; the ghost capacity guess comes from atom->nghost
+  ghosts_per_gbin = atom->nghost / 7; // estimate needed size
+  // gbins is only reallocated when the estimate exceeds its current second extent
+  if (ghosts_per_gbin > (int) k_gbins.h_view.dimension_1()) {
+    k_gbins = DAT::tdual_int_2d("NBinSSAKokkos::gbins",8,ghosts_per_gbin);
+    gbins = k_gbins.view<DeviceType>();
+  }
+  // lo starts at its largest value and hi at 0 so the min/max updates in binIDAtomsItem can tighten them
+  // Clear the local bin extent bounding box.
+  h_lbinxlo() = mbinx - 1; // Safe to = stencil->sx + 1
+  h_lbinylo() = mbiny - 1; // Safe to = stencil->sy + 1
+  h_lbinzlo() = mbinz - 1; // Safe to = stencil->sz + 1
+  h_lbinxhi() = 0; // Safe to = mbinx - stencil->sx - 1
+  h_lbinyhi() = 0; // Safe to = mbiny - stencil->sy - 1
+  h_lbinzhi() = 0; // Safe to = mbinz - stencil->sz - 1
+  deep_copy(d_lbinxlo, h_lbinxlo);
+  deep_copy(d_lbinylo, h_lbinylo);
+  deep_copy(d_lbinzlo, h_lbinzlo);
+  deep_copy(d_lbinxhi, h_lbinxhi);
+  deep_copy(d_lbinyhi, h_lbinyhi);
+  deep_copy(d_lbinzhi, h_lbinzhi);
+}
+
+/* ----------------------------------------------------------------------
+   bin owned and ghost atoms for the Shardlow Splitting Algorithm (SSA)
+   local atoms are in distinct bins (binhead[]) from the ghosts
+   ghost atoms are "binned" in gairhead_ssa[] instead
+     ghosts which are not in an Active Interaction Region (AIR) are skipped
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NBinSSAKokkos<DeviceType>::bin_atoms()
+{
+  last_bin = update->ntimestep;
+
+  int nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  int nall = nlocal + nghost;
+
+  atomKK->sync(ExecutionSpaceFromDevice<DeviceType>::space,X_MASK);
+  x = atomKK->k_x.view<DeviceType>();
+  // snapshot the sub-domain and bounding-box bounds into the members with a trailing underscore
+  sublo_[0] = domain->sublo[0];
+  sublo_[1] = domain->sublo[1];
+  sublo_[2] = domain->sublo[2];
+  subhi_[0] = domain->subhi[0];
+  subhi_[1] = domain->subhi[1];
+  subhi_[2] = domain->subhi[2];
+
+  bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
+  bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
+  // binID is scratch storage over all nall atoms; it is replaced by an empty array at the end
+  k_binID = DAT::tdual_int_1d("NBinSSAKokkos::binID",nall);
+  binID = k_binID.view<DeviceType>();
+  // NOTE(review): bincount is zeroed only in the local binning stage below; if this pass counts into it, stale counts may inflate atoms_per_bin - confirm
+  // find each local atom's binID
+  {
+    atoms_per_bin = 0;
+    NPairSSAKokkosBinIDAtomsFunctor<DeviceType> f(*this);
+    Kokkos::parallel_reduce(nlocal, f, atoms_per_bin);
+  }
+  deep_copy(h_lbinxlo, d_lbinxlo);
+  deep_copy(h_lbinylo, d_lbinylo);
+  deep_copy(h_lbinzlo, d_lbinzlo);
+  deep_copy(h_lbinxhi, d_lbinxhi);
+  deep_copy(h_lbinyhi, d_lbinyhi);
+  deep_copy(h_lbinzhi, d_lbinzhi);
+  // ghost counters are zeroed before the counting pass here and again before the binning pass below
+  // find each ghost's binID (AIR number)
+  {
+    for (int i = 0; i < 8; i++) k_gbincount.h_view(i) = 0;
+    k_gbincount.modify<LMPHostType>();
+    k_gbincount.sync<DeviceType>();
+    ghosts_per_gbin = 0;
+    NPairSSAKokkosBinIDGhostsFunctor<DeviceType> f(*this);
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nall), f, ghosts_per_gbin);
+  }
+
+  // actually bin the ghost atoms
+  {
+    if(ghosts_per_gbin > (int) gbins.dimension_1()) {
+      k_gbins = DAT::tdual_int_2d("gbins", 8, ghosts_per_gbin);
+      gbins = k_gbins.view<DeviceType>();
+    }
+    for (int i = 0; i < 8; i++) k_gbincount.h_view(i) = 0;
+    k_gbincount.modify<LMPHostType>();
+    k_gbincount.sync<DeviceType>();
+    // local copies of the views are what the lambdas below capture
+    auto binID_ = binID;
+    auto gbincount_ = gbincount;
+    auto gbins_ = gbins;
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(nlocal,nall),
+      LAMMPS_LAMBDA (const int i) {
+      const int iAIR = binID_(i);
+      if (iAIR > 0) { // include only ghost atoms in an AIR
+        const int ac = Kokkos::atomic_fetch_add(&gbincount_[iAIR], (int)1);
+        gbins_(iAIR, ac) = i;
+      }
+    });
+    Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType>(1,8),
+      LAMMPS_LAMBDA (const int i) {
+      sortBin(gbincount_, gbins_, i);
+    });
+  }
+  c_gbins = gbins; // gbins won't change until the next bin_atoms
+
+  // actually bin the local atoms
+  {
+    if ((mbins > (int) bins.dimension_0()) ||
+        (atoms_per_bin > (int) bins.dimension_1())) {
+      k_bins = DAT::tdual_int_2d("bins", mbins, atoms_per_bin);
+      bins = k_bins.view<DeviceType>();
+    }
+    MemsetZeroFunctor<DeviceType> f_zero;
+    f_zero.ptr = (void*) k_bincount.view<DeviceType>().ptr_on_device();
+    Kokkos::parallel_for(mbins, f_zero);
+
+    auto bincount_ = bincount;
+    auto bins_ = bins;
+
+    NPairSSAKokkosBinAtomsFunctor<DeviceType> f(*this);
+    Kokkos::parallel_for(nlocal, f);
+    // each bin's atom list is sorted after filling, as the ghost bins were above
+    Kokkos::parallel_for(mbins,
+      LAMMPS_LAMBDA (const int i) {
+      sortBin(bincount_, bins_, i);
+    });
+  }
+  k_bins.modify<DeviceType>();
+  k_bincount.modify<DeviceType>();
+  c_bins = bins; // bins won't change until the next bin_atoms
+
+  k_gbins.modify<DeviceType>();
+  k_gbincount.modify<DeviceType>();
+
+//now dispose of the k_binID array
+  k_binID = DAT::tdual_int_1d("NBinSSAKokkos::binID",0);
+  binID = k_binID.view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binAtomsItem(const int &i) const
+{
+  // atomically claim the next free slot in atom i's bin and record the atom there
+  const int slot = Kokkos::atomic_fetch_add(&(bincount[binID(i)]), (int)1);
+  bins(binID(i), slot) = i;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binIDAtomsItem(const int &i, int &update) const
+{
+  int cell[3]; // per-axis bin coordinates filled in by coord2bin
+  const int binOfAtom = coord2bin(x(i, 0), x(i, 1), x(i, 2), &(cell[0]));
+  binID(i) = binOfAtom;
+  // Grow the bin-space bounding box of the local atoms:
+  // lo tracks the smallest coordinate seen, hi tracks one past the largest
+  if (cell[0] < d_lbinxlo()) Kokkos::atomic_fetch_min(&d_lbinxlo(),cell[0]);
+  if (cell[0] >= d_lbinxhi()) Kokkos::atomic_fetch_max(&d_lbinxhi(),cell[0] + 1);
+  if (cell[1] < d_lbinylo()) Kokkos::atomic_fetch_min(&d_lbinylo(),cell[1]);
+  if (cell[1] >= d_lbinyhi()) Kokkos::atomic_fetch_max(&d_lbinyhi(),cell[1] + 1);
+  if (cell[2] < d_lbinzlo()) Kokkos::atomic_fetch_min(&d_lbinzlo(),cell[2]);
+  if (cell[2] >= d_lbinzhi()) Kokkos::atomic_fetch_max(&d_lbinzhi(),cell[2] + 1);
+  // count this atom in its bin and fold the new count into the running maximum
+  const int countNow = Kokkos::atomic_fetch_add(&(bincount[binOfAtom]), (int)1) + 1;
+  if (update < countNow) update = countNow;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::binIDGhostsItem(const int &i, int &update) const
+{
+  // record ghost i's AIR number; only ghosts with an AIR number > 0 are counted
+  const int air = coord2ssaAIR(x(i, 0), x(i, 1), x(i, 2));
+  binID(i) = air;
+  if (air <= 0) return;
+  const int countNow = Kokkos::atomic_fetch_add(&gbincount[air], (int)1) + 1;
+  if (update < countNow) update = countNow;
+}
+
+// In-place heapsort (no recursion) of the first gbincount(ibin) entries of row ibin, ascending
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void NBinSSAKokkos<DeviceType>::sortBin(
+      typename AT::t_int_1d gbincount,
+      typename AT::t_int_2d gbins,
+      const int &ibin)
+{
+  int heapLen = gbincount(ibin);
+  int build = heapLen/2;
+  int held;
+
+  for (;;) { /* runs until the extraction stage empties the heap */
+    if (build > 0) { /* stage 1: build the heap, taking parents from the middle down to 0 */
+      --build;
+      held = gbins(ibin, build);    /* value to sift down from this parent */
+    } else {     /* stage 2: extract elements in place */
+      if ((--heapLen) <= 0) return; /* heap is empty: the row is sorted */
+      held = gbins(ibin, heapLen);    /* last heap value, about to be overwritten */
+      gbins(ibin, heapLen) = gbins(ibin, 0); /* current maximum goes just past the heap */
+    }
+    int hole = build; /* position that held is being sifted down from */
+    int kid = build*2 + 1; /* left child of hole */
+    /* sift down: move larger children up until held fits */
+    while (kid < heapLen) {
+      /* pick the larger of the two children */
+      if ((kid + 1 < heapLen) && (gbins(ibin, kid + 1) > gbins(ibin, kid))) ++kid;
+      if (gbins(ibin, kid) <= held) break; /* held belongs at hole */
+      gbins(ibin, hole) = gbins(ibin, kid); /* larger child moves up */
+      hole = kid; /* continue from the child's position */
+      kid = hole*2+1; /* its left child */
+    }
+    gbins(ibin, hole) = held; /* place held in the heap */
+  }
+}
+
+namespace LAMMPS_NS {
+template class NBinSSAKokkos<LMPDeviceType>; // explicit instantiation for the device type
+#ifdef KOKKOS_HAVE_CUDA
+template class NBinSSAKokkos<LMPHostType>; // host type is instantiated only when KOKKOS_HAVE_CUDA is defined
+#endif
+}
diff --git a/src/KOKKOS/nbin_ssa_kokkos.h b/src/KOKKOS/nbin_ssa_kokkos.h
new file mode 100644
index 0000000000..cc98859913
--- /dev/null
+++ b/src/KOKKOS/nbin_ssa_kokkos.h
@@ -0,0 +1,246 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef NBIN_CLASS
+
+NBinStyle(ssa/kk/host,
+          NBinSSAKokkos<LMPHostType>,
+          NB_SSA | NB_KOKKOS_HOST)
+
+NBinStyle(ssa/kk/device,
+          NBinSSAKokkos<LMPDeviceType>,
+          NB_SSA | NB_KOKKOS_DEVICE)
+
+#else
+
+#ifndef LMP_NBIN_SSA_KOKKOS_H
+#define LMP_NBIN_SSA_KOKKOS_H
+
+#include "nbin_standard.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>
+class NBinSSAKokkos : public NBinStandard { // atom binning for the SSA neighbor build; registered above as ssa/kk/host and ssa/kk/device
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+
+  NBinSSAKokkos(class LAMMPS *);
+  ~NBinSSAKokkos() {}
+  void bin_atoms_setup(int);
+  void bin_atoms();
+
+  // temporary array to hold the binID for each atom (dual view plus its DeviceType views)
+  DAT::tdual_int_1d k_binID;
+  typename AT::t_int_1d binID;
+  typename AT::t_int_1d_const c_binID;
+
+  int atoms_per_bin; // NOTE(review): presumably the per-bin capacity of bins - confirm in the .cpp
+  DAT::tdual_int_1d k_bincount;
+  DAT::tdual_int_2d k_bins;
+  typename AT::t_int_1d bincount;
+  typename AT::t_int_2d bins;
+  typename AT::t_int_2d_const c_bins;
+
+  int ghosts_per_gbin; // NOTE(review): presumably the per-row capacity of gbins - confirm in the .cpp
+  DAT::tdual_int_1d k_gbincount; // ghost counts, indexed by AIR number in binIDGhostsItem()
+  DAT::tdual_int_2d k_gbins;
+  typename AT::t_int_1d gbincount;
+  typename AT::t_int_2d gbins;
+  typename AT::t_int_2d_const c_gbins;
+
+  typename AT::t_int_scalar d_resize;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
+  typename AT::t_x_array_randomread x;
+
+  // Bounds of the local atoms in the bins array
+  typename AT::t_int_scalar d_lbinxlo;  // lowest local bin x-dim coordinate
+  typename AT::t_int_scalar d_lbinylo;  // lowest local bin y-dim coordinate
+  typename AT::t_int_scalar d_lbinzlo;  // lowest local bin z-dim coordinate
+  typename AT::t_int_scalar d_lbinxhi;  // highest local bin x-dim coordinate
+  typename AT::t_int_scalar d_lbinyhi;  // highest local bin y-dim coordinate
+  typename AT::t_int_scalar d_lbinzhi;  // highest local bin z-dim coordinate
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinxlo;  // host-side counterparts of the six bounds above
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinylo;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinzlo;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinxhi;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinyhi;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_lbinzhi;
+
+  // per-index work items; the functor structs that follow this class forward to them
+  KOKKOS_INLINE_FUNCTION
+  void binAtomsItem(const int &i) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void binIDAtomsItem(const int &i, int &update) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void binIDGhostsItem(const int &i, int &update) const;
+
+  static KOKKOS_INLINE_FUNCTION
+  void sortBin(
+      typename AT::t_int_1d gbincount,
+      typename AT::t_int_2d gbins,
+      const int &ibin); // sorts the first gbincount(ibin) entries of row ibin of gbins in place
+
+/* ----------------------------------------------------------------------
+   convert atom coords into the ssa active interaction region number
+------------------------------------------------------------------------- */
+  KOKKOS_INLINE_FUNCTION
+  int coord2ssaAIR(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
+  {
+    int ix, iy, iz; // per dimension: -1 below sublo_, 0 inside, +1 at or above subhi_
+    ix = iy = iz = 0;
+    if (z < sublo_[2]) iz = -1;
+    if (z >= subhi_[2]) iz = 1;
+    if (y < sublo_[1]) iy = -1;
+    if (y >= subhi_[1]) iy = 1;
+    if (x < sublo_[0]) ix = -1;
+    if (x >= subhi_[0]) ix = 1;
+    if(iz < 0){
+      return -1; // everything below the subdomain in z
+    } else if(iz == 0){
+      if( iy<0 ) return -1; // bottom left/middle/right
+      if( (iy==0) && (ix<0)  ) return -1; // left atoms
+      if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
+      if( (iy==0) && (ix>0)  ) return 2; // Right atoms
+      if( (iy>0)  && (ix==0) ) return 1; // Top-middle atoms
+      if( (iy>0)  && (ix!=0) ) return 3; // Top-right and top-left atoms
+    } else { // iz > 0
+      if((ix==0) && (iy==0)) return 4; // Back atoms
+      if((ix==0) && (iy!=0)) return 5; // Top-back and bottom-back atoms
+      if((ix!=0) && (iy==0)) return 6; // Left-back and right-back atoms
+      if((ix!=0) && (iy!=0)) return 7; // Back corner atoms
+    }
+    return -2; // not reached: every (ix,iy,iz) combination returns above
+  }
+  // map atom coords to a flat bin index; also stores the per-dimension bin indices (offset by mbin{x,y,z}lo) in i[0..2]
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
+  {
+    int ix,iy,iz;
+
+    if (x >= bboxhi_[0]) // at or above the box: count bins upward from nbinx
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx;
+    else if (x >= bboxlo_[0]) { // inside the box: clamp to the last bin
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else // below the box: truncated quotient minus one, so the index is negative
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1;
+
+    if (y >= bboxhi_[1]) // same three cases as x
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+
+    if (z >= bboxhi_[2]) // same three cases as x
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+
+    i[0] = ix - mbinxlo;
+    i[1] = iy - mbinylo;
+    i[2] = iz - mbinzlo;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+ private:
+  double bboxlo_[3],bboxhi_[3]; // box bounds read by coord2bin()
+  double sublo_[3], subhi_[3]; // subdomain bounds read by coord2ssaAIR()
+};
+
+// Functor wrapper: operator()(i) forwards to NBinSSAKokkos::binAtomsItem(i)
+template<class DeviceType>
+struct NPairSSAKokkosBinAtomsFunctor {
+  typedef DeviceType device_type;
+
+  const NBinSSAKokkos<DeviceType> c; // by-value copy of the binning object
+
+  NPairSSAKokkosBinAtomsFunctor(const NBinSSAKokkos<DeviceType> &nbin) : c(nbin) {}
+  ~NPairSSAKokkosBinAtomsFunctor() {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &idx) const {
+    c.binAtomsItem(idx);
+  }
+};
+
+// Max-reduction functor: operator()(i, update) forwards to NBinSSAKokkos::binIDAtomsItem()
+template<class DeviceType>
+struct NPairSSAKokkosBinIDAtomsFunctor {
+  typedef DeviceType device_type;
+  typedef int value_type;
+
+  const NBinSSAKokkos<DeviceType> c; // by-value copy of the binning object
+
+  NPairSSAKokkosBinIDAtomsFunctor(const NBinSSAKokkos<DeviceType> &nbin) : c(nbin) {}
+  ~NPairSSAKokkosBinIDAtomsFunctor() {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &idx, value_type &update) const {
+    c.binIDAtomsItem(idx, update);
+  }
+
+  // combine two partial results, keeping the larger one
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type &dst, const volatile value_type &src) const {
+    if (dst < src) dst = src;
+  }
+
+  // every partial result starts at the smallest int
+  KOKKOS_INLINE_FUNCTION
+  void init (value_type &dst) const { dst = INT_MIN; }
+};
+
+// Max-reduction functor: operator()(i, update) forwards to NBinSSAKokkos::binIDGhostsItem()
+template<class DeviceType>
+struct NPairSSAKokkosBinIDGhostsFunctor {
+  typedef DeviceType device_type;
+  typedef int value_type;
+
+  const NBinSSAKokkos<DeviceType> c; // by-value copy of the binning object
+
+  NPairSSAKokkosBinIDGhostsFunctor(const NBinSSAKokkos<DeviceType> &nbin) : c(nbin) {}
+  ~NPairSSAKokkosBinIDGhostsFunctor() {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &idx, value_type &update) const {
+    c.binIDGhostsItem(idx, update);
+  }
+
+  // combine two partial results, keeping the larger one
+  KOKKOS_INLINE_FUNCTION
+  void join (volatile value_type &dst, const volatile value_type &src) const {
+    if (dst < src) dst = src;
+  }
+
+  // every partial result starts at the smallest int
+  KOKKOS_INLINE_FUNCTION
+  void init (value_type &dst) const { dst = INT_MIN; }
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/neigh_bond_kokkos.cpp b/src/KOKKOS/neigh_bond_kokkos.cpp
index a8c230fa59..a674e7cec4 100644
--- a/src/KOKKOS/neigh_bond_kokkos.cpp
+++ b/src/KOKKOS/neigh_bond_kokkos.cpp
@@ -274,7 +274,6 @@ void NeighBondKokkos<DeviceType>::bond_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondBondAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -370,7 +369,6 @@ void NeighBondKokkos<DeviceType>::bond_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondBondPartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -443,7 +441,6 @@ void NeighBondKokkos<DeviceType>::bond_check()
   k_bondlist.sync<DeviceType>();
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondBondCheck>(0,neighbor->nbondlist),*this,flag);
-  DeviceType::fence();
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@@ -494,7 +491,6 @@ void NeighBondKokkos<DeviceType>::angle_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondAngleAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -597,7 +593,6 @@ void NeighBondKokkos<DeviceType>::angle_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondAnglePartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -678,7 +673,6 @@ void NeighBondKokkos<DeviceType>::angle_check()
   k_anglelist.sync<DeviceType>();
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondAngleCheck>(0,neighbor->nanglelist),*this,flag);
-  DeviceType::fence();
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@@ -741,7 +735,6 @@ void NeighBondKokkos<DeviceType>::dihedral_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondDihedralAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -849,7 +842,6 @@ void NeighBondKokkos<DeviceType>::dihedral_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondDihedralPartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -935,7 +927,6 @@ void NeighBondKokkos<DeviceType>::dihedral_check(int nlist, typename AT::t_int_2
   k_dihedrallist.sync<DeviceType>();
 
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondDihedralCheck>(0,nlist),*this,flag);
-  DeviceType::fence();
 
   int flag_all;
   MPI_Allreduce(&flag,&flag_all,1,MPI_INT,MPI_SUM,world);
@@ -1015,7 +1006,6 @@ void NeighBondKokkos<DeviceType>::improper_all()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondImproperAll>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
@@ -1123,7 +1113,6 @@ void NeighBondKokkos<DeviceType>::improper_partial()
     k_fail_flag.template sync<DeviceType>();
 
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighBondImproperPartial>(0,nlocal),*this,nmissing);
-    DeviceType::fence();
 
     k_nlist.template modify<DeviceType>();
     k_nlist.template sync<LMPHostType>();
diff --git a/src/KOKKOS/neigh_list_kokkos.h b/src/KOKKOS/neigh_list_kokkos.h
index c887bd13b7..1c433f321c 100644
--- a/src/KOKKOS/neigh_list_kokkos.h
+++ b/src/KOKKOS/neigh_list_kokkos.h
@@ -48,7 +48,7 @@ class AtomNeighborsConst
   const int num_neighs;
 
   KOKKOS_INLINE_FUNCTION
-  AtomNeighborsConst(int* const & firstneigh, const int & _num_neighs,
+  AtomNeighborsConst(const int* const & firstneigh, const int & _num_neighs,
                      const int & stride):
   _firstneigh(firstneigh), num_neighs(_num_neighs), _stride(stride) {};
   KOKKOS_INLINE_FUNCTION
@@ -82,6 +82,14 @@ public:
                          &d_neighbors(i,1)-&d_neighbors(i,0));
   }
 
+  KOKKOS_INLINE_FUNCTION // static counterpart of get_neighbors_const(): works on caller-supplied views
+  static AtomNeighborsConst static_neighbors_const(int i,
+           typename ArrayTypes<Device>::t_neighbors_2d_const const& d_neighbors,
+           typename ArrayTypes<Device>::t_int_1d_const const& d_numneigh) {
+    const int* const row = &d_neighbors(i,0); // first neighbor slot of atom i
+    return AtomNeighborsConst(row, d_numneigh(i), &d_neighbors(i,1) - row);
+  }
+
   KOKKOS_INLINE_FUNCTION
   AtomNeighborsConst get_neighbors_const(const int &i) const {
     return AtomNeighborsConst(&d_neighbors(i,0),d_numneigh(i),
diff --git a/src/KOKKOS/neighbor_kokkos.cpp b/src/KOKKOS/neighbor_kokkos.cpp
index 8eda7ee55c..9a40808052 100644
--- a/src/KOKKOS/neighbor_kokkos.cpp
+++ b/src/KOKKOS/neighbor_kokkos.cpp
@@ -206,7 +206,6 @@ int NeighborKokkos::check_distance_kokkos()
   int flag = 0;
   copymode = 1;
   Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagNeighborCheckDistance<DeviceType> >(0,nlocal),*this,flag);
-  DeviceType::fence();
   copymode = 0;
 
   int flagall;
@@ -273,7 +272,6 @@ void NeighborKokkos::build_kokkos(int topoflag)
     }
     copymode = 1;
     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagNeighborXhold<DeviceType> >(0,nlocal),*this);
-    DeviceType::fence();
     copymode = 0;
     xhold.modify<DeviceType>();
     if (boxcheck) {
diff --git a/src/KOKKOS/npair_kokkos.cpp b/src/KOKKOS/npair_kokkos.cpp
index fd5f9373f2..b568bd5c93 100644
--- a/src/KOKKOS/npair_kokkos.cpp
+++ b/src/KOKKOS/npair_kokkos.cpp
@@ -173,19 +173,13 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
   data.special_flag[2] = special_flag[2];
   data.special_flag[3] = special_flag[3];
 
-  if(list->d_neighbors.dimension_0()<nall) {
-    list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", nall*1.1, list->maxneighs);
-    list->d_numneigh = typename ArrayTypes<DeviceType>::t_int_1d("numneigh", nall*1.1);
-    data.neigh_list.d_neighbors = list->d_neighbors;
-    data.neigh_list.d_numneigh = list->d_numneigh;
-  }
   data.h_resize()=1;
   while(data.h_resize()) {
     data.h_new_maxneighs() = list->maxneighs;
-  data.h_resize() = 0;
+    data.h_resize() = 0;
 
-  Kokkos::deep_copy(data.resize, data.h_resize);
-  Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
+    Kokkos::deep_copy(data.resize, data.h_resize);
+    Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
 #ifdef KOKKOS_HAVE_CUDA
     #define BINS_PER_BLOCK 2
     const int factor = atoms_per_bin<64?2:1;
@@ -194,33 +188,32 @@ void NPairKokkos<DeviceType,HALF_NEIGH,GHOST,TRI>::build(NeighList *list_)
     const int factor = 1;
 #endif
 
-if (GHOST) {
-  NPairKokkosBuildFunctorGhost<DeviceType,HALF_NEIGH> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
-  Kokkos::parallel_for(nall, f);
-} else {
-  if (newton_pair) {
-    NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
-#ifdef KOKKOS_HAVE_CUDA
-    if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
-      Kokkos::parallel_for(config, f);
-    else
+    if (GHOST) {
+      NPairKokkosBuildFunctorGhost<DeviceType,HALF_NEIGH> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
       Kokkos::parallel_for(nall, f);
-#else
-    Kokkos::parallel_for(nall, f);
-#endif
-  } else {
-    NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+    } else {
+      if (newton_pair) {
+        NPairKokkosBuildFunctor<DeviceType,TRI?0:HALF_NEIGH,1,TRI> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
 #ifdef KOKKOS_HAVE_CUDA
-    if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
-      Kokkos::parallel_for(config, f);
-    else
-      Kokkos::parallel_for(nall, f);
+        if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
+          Kokkos::parallel_for(config, f);
+        else
+          Kokkos::parallel_for(nall, f);
 #else
-    Kokkos::parallel_for(nall, f);
+        Kokkos::parallel_for(nall, f);
 #endif
-  }
-}
-  DeviceType::fence();
+      } else {
+        NPairKokkosBuildFunctor<DeviceType,HALF_NEIGH,0,0> f(data,atoms_per_bin * 5 * sizeof(X_FLOAT) * factor);
+#ifdef KOKKOS_HAVE_CUDA
+        if (ExecutionSpaceFromDevice<DeviceType>::space == Device)
+          Kokkos::parallel_for(config, f);
+        else
+          Kokkos::parallel_for(nall, f);
+#else
+        Kokkos::parallel_for(nall, f);
+#endif
+      }
+    }
     deep_copy(data.h_resize, data.resize);
 
     if(data.h_resize()) {
@@ -435,10 +428,10 @@ void NeighborKokkosExecute<DeviceType>::
 
   neigh_list.d_numneigh(i) = n;
 
-  if(n >= neigh_list.maxneighs) {
+  if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n >= new_maxneighs()) new_maxneighs() = n;
+    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
   }
 
   neigh_list.d_ilist(i) = i;
@@ -645,10 +638,10 @@ void NeighborKokkosExecute<DeviceType>::build_ItemCuda(typename Kokkos::TeamPoli
     neigh_list.d_ilist(i) = i;
   }
 
-  if(n >= neigh_list.maxneighs) {
+  if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n >= new_maxneighs()) new_maxneighs() = n;
+    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
   }
   }
 }
@@ -737,9 +730,9 @@ void NeighborKokkosExecute<DeviceType>::
     const int ybin = binxyz[1];
     const int zbin = binxyz[2];
     for (int k = 0; k < nstencil; k++) {
-      const X_FLOAT xbin2 = xbin + stencilxyz(k,0);
-      const X_FLOAT ybin2 = ybin + stencilxyz(k,1);
-      const X_FLOAT zbin2 = zbin + stencilxyz(k,2);
+      const int xbin2 = xbin + stencilxyz(k,0);
+      const int ybin2 = ybin + stencilxyz(k,1);
+      const int zbin2 = zbin + stencilxyz(k,2);
       if (xbin2 < 0 || xbin2 >= mbinx ||
           ybin2 < 0 || ybin2 >= mbiny ||
           zbin2 < 0 || zbin2 >= mbinz) continue;
@@ -768,10 +761,10 @@ void NeighborKokkosExecute<DeviceType>::
 
   neigh_list.d_numneigh(i) = n;
 
-  if(n >= neigh_list.maxneighs) {
+  if(n > neigh_list.maxneighs) {
     resize() = 1;
 
-    if(n >= new_maxneighs()) new_maxneighs() = n;
+    if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
   }
   neigh_list.d_ilist(i) = i;
 }
diff --git a/src/KOKKOS/npair_kokkos.h b/src/KOKKOS/npair_kokkos.h
index a28b5ff978..517ea546fa 100644
--- a/src/KOKKOS/npair_kokkos.h
+++ b/src/KOKKOS/npair_kokkos.h
@@ -281,9 +281,6 @@ class NeighborKokkosExecute
   void build_ItemCuda(typename Kokkos::TeamPolicy<DeviceType>::member_type dev) const;
 #endif
 
-  KOKKOS_INLINE_FUNCTION
-  void binatomsItem(const int &i) const;
-
   KOKKOS_INLINE_FUNCTION
   int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z) const
   {
diff --git a/src/KOKKOS/npair_ssa_kokkos.cpp b/src/KOKKOS/npair_ssa_kokkos.cpp
new file mode 100644
index 0000000000..b73e54e33f
--- /dev/null
+++ b/src/KOKKOS/npair_ssa_kokkos.cpp
@@ -0,0 +1,750 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors:
+   James Larentzos and Timothy I. Mattox (Engility Corporation)
+------------------------------------------------------------------------- */
+
+#include "npair_ssa_kokkos.h"
+#include "neigh_list.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "domain_kokkos.h"
+#include "neighbor_kokkos.h"
+#include "nbin_ssa_kokkos.h"
+#include "nstencil_ssa.h"
+#include "error.h"
+#include "comm.h"
+
+namespace LAMMPS_NS {
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+NPairSSAKokkos<DeviceType>::NPairSSAKokkos(LAMMPS *lmp) : NPair(lmp), ssa_phaseCt(27), ssa_gphaseCt(7)
+{
+  const int gitemCols = 1; //FIXME make this 4 eventually
+  k_ssa_gphaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_gphaseLen",ssa_gphaseCt);
+  k_ssa_gitemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLoc",ssa_gphaseCt,gitemCols);
+  k_ssa_gitemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_gitemLen",ssa_gphaseCt,gitemCols);
+  // cache the DeviceType views of the ghost work-plan arrays allocated above
+  ssa_gphaseLen = k_ssa_gphaseLen.view<DeviceType>();
+  ssa_gitemLoc = k_ssa_gitemLoc.view<DeviceType>();
+  ssa_gitemLen = k_ssa_gitemLen.view<DeviceType>();
+}
+
+/* ----------------------------------------------------------------------
+   copy needed info from Neighbor class to this build class
+   ------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::copy_neighbor_info()
+{
+  // let the base class copy its share first
+  NPair::copy_neighbor_info();
+
+  // the k_* members copied here are read through a NeighborKokkos pointer
+  NeighborKokkos *nkk = (NeighborKokkos *) neighbor;
+
+  // general params
+  k_cutneighsq = nkk->k_cutneighsq;
+
+  // exclusion info: by type, by group, by molecule
+  k_ex1_type = nkk->k_ex1_type;
+  k_ex2_type = nkk->k_ex2_type;
+  k_ex_type = nkk->k_ex_type;
+  k_ex1_group = nkk->k_ex1_group;
+  k_ex2_group = nkk->k_ex2_group;
+  k_ex1_bit = nkk->k_ex1_bit;
+  k_ex2_bit = nkk->k_ex2_bit;
+  k_ex_mol_group = nkk->k_ex_mol_group;
+  k_ex_mol_bit = nkk->k_ex_mol_bit;
+}
+
+/* ----------------------------------------------------------------------
+ copy per-atom and per-bin vectors from NBinSSAKokkos class to this build class
+ ------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::copy_bin_info()
+{
+  NPair::copy_bin_info();
+
+  // this builder only works with the SSA-specific Kokkos binning class
+  auto *ssaBin = dynamic_cast<NBinSSAKokkos<DeviceType>*>(nb);
+  if (!ssaBin) error->one(FLERR, "NBin wasn't a NBinSSAKokkos object");
+
+  // bins of local atoms
+  atoms_per_bin = ssaBin->atoms_per_bin;
+  k_bincount = ssaBin->k_bincount;
+  k_bins = ssaBin->k_bins;
+  // bins of ghost atoms
+  ghosts_per_gbin = ssaBin->ghosts_per_gbin;
+  k_gbincount = ssaBin->k_gbincount;
+  k_gbins = ssaBin->k_gbins;
+
+  // host copies of the bounds of the local atoms in the bins array
+  lbinxlo = ssaBin->h_lbinxlo();  lbinxhi = ssaBin->h_lbinxhi();
+  lbinylo = ssaBin->h_lbinylo();  lbinyhi = ssaBin->h_lbinyhi();
+  lbinzlo = ssaBin->h_lbinzlo();  lbinzhi = ssaBin->h_lbinzhi();
+}
+
+/* ----------------------------------------------------------------------
+ copy needed info from NStencil class to this build class
+ ------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::copy_stencil_info()
+{
+  NPair::copy_stencil_info(); // base class copies its share first
+
+  nstencil = ns->nstencil;
+
+  int maxstencil = ns->get_maxstencil();
+  // copy the stencil offsets into freshly allocated dual views and sync them to DeviceType
+  k_stencil = DAT::tdual_int_1d("NPairSSAKokkos:stencil",maxstencil);
+  for (int k = 0; k < maxstencil; k++) {
+    k_stencil.h_view(k) = ns->stencil[k];
+  }
+  k_stencil.modify<LMPHostType>();
+  k_stencil.sync<DeviceType>();
+  k_stencilxyz = DAT::tdual_int_1d_3("NPairSSAKokkos:stencilxyz",maxstencil);
+  for (int k = 0; k < maxstencil; k++) {
+    k_stencilxyz.h_view(k,0) = ns->stencilxyz[k][0];
+    k_stencilxyz.h_view(k,1) = ns->stencilxyz[k][1];
+    k_stencilxyz.h_view(k,2) = ns->stencilxyz[k][2];
+  }
+  k_stencilxyz.modify<LMPHostType>();
+  k_stencilxyz.sync<DeviceType>();
+
+  NStencilSSA *ns_ssa = dynamic_cast<NStencilSSA*>(ns); // the SSA-specific fields below need the derived type
+  if (!ns_ssa) error->one(FLERR, "NStencil wasn't a NStencilSSA object");
+  // copy the 5 entries of nstencil_ssa; build() uses consecutive entries as [start,end) ranges into the stencil
+  k_nstencil_ssa = DAT::tdual_int_1d("NPairSSAKokkos:nstencil_ssa",5);
+  for (int k = 0; k < 5; ++k) {
+    k_nstencil_ssa.h_view(k) = ns_ssa->nstencil_ssa[k];
+  }
+  k_nstencil_ssa.modify<LMPHostType>();
+  k_nstencil_ssa.sync<DeviceType>();
+  sx1 = ns_ssa->sx + 1; // stencil extents plus one: the bin stride used per phase in build()
+  sy1 = ns_ssa->sy + 1;
+  sz1 = ns_ssa->sz + 1;
+
+  // Setup the phases of the workplan for locals
+  ssa_phaseCt = sz1*sy1*sx1; // one phase per (xoff,yoff,zoff) offset combination
+  if (ssa_phaseCt > (int) k_ssa_phaseLen.dimension_0()) { // grow only; both arrays are sized together
+    k_ssa_phaseLen = DAT::tdual_int_1d("NPairSSAKokkos:ssa_phaseLen",ssa_phaseCt);
+    ssa_phaseLen = k_ssa_phaseLen.view<DeviceType>();
+    k_ssa_phaseOff = DAT::tdual_int_1d_3("NPairSSAKokkos:ssa_phaseOff",ssa_phaseCt);
+    ssa_phaseOff = k_ssa_phaseOff.view<DeviceType>();
+  }
+  auto h_ssa_phaseOff = k_ssa_phaseOff.h_view;
+  k_ssa_phaseOff.sync<LMPHostType>();
+  int workPhase = 0;
+  for (int zoff = sz1 - 1; zoff >= 0; --zoff) { // offsets are enumerated from highest to lowest, x fastest
+    for (int yoff = sy1 - 1; yoff >= 0; --yoff) {
+      for (int xoff = sx1 - 1; xoff >= 0; --xoff) {
+        h_ssa_phaseOff(workPhase, 0) = xoff;
+        h_ssa_phaseOff(workPhase, 1) = yoff;
+        h_ssa_phaseOff(workPhase, 2) = zoff;
+        workPhase++;
+      }
+    }
+  }
+  k_ssa_phaseOff.modify<LMPHostType>(); // host copy was just written; push it to DeviceType
+  k_ssa_phaseOff.sync<DeviceType>();
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Classify pair (i,j) against atom i's special-neighbor list.
+// Scans the first nspecial(i,2) entries of special(i,*) for tag(j); a hit at
+// position k is class 1 if k < nspecial(i,0), class 2 if k < nspecial(i,1),
+// and class 3 otherwise. For the matching class c the result is:
+//   -1     if special_flag[c] == 0
+//    0     if special_flag[c] == 1
+//    c     for any other flag value
+// Returns 0 when tag(j) does not appear in the list.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int NPairSSAKokkosExecute<DeviceType>::find_special(const int &i, const int &j) const
+{
+  const int end12 = nspecial(i,0);
+  const int end13 = nspecial(i,1);
+  const int end14 = nspecial(i,2);
+
+  for (int pos = 0; pos < end14; pos++) {
+    if (special(i,pos) != tag(j)) continue;
+
+    // first hit decides the answer
+    const int which = (pos < end12) ? 1 : ((pos < end13) ? 2 : 3);
+    if (special_flag[which] == 0) return -1;
+    if (special_flag[which] == 1) return 0;
+    return which;
+  }
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Decide whether pair (i,j) must be left out of the neighbor list.
+// Returns 1 on the first matching type, group or molecule exclusion, else 0.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int NPairSSAKokkosExecute<DeviceType>::exclusion(const int &i,const int &j,
+                                             const int &itype,const int &jtype) const
+{
+  // type-based exclusion table, consulted only when nex_type is nonzero
+  if (nex_type && ex_type(itype,jtype)) return 1;
+
+  // group-bit exclusions: i and j may match (ex1,ex2) in either order
+  for (int g = 0; g < nex_group; g++) {
+    if (((mask(i) & ex1_bit(g)) && (mask(j) & ex2_bit(g))) ||
+        ((mask(i) & ex2_bit(g)) && (mask(j) & ex1_bit(g)))) return 1;
+  }
+
+  // molecule exclusions: both atoms carry the bit and share a molecule id
+  for (int g = 0; g < nex_mol; g++) {
+    const bool both_flagged = (mask(i) & ex_mol_bit(g)) && (mask(j) & ex_mol_bit(g));
+    if (both_flagged && molecule(i) == molecule(j)) return 1;
+  }
+
+  return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+
+/* ----------------------------------------------------------------------
+   binned neighbor list construction with full Newton's 3rd law
+   for use by Shardlow Splitting Algorithm
+   each owned atom i checks its own bin and other bins in Newton stencil
+   every pair stored exactly once by some processor
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void NPairSSAKokkos<DeviceType>::build(NeighList *list_)
+{
+  NeighListKokkos<DeviceType>* list = (NeighListKokkos<DeviceType>*) list_;
+  const int nlocal = includegroup?atom->nfirst:atom->nlocal;
+  int nl_size;
+
+  int xbinCt = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
+  int ybinCt = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
+  int zbinCt = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
+  int phaseLenEstimate = xbinCt*ybinCt*zbinCt;
+
+  if ((ssa_phaseCt > (int) k_ssa_itemLoc.dimension_0()) ||
+      (phaseLenEstimate > (int) k_ssa_itemLoc.dimension_1())) {
+    k_ssa_itemLoc = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLoc",ssa_phaseCt,phaseLenEstimate);
+    ssa_itemLoc = k_ssa_itemLoc.view<DeviceType>();
+    k_ssa_itemLen = DAT::tdual_int_2d("NPairSSAKokkos::ssa_itemLen",ssa_phaseCt,phaseLenEstimate);
+    ssa_itemLen = k_ssa_itemLen.view<DeviceType>();
+  }
+
+  k_ssa_itemLoc.sync<LMPHostType>();
+  k_ssa_itemLen.sync<LMPHostType>();
+  k_ssa_gitemLoc.sync<LMPHostType>();
+  k_ssa_gitemLen.sync<LMPHostType>();
+  k_ssa_phaseOff.sync<LMPHostType>();
+  k_ssa_phaseLen.sync<LMPHostType>();
+  auto h_ssa_itemLoc = k_ssa_itemLoc.h_view;
+  auto h_ssa_itemLen = k_ssa_itemLen.h_view;
+  auto h_ssa_gitemLoc = k_ssa_gitemLoc.h_view;
+  auto h_ssa_gitemLen = k_ssa_gitemLen.h_view;
+  auto h_ssa_phaseOff = k_ssa_phaseOff.h_view;
+  auto h_ssa_phaseLen = k_ssa_phaseLen.h_view;
+
+{ // Preflight the neighbor list workplan
+  k_bincount.sync<LMPHostType>();
+  auto h_bincount = k_bincount.h_view;
+  k_stencil.sync<LMPHostType>();
+  auto h_stencil = k_stencil.h_view;
+  k_nstencil_ssa.sync<LMPHostType>();
+  auto h_nstencil_ssa = k_nstencil_ssa.h_view;
+  int inum = 0;
+
+  // loop over bins with local atoms, counting half of the neighbors
+  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int zoff = h_ssa_phaseOff(workPhase, 2);
+    int yoff = h_ssa_phaseOff(workPhase, 1);
+    int xoff = h_ssa_phaseOff(workPhase, 0);
+    int workItem = 0;
+  for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
+  for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
+  for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
+    int inum_start = inum;
+//    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
+
+    for (int subphase = 0; subphase < 4; subphase++) {
+      int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
+      int s_xbin = xbin + ((subphase & 0x1) ? sx1 - 1 : 0);
+      if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
+      if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
+
+      const int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
+      const int ibinCt = h_bincount(ibin);
+      if (ibinCt > 0) {
+        int base_n = 0;
+        bool include_same = false;
+        // count all local atoms in the current stencil "subphase" as potential neighbors
+        for (int k = h_nstencil_ssa(subphase); k < h_nstencil_ssa(subphase+1); k++) {
+          const int jbin = ibin+h_stencil(k);
+          if (jbin != ibin) base_n += h_bincount(jbin);
+          else include_same = true;
+        }
+        // Calculate how many ibin particles would have had some neighbors
+        if (base_n > 0) inum += ibinCt;
+        else if (include_same) inum += ibinCt - 1;
+      }
+    }
+    h_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
+    h_ssa_itemLen(workPhase,workItem) = inum - inum_start; // record workItem length
+#ifdef DEBUG_SSA_BUILD_LOCALS
+if (h_ssa_itemLen(workPhase,workItem) < 0) fprintf(stdout, "undr%03d phase (%3d,%3d) inum %d - inum_start %d UNDERFLOW\n"
+  ,comm->me
+  ,workPhase
+  ,workItem
+  ,inum
+  ,inum_start
+);
+#endif
+    workItem++;
+  }
+  }
+  }
+
+#ifdef DEBUG_SSA_BUILD_LOCALS
+fprintf(stdout, "phas%03d phase %3d could use %6d inums, expected %6d inums. maxworkItems = %3d, inums/workItems = %g\n"
+  ,comm->me
+  ,workPhase
+  ,inum - h_ssa_itemLoc(workPhase, 0)
+  ,(nlocal*4 + ssa_phaseCt - 1) / ssa_phaseCt
+  ,workItem
+  ,(inum - h_ssa_itemLoc(workPhase, 0)) / (double) workItem
+);
+#endif
+    // record where workPhase ends
+    h_ssa_phaseLen(workPhase) = workItem;
+  }
+#ifdef DEBUG_SSA_BUILD_LOCALS
+fprintf(stdout, "tota%03d total %3d could use %6d inums, expected %6d inums. inums/phase = %g\n"
+  ,comm->me
+  ,workPhase
+  ,inum
+  ,nlocal*4
+  ,inum / (double) workPhase
+);
+#endif
+  nl_size = inum; // record how much space is needed for the local work plan
+}
+
+  // count how many ghosts might have neighbors, and increase the work plan storage
+  k_gbincount.sync<LMPHostType>();
+  for (int workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) {
+    int len = k_gbincount.h_view(workPhase + 1);
+    h_ssa_gitemLoc(workPhase,0) = nl_size; // record where workItem starts in ilist
+    h_ssa_gitemLen(workPhase,0) = len;
+    nl_size += len;
+  }
+  list->grow(nl_size); // Make special larger SSA neighbor list
+
+  k_ssa_itemLoc.modify<LMPHostType>();
+  k_ssa_itemLen.modify<LMPHostType>();
+  k_ssa_gitemLoc.modify<LMPHostType>();
+  k_ssa_gitemLen.modify<LMPHostType>();
+  k_ssa_phaseLen.modify<LMPHostType>();
+  k_ssa_itemLoc.sync<DeviceType>();
+  k_ssa_itemLen.sync<DeviceType>();
+  k_ssa_gitemLen.sync<DeviceType>();
+  k_ssa_gitemLoc.sync<DeviceType>();
+  k_ssa_phaseOff.sync<DeviceType>();
+  k_ssa_phaseLen.sync<DeviceType>();
+  k_ssa_gphaseLen.sync<DeviceType>();
+
+  NPairSSAKokkosExecute<DeviceType>
+    data(*list,
+         k_cutneighsq.view<DeviceType>(),
+         k_bincount.view<DeviceType>(),
+         k_bins.view<DeviceType>(),
+         k_gbincount.view<DeviceType>(),
+         k_gbins.view<DeviceType>(),
+         lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi,
+         nstencil, sx1, sy1, sz1,
+         k_stencil.view<DeviceType>(),
+         k_stencilxyz.view<DeviceType>(),
+         k_nstencil_ssa.view<DeviceType>(),
+         ssa_phaseCt,
+         k_ssa_phaseLen.view<DeviceType>(),
+         k_ssa_phaseOff.view<DeviceType>(),
+         k_ssa_itemLoc.view<DeviceType>(),
+         k_ssa_itemLen.view<DeviceType>(),
+         ssa_gphaseCt,
+         k_ssa_gphaseLen.view<DeviceType>(),
+         k_ssa_gitemLoc.view<DeviceType>(),
+         k_ssa_gitemLen.view<DeviceType>(),
+         nlocal,
+         atomKK->k_x.view<DeviceType>(),
+         atomKK->k_type.view<DeviceType>(),
+         atomKK->k_mask.view<DeviceType>(),
+         atomKK->k_molecule.view<DeviceType>(),
+         atomKK->k_tag.view<DeviceType>(),
+         atomKK->k_special.view<DeviceType>(),
+         atomKK->k_nspecial.view<DeviceType>(),
+         atomKK->molecular,
+         nbinx,nbiny,nbinz,mbinx,mbiny,mbinz,mbinxlo,mbinylo,mbinzlo,
+         bininvx,bininvy,bininvz,
+         exclude, nex_type,
+         k_ex1_type.view<DeviceType>(),
+         k_ex2_type.view<DeviceType>(),
+         k_ex_type.view<DeviceType>(),
+         nex_group,
+         k_ex1_group.view<DeviceType>(),
+         k_ex2_group.view<DeviceType>(),
+         k_ex1_bit.view<DeviceType>(),
+         k_ex2_bit.view<DeviceType>(),
+         nex_mol,
+         k_ex_mol_group.view<DeviceType>(),
+         k_ex_mol_bit.view<DeviceType>(),
+         bboxhi,bboxlo,
+         domain->xperiodic,domain->yperiodic,domain->zperiodic,
+         domain->xprd_half,domain->yprd_half,domain->zprd_half);
+
+  k_cutneighsq.sync<DeviceType>();
+  k_ex1_type.sync<DeviceType>();
+  k_ex2_type.sync<DeviceType>();
+  k_ex_type.sync<DeviceType>();
+  k_ex1_group.sync<DeviceType>();
+  k_ex2_group.sync<DeviceType>();
+  k_ex1_bit.sync<DeviceType>();
+  k_ex2_bit.sync<DeviceType>();
+  k_ex_mol_group.sync<DeviceType>();
+  k_ex_mol_bit.sync<DeviceType>();
+  k_bincount.sync<DeviceType>();
+  k_bins.sync<DeviceType>();
+  k_gbincount.sync<DeviceType>();
+  k_gbins.sync<DeviceType>();
+  atomKK->sync(Device,X_MASK|TYPE_MASK|MASK_MASK|MOLECULE_MASK|TAG_MASK|SPECIAL_MASK);
+
+  data.special_flag[0] = special_flag[0];
+  data.special_flag[1] = special_flag[1];
+  data.special_flag[2] = special_flag[2];
+  data.special_flag[3] = special_flag[3];
+
+  bool firstTry = true;
+  data.h_resize()=1;
+  while(data.h_resize()) {
+    data.h_new_maxneighs() = list->maxneighs;
+    data.h_resize() = 0;
+
+    Kokkos::deep_copy(data.resize, data.h_resize);
+    Kokkos::deep_copy(data.new_maxneighs, data.h_new_maxneighs);
+
+    // loop over bins with local atoms, storing half of the neighbors
+    Kokkos::parallel_for(ssa_phaseCt, LAMMPS_LAMBDA (const int workPhase) {
+      data.build_locals_onePhase(firstTry, comm->me, workPhase);
+    });
+    k_ssa_itemLoc.modify<DeviceType>();
+    k_ssa_itemLen.modify<DeviceType>();
+    k_ssa_phaseLen.modify<DeviceType>();
+    k_ssa_itemLoc.sync<LMPHostType>();
+    k_ssa_itemLen.sync<LMPHostType>();
+    k_ssa_phaseLen.sync<LMPHostType>();
+    data.neigh_list.inum = h_ssa_itemLoc(ssa_phaseCt-1,h_ssa_phaseLen(ssa_phaseCt-1)-1) +
+      h_ssa_itemLen(ssa_phaseCt-1,h_ssa_phaseLen(ssa_phaseCt-1)-1);
+
+    // loop over AIR ghost atoms, storing their local neighbors
+    Kokkos::parallel_for(ssa_gphaseCt, LAMMPS_LAMBDA (const int workPhase) {
+      data.build_ghosts_onePhase(workPhase);
+    });
+    k_ssa_gitemLoc.modify<DeviceType>();
+    k_ssa_gitemLen.modify<DeviceType>();
+    k_ssa_gphaseLen.modify<DeviceType>();
+    k_ssa_gitemLoc.sync<LMPHostType>();
+    k_ssa_gitemLen.sync<LMPHostType>();
+    k_ssa_gphaseLen.sync<LMPHostType>();
+    auto h_ssa_gphaseLen = k_ssa_gphaseLen.h_view;
+    data.neigh_list.gnum = h_ssa_gitemLoc(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) +
+      h_ssa_gitemLen(ssa_gphaseCt-1,h_ssa_gphaseLen(ssa_gphaseCt-1)-1) - data.neigh_list.inum;
+    firstTry = false;
+
+    deep_copy(data.h_resize, data.resize);
+
+    if(data.h_resize()) {
+      deep_copy(data.h_new_maxneighs, data.new_maxneighs);
+      list->maxneighs = data.h_new_maxneighs() * 1.2;
+      list->d_neighbors = typename ArrayTypes<DeviceType>::t_neighbors_2d("neighbors", list->d_neighbors.dimension_0(), list->maxneighs);
+      data.neigh_list.d_neighbors = list->d_neighbors;
+      data.neigh_list.maxneighs = list->maxneighs;
+    }
+  }
+
+  //k_ssa_phaseLen.modify<DeviceType>();
+  //k_ssa_itemLoc.modify<DeviceType>();
+  //k_ssa_itemLen.modify<DeviceType>();
+  //k_ssa_gphaseLen.modify<DeviceType>();
+  //k_ssa_gitemLoc.modify<DeviceType>();
+  //k_ssa_gitemLen.modify<DeviceType>();
+
+  list->inum = data.neigh_list.inum; //FIXME once the above is in a parallel_for
+  list->gnum = data.neigh_list.gnum; // it will need a deep_copy or something
+
+#ifdef DEBUG_SSA_BUILD_LOCALS
+fprintf(stdout, "Fina%03d %6d inum %6d gnum, total used %6d, allocated %6d\n"
+  ,comm->me
+  ,list->inum
+  ,list->gnum
+  ,list->inum + list->gnum
+  ,nl_size
+);
+#endif
+
+  list->k_ilist.template modify<DeviceType>();
+}
+
+
+// Build the neighbor-list rows for the local atoms of one SSA work phase.
+//
+// Walks the bins belonging to this phase (start offsets come from
+// d_ssa_phaseOff, strides are sx1/sy1/sz1).  Each (zbin,ybin,xbin) step is
+// one work item covering up to four "subphase" bins; every atom that ends
+// up with at least one neighbor is appended to d_ilist, starting at the
+// position recorded for the item in d_ssa_itemLoc.
+//
+//   firstTry   true on the first pass: items that produce no entries are
+//              dropped, surviving items are compacted to the front of
+//              d_ssa_itemLoc/d_ssa_itemLen, d_ssa_phaseLen is recorded and
+//              the unused tail of d_ssa_itemLen is zeroed.  On later passes
+//              the already compacted plan is reused.
+//   me         only used by the DEBUG_SSA_BUILD_LOCALS trace output
+//   workPhase  index of the phase to process
+//
+// Neighbors beyond neigh_list.maxneighs are counted but not stored; in that
+// case resize() is set and new_maxneighs() is raised to the needed size.
+template<class DeviceType>
+void NPairSSAKokkosExecute<DeviceType>::build_locals_onePhase(const bool firstTry, int me, int workPhase) const
+{
+  const typename ArrayTypes<DeviceType>::t_int_1d_const_um stencil = d_stencil;
+  int which = 0; // special-bond code; only reassigned when !moltemplate
+
+  int zoff = d_ssa_phaseOff(workPhase, 2);
+  int yoff = d_ssa_phaseOff(workPhase, 1);
+  int xoff = d_ssa_phaseOff(workPhase, 0);
+  int workItem = 0;      // next slot in the compacted plan
+  int skippedItems = 0;  // items dropped so far (first pass only)
+#ifdef DEBUG_SSA_BUILD_LOCALS
+  // End of the ilist range written so far by this phase.  Kept at function
+  // scope because the per-item "inum" below is local to the loop body and
+  // is not visible to the summary trace after the loops.
+  int inum_end = d_ssa_itemLoc(workPhase, 0);
+#endif
+  for (int zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {
+  for (int ybin = lbinylo + yoff - sy1 + 1; ybin < lbinyhi; ybin += sy1) {
+  for (int xbin = lbinxlo + xoff - sx1 + 1; xbin < lbinxhi; xbin += sx1) {
+    // a zero planned length means there is nothing to do for this item
+    if (d_ssa_itemLen(workPhase, workItem + skippedItems) == 0) {
+      if (firstTry) ++skippedItems;
+      else ++workItem; // phase is done,should break out of three loops here if we could...
+      continue;
+    }
+    int inum_start = d_ssa_itemLoc(workPhase, workItem + skippedItems);
+    int inum = inum_start;
+
+    for (int subphase = 0; subphase < 4; subphase++) {
+      int s_ybin = ybin + ((subphase & 0x2) ? sy1 - 1 : 0);
+      int s_xbin = xbin + ((subphase & 0x1) ? sx1 - 1 : 0);
+      // ignore subphase bins that fall outside the local bin extent
+      if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
+      if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
+
+      int ibin = zbin*mbiny*mbinx + s_ybin*mbinx + s_xbin;
+      for (int il = 0; il < c_bincount(ibin); ++il) {
+        const int i = c_bins(ibin, il);
+        int n = 0; // neighbors found for i, may exceed maxneighs
+
+        const AtomNeighbors neighbors_i = neigh_list.get_neighbors(inum);
+        const X_FLOAT xtmp = x(i, 0);
+        const X_FLOAT ytmp = x(i, 1);
+        const X_FLOAT ztmp = x(i, 2);
+        const int itype = type(i);
+
+        // loop over all local atoms in the current stencil "subphase"
+        for (int k = d_nstencil_ssa(subphase); k < d_nstencil_ssa(subphase+1); k++) {
+          const int jbin = ibin+stencil(k);
+          int jl;
+          if (jbin != ibin) jl = 0;
+          else jl = il + 1; // same bin as i, so start just past i in the bin
+          for (; jl < c_bincount(jbin); ++jl) {
+            const int j = c_bins(jbin, jl);
+            const int jtype = type(j);
+            if(exclude && exclusion(i,j,itype,jtype)) continue;
+
+            const X_FLOAT delx = xtmp - x(j, 0);
+            const X_FLOAT dely = ytmp - x(j, 1);
+            const X_FLOAT delz = ztmp - x(j, 2);
+            const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+            if(rsq <= cutneighsq(itype,jtype)) {
+              if (molecular) {
+                if (!moltemplate)
+                  which = find_special(i,j);
+                    /* else if (imol >= 0) */
+                    /*   which = find_special(onemols[imol]->special[iatom], */
+                    /*                        onemols[imol]->nspecial[iatom], */
+                    /*                        tag[j]-tagprev); */
+                    /* else which = 0; */
+                if (which == 0){
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                  else n++;
+                }else if (minimum_image_check(delx,dely,delz)){
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                  else n++;
+                }
+                else if (which > 0) {
+                  // tag the stored index with the special-bond code
+                  if(n<neigh_list.maxneighs) neighbors_i(n++) = j ^ (which << SBBITS);
+                  else n++;
+                }
+              } else {
+                if(n<neigh_list.maxneighs) neighbors_i(n++) = j;
+                else n++;
+              }
+            }
+          }
+        }
+
+        // only atoms with at least one neighbor get a row in the list
+        if (n > 0) {
+          neigh_list.d_numneigh(inum) = n;
+          neigh_list.d_ilist(inum++) = i;
+          if(n > neigh_list.maxneighs) {
+            resize() = 1;
+            if(n > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),n);
+          }
+        }
+      }
+    }
+    const int len = inum - inum_start; // rows actually produced by this item
+#ifdef DEBUG_SSA_BUILD_LOCALS
+    if (len != d_ssa_itemLen(workPhase, workItem + skippedItems)) {
+fprintf(stdout, "Leng%03d workphase (%2d,%3d,%3d): len  = %4d, but ssa_itemLen = %4d%s\n"
+  ,me
+  ,workPhase
+  ,workItem
+  ,workItem + skippedItems
+  ,len
+  ,d_ssa_itemLen(workPhase, workItem + skippedItems)
+  ,(len > d_ssa_itemLen(workPhase, workItem + skippedItems)) ? " OVERFLOW" : ""
+);
+    }
+#endif
+    if (len > 0) {
+      d_ssa_itemLoc(workPhase,workItem) = inum_start; // record where workItem starts in ilist
+      d_ssa_itemLen(workPhase,workItem) = len; // record actual workItem length
+      workItem++;
+#ifdef DEBUG_SSA_BUILD_LOCALS
+      inum_end = inum;
+#endif
+    } else if (firstTry) ++skippedItems;
+  }
+  }
+  }
+
+#ifdef DEBUG_SSA_BUILD_LOCALS
+fprintf(stdout, "Phas%03d phase %3d used %6d inums, workItems = %3d, skipped = %3d, inums/workItems = %g\n"
+  ,me
+  ,workPhase
+  ,inum_end - d_ssa_itemLoc(workPhase, 0)
+  ,workItem
+  ,skippedItems
+  ,(inum_end - d_ssa_itemLoc(workPhase, 0)) / (double) workItem
+);
+#endif
+  // record where workPhase actually ends
+  if (firstTry) {
+    d_ssa_phaseLen(workPhase) = workItem;
+    // zero the unused tail so later passes see those slots as empty
+    while (workItem < (int) d_ssa_itemLen.dimension_1()) {
+      d_ssa_itemLen(workPhase,workItem++) = 0;
+    }
+  }
+
+}
+
+
+// Build the neighbor-list rows for the ghost atoms of one ghost work phase.
+//
+// The phase handles the ghosts stored in ghost bin (workPhase + 1).  For
+// each such ghost, every stencil bin that lies inside the local bin extent
+// is scanned and the local atoms within the cutoff are stored as its
+// neighbors.  Ghosts with at least one neighbor are appended to d_ilist
+// starting at d_ssa_gitemLoc(workPhase, 0); the resulting row count is
+// written to d_ssa_gitemLen and the item count to d_ssa_gphaseLen.
+//
+// Neighbors beyond neigh_list.maxneighs are counted but not stored; in that
+// case resize() is set and new_maxneighs() is raised to the needed size.
+template<class DeviceType>
+void NPairSSAKokkosExecute<DeviceType>::build_ghosts_onePhase(int workPhase) const
+{
+  const typename ArrayTypes<DeviceType>::t_int_1d_const_um binOffset = d_stencil;
+  int specialCode = 0; // only reassigned when !moltemplate
+
+  // ghost bin processed by this phase
+  const int air = workPhase + 1;
+
+  //FIXME for now, there is only 1 workItem for each ghost AIR
+  int workItem = 0;
+  while (workItem < 1) {
+    // first ilist slot reserved for this work item
+    const int rowStart = d_ssa_gitemLoc(workPhase, workItem);
+    int row = rowStart;
+    const int nGhosts = c_gbincount(air);
+
+    for (int g = 0; g < nGhosts; ++g) {
+      const int i = c_gbins(air, g);
+      const int itype = type(i);
+      const X_FLOAT xi = x(i, 0);
+      const X_FLOAT yi = x(i, 1);
+      const X_FLOAT zi = x(i, 2);
+      const AtomNeighbors neighbors_i = neigh_list.get_neighbors(row);
+      int found = 0; // neighbors seen for i, may exceed maxneighs
+
+      int cell[3];
+      const int ibin = coord2bin(xi, yi, zi, &(cell[0]));
+
+      // loop over AIR ghost atoms in all bins in "full" stencil
+      // Note: the non-AIR ghost atoms have already been filtered out
+      for (int k = 0; k < nstencil; k++) {
+        const int bx = cell[0] + d_stencilxyz(k,0);
+        const int by = cell[1] + d_stencilxyz(k,1);
+        const int bz = cell[2] + d_stencilxyz(k,2);
+        // since these are ghosts, the stencil bin may be out of bounds:
+        // only bins inside the extent of local bins are scanned
+        const bool insideLocal =
+          bx >= lbinxlo && bx < lbinxhi &&
+          by >= lbinylo && by < lbinyhi &&
+          bz >= lbinzlo && bz < lbinzhi;
+        if (!insideLocal) continue;
+
+        const int jbin = ibin + binOffset(k);
+        const int nInBin = c_bincount(jbin);
+        for (int jl = 0; jl < nInBin; ++jl) {
+          const int j = c_bins(jbin, jl);
+          const int jtype = type(j);
+          if (exclude && exclusion(i,j,itype,jtype)) continue;
+
+          const X_FLOAT delx = xi - x(j, 0);
+          const X_FLOAT dely = yi - x(j, 1);
+          const X_FLOAT delz = zi - x(j, 2);
+          const X_FLOAT rsq = delx*delx + dely*dely + delz*delz;
+          if (!(rsq <= cutneighsq(itype,jtype))) continue;
+
+          // decide whether j is stored and with which encoding
+          int entry = j;
+          bool keep = true;
+          if (molecular) {
+            if (!moltemplate)
+              specialCode = find_special(j,i);
+                /* else if (jmol >= 0) */
+                /*   which = find_special(onemols[jmol]->special[jatom], */
+                /*                        onemols[jmol]->nspecial[jatom], */
+                /*                        tag[i]-jtagprev); */
+                /* else which = 0; */
+            if (specialCode != 0 && !minimum_image_check(delx,dely,delz)) {
+              if (specialCode > 0) entry = j ^ (specialCode << SBBITS);
+              else keep = false;
+            }
+          }
+          if (keep) {
+            // store while there is room, but always count
+            if (found < neigh_list.maxneighs) neighbors_i(found) = entry;
+            ++found;
+          }
+        }
+      }
+
+      if (found > 0) {
+        neigh_list.d_numneigh(row) = found;
+        neigh_list.d_ilist(row) = i;
+        ++row;
+        if (found > neigh_list.maxneighs) {
+          resize() = 1;
+          if (found > new_maxneighs()) Kokkos::atomic_fetch_max(&new_maxneighs(),found);
+        }
+      }
+    }
+    // record how many ilist rows this workItem produced
+    d_ssa_gitemLen(workPhase,workItem) = row - rowStart;
+    // if (d_ssa_gitemLen(workPhase,workItem) > 0) workItem++;
+    ++workItem;
+  }
+  d_ssa_gphaseLen(workPhase) = workItem;
+}
+
+}
+
+namespace LAMMPS_NS {
+// Explicit instantiations of the SSA neighbor-pair builder.
+// The LMPDeviceType version is always emitted; the LMPHostType version is
+// emitted in addition only when KOKKOS_HAVE_CUDA is defined
+// (presumably because the two types are otherwise the same -- TODO confirm).
+template class NPairSSAKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class NPairSSAKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/npair_ssa_kokkos.h b/src/KOKKOS/npair_ssa_kokkos.h
new file mode 100644
index 0000000000..98046feba8
--- /dev/null
+++ b/src/KOKKOS/npair_ssa_kokkos.h
@@ -0,0 +1,362 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef NPAIR_CLASS
+
+// Style registrations, seen only when NPAIR_CLASS is defined by the
+// including file (which also supplies the NPairStyle macro).
+// Host variant: style name "half/bin/newton/ssa/kk/host".
+typedef NPairSSAKokkos<LMPHostType> NPairSSAKokkosHost;
+NPairStyle(half/bin/newton/ssa/kk/host,
+           NPairSSAKokkosHost,
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST | NP_KOKKOS_HOST)
+
+// Device variant: style name "half/bin/newton/ssa/kk/device".
+typedef NPairSSAKokkos<LMPDeviceType> NPairSSAKokkosDevice;
+NPairStyle(half/bin/newton/ssa/kk/device,
+           NPairSSAKokkosDevice,
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST | NP_KOKKOS_DEVICE)
+
+#else
+
+#ifndef LMP_NPAIR_SSA_KOKKOS_H
+#define LMP_NPAIR_SSA_KOKKOS_H
+
+#include "npair.h"
+#include "neigh_list_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Kokkos neighbor-pair builder for the SSA (half/bin/newton/ssa) style.
+// Holds the SSA work plan plus copies of the neighbor, bin and stencil data
+// that build() hands to the NPairSSAKokkosExecute kernels.
+template<class DeviceType>
+class NPairSSAKokkos : public NPair {
+ public:
+  typedef ArrayTypes<DeviceType> AT;
+
+  // SSA Work plan data structures
+  // local atoms: phases, each split into work items
+  int ssa_phaseCt;                     // number of local work phases
+  DAT::tdual_int_1d k_ssa_phaseLen;    // number of work items in each phase
+  DAT::tdual_int_1d_3 k_ssa_phaseOff;  // per-phase x/y/z starting bin offsets
+  DAT::tdual_int_2d k_ssa_itemLoc;     // (phase,item): where the item starts in ilist
+  DAT::tdual_int_2d k_ssa_itemLen;     // (phase,item): number of ilist entries of the item
+  // NOTE(review): presumably DeviceType views of the dual views above -- confirm
+  typename AT::t_int_1d ssa_phaseLen;
+  typename AT::t_int_1d_3 ssa_phaseOff;
+  typename AT::t_int_2d ssa_itemLoc;
+  typename AT::t_int_2d ssa_itemLen;
+
+  // ghost atoms: same layout as the local plan
+  const int ssa_gphaseCt;              // number of ghost work phases
+  DAT::tdual_int_1d k_ssa_gphaseLen;   // number of work items in each ghost phase
+  DAT::tdual_int_2d k_ssa_gitemLoc;    // (phase,item): where the item starts in ilist
+  DAT::tdual_int_2d k_ssa_gitemLen;    // (phase,item): number of ilist entries of the item
+  // NOTE(review): presumably DeviceType views of the dual views above -- confirm
+  typename AT::t_int_1d ssa_gphaseLen;
+  typename AT::t_int_2d ssa_gitemLoc;
+  typename AT::t_int_2d ssa_gitemLen;
+
+  NPairSSAKokkos(class LAMMPS *);
+  ~NPairSSAKokkos() {}
+  void copy_neighbor_info();
+  void copy_bin_info();
+  void copy_stencil_info();
+  // builds the work plan and fills the given neighbor list
+  void build(class NeighList *);
+ private:
+  // data from Neighbor class
+
+  DAT::tdual_xfloat_2d k_cutneighsq;   // squared neighbor cutoff, indexed (itype,jtype)
+
+  // exclusion data from Neighbor class
+
+  DAT::tdual_int_1d k_ex1_type,k_ex2_type;
+  DAT::tdual_int_2d k_ex_type;
+  DAT::tdual_int_1d k_ex1_group,k_ex2_group;
+  DAT::tdual_int_1d k_ex1_bit,k_ex2_bit;
+  DAT::tdual_int_1d k_ex_mol_group;
+  DAT::tdual_int_1d k_ex_mol_bit;
+
+  // data from NBinSSA class
+
+  int atoms_per_bin;
+  DAT::tdual_int_1d k_bincount;        // number of local atoms stored per bin
+  DAT::tdual_int_2d k_bins;            // (bin,slot): local atom indices
+  int ghosts_per_gbin;
+  DAT::tdual_int_1d k_gbincount;       // number of ghost atoms stored per ghost bin
+  DAT::tdual_int_2d k_gbins;           // (ghost bin,slot): ghost atom indices
+  // extent of the bins that hold local atoms: lo inclusive, hi exclusive
+  int lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi;
+
+  // data from NStencilSSA class
+
+  int nstencil;                        // number of entries in the full stencil
+  DAT::tdual_int_1d k_stencil;  // bin index offsets, added to a bin index to reach a stencil bin
+  DAT::tdual_int_1d_3 k_stencilxyz;    // per-entry x/y/z bin offsets
+  DAT::tdual_int_1d k_nstencil_ssa;    // start of each subphase's range within the stencil
+  int sx1, sy1, sz1;                   // bin strides used when walking a phase
+};
+
+// Bundle of views and scalars used by the SSA neighbor build kernels
+// build_locals_onePhase() and build_ghosts_onePhase().
+template<class DeviceType>
+class NPairSSAKokkosExecute
+{
+  typedef ArrayTypes<DeviceType> AT;
+
+ public:
+  // neighbor list being filled (d_ilist, d_numneigh, neighbor rows)
+  NeighListKokkos<DeviceType> neigh_list;
+
+  // data from Neighbor class
+
+  // squared neighbor cutoff, indexed (itype,jtype)
+  const typename AT::t_xfloat_2d_randomread cutneighsq;
+
+  // exclusion data from Neighbor class
+
+  const int exclude;
+
+  const int nex_type;
+  const typename AT::t_int_1d_const ex1_type,ex2_type;
+  const typename AT::t_int_2d_const ex_type;
+
+  const int nex_group;
+  const typename AT::t_int_1d_const ex1_group,ex2_group;
+  const typename AT::t_int_1d_const ex1_bit,ex2_bit;
+
+  const int nex_mol;
+  const typename AT::t_int_1d_const ex_mol_group;
+  const typename AT::t_int_1d_const ex_mol_bit;
+
+  // data from NBinSSA class
+  // (the c_* members are read-only views of the same data)
+
+  const typename AT::t_int_1d bincount;
+  const typename AT::t_int_1d_const c_bincount;
+  typename AT::t_int_2d bins;
+  typename AT::t_int_2d_const c_bins;
+  const typename AT::t_int_1d gbincount;
+  const typename AT::t_int_1d_const c_gbincount;
+  typename AT::t_int_2d gbins;
+  typename AT::t_int_2d_const c_gbins;
+  // extent of the bins that hold local atoms: lo inclusive, hi exclusive
+  const int lbinxlo, lbinxhi, lbinylo, lbinyhi, lbinzlo, lbinzhi;
+
+
+  // data from NStencil class
+
+  const int nstencil;
+  const int sx1, sy1, sz1;
+  typename AT::t_int_1d d_stencil;  // bin index offsets, added to a bin index to reach a stencil bin
+  typename AT::t_int_1d_3 d_stencilxyz;   // per-entry x/y/z bin offsets
+  typename AT::t_int_1d d_nstencil_ssa;   // start of each subphase's range within the stencil
+
+  // data from Atom class
+
+  const typename AT::t_x_array_randomread x;
+  const typename AT::t_int_1d_const type,mask;
+  const typename AT::t_tagint_1d_const molecule;
+  const typename AT::t_tagint_1d_const tag;
+  const typename AT::t_tagint_2d_const special;
+  const typename AT::t_int_2d_const nspecial;
+  const int molecular;
+  int moltemplate;          // 1 if molecular == 2, else 0 (set in the constructor)
+
+  int special_flag[4];      // filled in by the caller after construction
+
+  const int nbinx,nbiny,nbinz;
+  const int mbinx,mbiny,mbinz;
+  const int mbinxlo,mbinylo,mbinzlo;
+  const X_FLOAT bininvx,bininvy,bininvz;
+  X_FLOAT bboxhi[3],bboxlo[3];
+
+  const int nlocal;
+
+  // overflow reporting: kernels set resize() and raise new_maxneighs() when
+  // an atom has more neighbors than neigh_list.maxneighs; the h_* members
+  // are the host-side counterparts
+  typename AT::t_int_scalar resize;
+  typename AT::t_int_scalar new_maxneighs;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_resize;
+  typename ArrayTypes<LMPHostType>::t_int_scalar h_new_maxneighs;
+
+  const int xperiodic, yperiodic, zperiodic;
+  // Distance thresholds used by minimum_image_check().  Stored as X_FLOAT:
+  // keeping them in an int would drop the fractional part of the value
+  // handed in by the caller and shift the threshold.
+  const X_FLOAT xprd_half, yprd_half, zprd_half;
+
+  // SSA Work plan data structures
+  int ssa_phaseCt;
+  typename AT::t_int_1d d_ssa_phaseLen;          // work items per phase
+  typename AT::t_int_1d_3_const d_ssa_phaseOff;  // per-phase x/y/z starting bin offsets
+  typename AT::t_int_2d d_ssa_itemLoc;           // (phase,item): start in ilist
+  typename AT::t_int_2d d_ssa_itemLen;           // (phase,item): number of ilist entries
+  int ssa_gphaseCt;
+  typename AT::t_int_1d d_ssa_gphaseLen;         // work items per ghost phase
+  typename AT::t_int_2d d_ssa_gitemLoc;          // (phase,item): start in ilist
+  typename AT::t_int_2d d_ssa_gitemLen;          // (phase,item): number of ilist entries
+
+  // Stores all arguments in the members of the same name (without the
+  // leading underscore), derives moltemplate from molecular, copies the
+  // two bounding-box arrays and allocates the resize/new_maxneighs scalars
+  // together with their host counterparts.
+  NPairSSAKokkosExecute(
+        const NeighListKokkos<DeviceType> &_neigh_list,
+        const typename AT::t_xfloat_2d_randomread &_cutneighsq,
+        const typename AT::t_int_1d &_bincount,
+        const typename AT::t_int_2d &_bins,
+        const typename AT::t_int_1d &_gbincount,
+        const typename AT::t_int_2d &_gbins,
+        const int _lbinxlo, const int _lbinxhi,
+        const int _lbinylo, const int _lbinyhi,
+        const int _lbinzlo, const int _lbinzhi,
+        const int _nstencil, const int _sx1, const int _sy1, const int _sz1,
+        const typename AT::t_int_1d &_d_stencil,
+        const typename AT::t_int_1d_3 &_d_stencilxyz,
+        const typename AT::t_int_1d &_d_nstencil_ssa,
+        const int _ssa_phaseCt,
+        const typename AT::t_int_1d &_d_ssa_phaseLen,
+        const typename AT::t_int_1d_3 &_d_ssa_phaseOff,
+        const typename AT::t_int_2d &_d_ssa_itemLoc,
+        const typename AT::t_int_2d &_d_ssa_itemLen,
+        const int _ssa_gphaseCt,
+        const typename AT::t_int_1d &_d_ssa_gphaseLen,
+        const typename AT::t_int_2d &_d_ssa_gitemLoc,
+        const typename AT::t_int_2d &_d_ssa_gitemLen,
+        const int _nlocal,
+        const typename AT::t_x_array_randomread &_x,
+        const typename AT::t_int_1d_const &_type,
+        const typename AT::t_int_1d_const &_mask,
+        const typename AT::t_tagint_1d_const &_molecule,
+        const typename AT::t_tagint_1d_const &_tag,
+        const typename AT::t_tagint_2d_const &_special,
+        const typename AT::t_int_2d_const &_nspecial,
+        const int &_molecular,
+        const int & _nbinx,const int & _nbiny,const int & _nbinz,
+        const int & _mbinx,const int & _mbiny,const int & _mbinz,
+        const int & _mbinxlo,const int & _mbinylo,const int & _mbinzlo,
+        const X_FLOAT &_bininvx,const X_FLOAT &_bininvy,const X_FLOAT &_bininvz,
+        const int & _exclude,const int & _nex_type,
+        const typename AT::t_int_1d_const & _ex1_type,
+        const typename AT::t_int_1d_const & _ex2_type,
+        const typename AT::t_int_2d_const & _ex_type,
+        const int & _nex_group,
+        const typename AT::t_int_1d_const & _ex1_group,
+        const typename AT::t_int_1d_const & _ex2_group,
+        const typename AT::t_int_1d_const & _ex1_bit,
+        const typename AT::t_int_1d_const & _ex2_bit,
+        const int & _nex_mol,
+        const typename AT::t_int_1d_const & _ex_mol_group,
+        const typename AT::t_int_1d_const & _ex_mol_bit,
+        const X_FLOAT *_bboxhi, const X_FLOAT* _bboxlo,
+        const int & _xperiodic, const int & _yperiodic, const int & _zperiodic,
+        const X_FLOAT & _xprd_half, const X_FLOAT & _yprd_half, const X_FLOAT & _zprd_half):
+    neigh_list(_neigh_list), cutneighsq(_cutneighsq),
+    bincount(_bincount),c_bincount(_bincount),bins(_bins),c_bins(_bins),
+    gbincount(_gbincount),c_gbincount(_gbincount),gbins(_gbins),c_gbins(_gbins),
+    lbinxlo(_lbinxlo),lbinxhi(_lbinxhi),
+    lbinylo(_lbinylo),lbinyhi(_lbinyhi),
+    lbinzlo(_lbinzlo),lbinzhi(_lbinzhi),
+    nstencil(_nstencil),sx1(_sx1),sy1(_sy1),sz1(_sz1),
+    d_stencil(_d_stencil),d_stencilxyz(_d_stencilxyz),d_nstencil_ssa(_d_nstencil_ssa),
+    ssa_phaseCt(_ssa_phaseCt),
+    d_ssa_phaseLen(_d_ssa_phaseLen),
+    d_ssa_phaseOff(_d_ssa_phaseOff),
+    d_ssa_itemLoc(_d_ssa_itemLoc),
+    d_ssa_itemLen(_d_ssa_itemLen),
+    ssa_gphaseCt(_ssa_gphaseCt),
+    d_ssa_gphaseLen(_d_ssa_gphaseLen),
+    d_ssa_gitemLoc(_d_ssa_gitemLoc),
+    d_ssa_gitemLen(_d_ssa_gitemLen),
+    nlocal(_nlocal),
+    x(_x),type(_type),mask(_mask),molecule(_molecule),
+    tag(_tag),special(_special),nspecial(_nspecial),molecular(_molecular),
+    nbinx(_nbinx),nbiny(_nbiny),nbinz(_nbinz),
+    mbinx(_mbinx),mbiny(_mbiny),mbinz(_mbinz),
+    mbinxlo(_mbinxlo),mbinylo(_mbinylo),mbinzlo(_mbinzlo),
+    bininvx(_bininvx),bininvy(_bininvy),bininvz(_bininvz),
+    exclude(_exclude),nex_type(_nex_type),
+    ex1_type(_ex1_type),ex2_type(_ex2_type),ex_type(_ex_type),
+    nex_group(_nex_group),
+    ex1_group(_ex1_group),ex2_group(_ex2_group),
+    ex1_bit(_ex1_bit),ex2_bit(_ex2_bit),nex_mol(_nex_mol),
+    ex_mol_group(_ex_mol_group),ex_mol_bit(_ex_mol_bit),
+    xperiodic(_xperiodic),yperiodic(_yperiodic),zperiodic(_zperiodic),
+    xprd_half(_xprd_half),yprd_half(_yprd_half),zprd_half(_zprd_half) {
+
+    if (molecular == 2) moltemplate = 1;
+    else moltemplate = 0;
+
+    bboxlo[0] = _bboxlo[0]; bboxlo[1] = _bboxlo[1]; bboxlo[2] = _bboxlo[2];
+    bboxhi[0] = _bboxhi[0]; bboxhi[1] = _bboxhi[1]; bboxhi[2] = _bboxhi[2];
+
+    // with KOKKOS_USE_CUDA_UVM the host handle aliases the scalar itself,
+    // otherwise it is a separate mirror view
+    resize = typename AT::t_int_scalar("NPairSSAKokkosExecute::resize");
+#ifndef KOKKOS_USE_CUDA_UVM
+    h_resize = Kokkos::create_mirror_view(resize);
+#else
+    h_resize = resize;
+#endif
+    h_resize() = 1;
+    new_maxneighs = typename AT::
+      t_int_scalar("NPairSSAKokkosExecute::new_maxneighs");
+#ifndef KOKKOS_USE_CUDA_UVM
+    h_new_maxneighs = Kokkos::create_mirror_view(new_maxneighs);
+#else
+    h_new_maxneighs = new_maxneighs;
+#endif
+    h_new_maxneighs() = neigh_list.maxneighs;
+  }
+
+  // NOTE(review): copymode is set on the contained list before it is
+  // destroyed, presumably so that its destructor leaves shared data
+  // alone -- confirm against NeighListKokkos.
+  ~NPairSSAKokkosExecute() {neigh_list.copymode = 1;}
+
+  // fill the neighbor rows of the local atoms of one work phase
+  KOKKOS_FUNCTION
+  void build_locals_onePhase(const bool firstTry, int me, int workPhase) const;
+
+  // fill the neighbor rows of the ghost atoms of one ghost work phase
+  KOKKOS_FUNCTION
+  void build_ghosts_onePhase(int workPhase) const;
+
+  // Map a coordinate to its bin.
+  //   x,y,z  coordinate to look up
+  //   i      output, 3 ints: per-dimension bin indices relative to
+  //          mbinxlo/mbinylo/mbinzlo
+  //   return flattened bin index built from those three indices
+  // Coordinates at or above bboxhi or below bboxlo are mapped to bins
+  // outside the 0..nbin-1 range of that dimension.
+  KOKKOS_INLINE_FUNCTION
+  int coord2bin(const X_FLOAT & x,const X_FLOAT & y,const X_FLOAT & z, int* i) const
+  {
+    int ix,iy,iz;
+
+    if (x >= bboxhi[0])
+      ix = static_cast<int> ((x-bboxhi[0])*bininvx) + nbinx;
+    else if (x >= bboxlo[0]) {
+      ix = static_cast<int> ((x-bboxlo[0])*bininvx);
+      ix = MIN(ix,nbinx-1);
+    } else
+      ix = static_cast<int> ((x-bboxlo[0])*bininvx) - 1;
+
+    if (y >= bboxhi[1])
+      iy = static_cast<int> ((y-bboxhi[1])*bininvy) + nbiny;
+    else if (y >= bboxlo[1]) {
+      iy = static_cast<int> ((y-bboxlo[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo[1])*bininvy) - 1;
+
+    if (z >= bboxhi[2])
+      iz = static_cast<int> ((z-bboxhi[2])*bininvz) + nbinz;
+    else if (z >= bboxlo[2]) {
+      iz = static_cast<int> ((z-bboxlo[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo[2])*bininvz) - 1;
+
+    i[0] = ix - mbinxlo;
+    i[1] = iy - mbinylo;
+    i[2] = iz - mbinzlo;
+
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int exclusion(const int &i,const int &j, const int &itype,const int &jtype) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int find_special(const int &i, const int &j) const;
+
+  // Returns 1 if, in any periodic dimension, the magnitude of the given
+  // separation component exceeds that dimension's *prd_half threshold,
+  // otherwise 0.
+  KOKKOS_INLINE_FUNCTION
+  int minimum_image_check(double dx, double dy, double dz) const {
+    if (xperiodic && fabs(dx) > xprd_half) return 1;
+    if (yperiodic && fabs(dy) > yprd_half) return 1;
+    if (zperiodic && fabs(dz) > zprd_half) return 1;
+    return 0;
+  }
+
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
new file mode 100644
index 0000000000..c559ab412f
--- /dev/null
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.cpp
@@ -0,0 +1,796 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "atom_kokkos.h"
+#include "atom_vec.h"
+#include "comm.h"
+#include "update.h"
+#include "fix.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "modify.h"
+#include "pair_dpd_fdt_energy_kokkos.h"
+#include "error.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+
+#define EPSILON 1.0e-10
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairDPDfdtEnergyKokkos<DeviceType>::PairDPDfdtEnergyKokkos(LAMMPS *lmp) :
+  PairDPDfdtEnergy(lmp),
+#ifdef DPD_USE_RAN_MARS
+  rand_pool(0 /* unused */, lmp)
+#else
+  rand_pool()
+#endif
+{
+  // both data masks start out empty
+  datamask_read = datamask_modify = EMPTY_MASK;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  atomKK = (AtomKokkos *) atom;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairDPDfdtEnergyKokkos<DeviceType>::~PairDPDfdtEnergyKokkos()
+{
+  // while copymode is set, leave all storage untouched
+  if (!copymode) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->destroy_kokkos(k_vatom,vatom);
+
+    // duCond/duMech are only released when the allocated flag is set
+    if (allocated) {
+      memory->destroy_kokkos(k_duCond,duCond);
+      memory->destroy_kokkos(k_duMech,duMech);
+    }
+    memory->destroy_kokkos(k_cutsq,cutsq);
+#ifdef DPD_USE_RAN_MARS
+    rand_pool.destroy();
+#endif
+  }
+}
+
+/* ----------------------------------------------------------------------
+   init specific to this pair style
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::init_style()
+{
+  PairDPDfdtEnergy::init_style();
+
+  // the parent class issued the most recent neighbor request; adjust that one
+  const int irequest = neighbor->nrequest - 1;
+  neighflag = lmp->kokkos->neighflag;
+
+  // record on the request whether this instantiation is the host or device one
+  const bool on_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  const bool on_host =
+    Kokkos::Impl::is_same<DeviceType,LMPHostType>::value && !on_device;
+  neighbor->requests[irequest]->kokkos_host = on_host;
+  neighbor->requests[irequest]->kokkos_device = on_device;
+
+  // full or half list, following the Kokkos neighflag setting
+  const int want_full = (neighflag == FULL);
+  const int want_half = (neighflag == HALF || neighflag == HALFTHREAD);
+  if (want_full || want_half) {
+    neighbor->requests[irequest]->full = want_full;
+    neighbor->requests[irequest]->half = want_half;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
+  }
+
+#ifdef DPD_USE_RAN_MARS
+  rand_pool.init(random,seed);
+#else
+  rand_pool.init(seed + comm->me,DeviceType::max_hardware_threads());
+#endif
+}
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(__CUDACC__)
+// CUDA specialization of init_style to properly call rand_pool.init()
+template<>
+void PairDPDfdtEnergyKokkos<Kokkos::Cuda>::init_style()
+{
+  PairDPDfdtEnergy::init_style();
+
+  // the parent class issued the most recent neighbor request; adjust that one
+  const int irequest = neighbor->nrequest - 1;
+  neighflag = lmp->kokkos->neighflag;
+
+  // record on the request whether Kokkos::Cuda is the host or device type
+  const bool on_device = Kokkos::Impl::is_same<Kokkos::Cuda,LMPDeviceType>::value;
+  const bool on_host =
+    Kokkos::Impl::is_same<Kokkos::Cuda,LMPHostType>::value && !on_device;
+  neighbor->requests[irequest]->kokkos_host = on_host;
+  neighbor->requests[irequest]->kokkos_device = on_device;
+
+  // full or half list, following the Kokkos neighflag setting
+  const int want_full = (neighflag == FULL);
+  const int want_half = (neighflag == HALF || neighflag == HALFTHREAD);
+  if (want_full || want_half) {
+    neighbor->requests[irequest]->full = want_full;
+    neighbor->requests[irequest]->half = want_half;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with dpd/fdt/energy/kk");
+  }
+
+#ifdef DPD_USE_RAN_MARS
+  rand_pool.init(random,seed);
+#else
+  rand_pool.init(seed + comm->me,4*32768 /*fake max_hardware_threads()*/);
+#endif
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  // Launch the pair kernels on DeviceType; eflag_in/vflag_in are the tally flags.
+  copymode = 1;  // the destructor returns early while this is set
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
+  else evflag = vflag_fdotr = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.template view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.template view<DeviceType>();
+  }
+
+  x = atomKK->k_x.view<DeviceType>();
+  v = atomKK->k_v.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  mass = atomKK->k_mass.view<DeviceType>();
+  rmass = atomKK->rmass;
+  dpdTheta = atomKK->k_dpdTheta.view<DeviceType>();
+  // sync coefficient tables and per-atom inputs, then flag what the kernels write
+  k_cutsq.template sync<DeviceType>();
+  k_params.template sync<DeviceType>();
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK);
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK);
+  else atomKK->modified(execution_space,F_MASK);
+
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+
+  nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  int newton_pair = force->newton_pair;
+  dtinvsqrt = 1.0/sqrt(update->dt);
+
+  int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  boltz = force->boltz;
+  ftm2v = force->ftm2v;
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev;
+  // split FDT: only the conservative a0 kernel runs, and not at all if a0_is_zero
+  if (splitFDT_flag) {
+    if (!a0_is_zero) {
+      if(atom->ntypes > MAX_TYPES_STACKPARAMS) {  // kernels read params/d_cutsq views
+        if (neighflag == HALF) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0,false> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0,false> >(0,inum),*this);
+          }
+        } else if (neighflag == HALFTHREAD) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0,false> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
+          }
+        } else if (neighflag == FULL) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,0,false> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,1,false> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,0,false> >(0,inum),*this);
+          }
+        }
+      } else {  // STACKPARAMS: kernels read the m_params/m_cutsq member arrays
+        if (neighflag == HALF) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,1,0,true> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALF,0,0,true> >(0,inum),*this);
+          }
+        } else if (neighflag == HALFTHREAD) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,1,0,true> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<HALFTHREAD,0,0,true> >(0,inum),*this);
+          }
+        } else if (neighflag == FULL) {
+          if (newton_pair) {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,1,0,true> >(0,inum),*this);
+          } else {
+            if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,1,true> >(0,inum),*this,ev);
+            else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeSplit<FULL,0,0,true> >(0,inum),*this);
+          }
+        }
+      }
+    }
+  } else {
+    // no split: the full kernel also accumulates duCond/duMech per atom
+    // Allocate memory for duCond and duMech
+    if (allocated) {
+      memory->destroy_kokkos(k_duCond,duCond);
+      memory->destroy_kokkos(k_duMech,duMech);
+    }
+    memory->create_kokkos(k_duCond,duCond,nlocal+nghost,"pair:duCond");
+    memory->create_kokkos(k_duMech,duMech,nlocal+nghost,"pair:duMech");
+    d_duCond = k_duCond.view<DeviceType>();
+    d_duMech = k_duMech.view<DeviceType>();
+    h_duCond = k_duCond.h_view;
+    h_duMech = k_duMech.h_view;
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyZero>(0,nlocal+nghost),*this);
+
+    atomKK->sync(execution_space,V_MASK | DPDTHETA_MASK | RMASS_MASK);
+    atomKK->k_mass.sync<DeviceType>();
+
+    // loop over neighbors of my atoms
+
+    if(atom->ntypes > MAX_TYPES_STACKPARAMS) {  // kernels read params/d_cutsq views
+      if (neighflag == HALF) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,0,false> >(0,inum),*this);
+        }
+      } else if (neighflag == HALFTHREAD) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0,false> >(0,inum),*this);
+        }
+      } else if (neighflag == FULL) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,0,false> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,1,false> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,0,false> >(0,inum),*this);
+        }
+      }
+    } else {  // STACKPARAMS: kernels read the m_params/m_cutsq member arrays
+      if (neighflag == HALF) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,1,0,true> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALF,0,0,true> >(0,inum),*this);
+        }
+      } else if (neighflag == HALFTHREAD) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,1,0,true> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<HALFTHREAD,0,0,true> >(0,inum),*this);
+        }
+      } else if (neighflag == FULL) {
+        if (newton_pair) {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,1,0,true> >(0,inum),*this);
+        } else {
+          if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,1,true> >(0,inum),*this,ev);
+          else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairDPDfdtEnergyComputeNoSplit<FULL,0,0,true> >(0,inum),*this);
+        }
+      }
+    }
+
+    // Communicate the ghost delta energies to the locally owned atoms
+
+    // this memory transfer can be removed when fix_dpd_fdt_energy_kokkos is added
+    k_duCond.template modify<DeviceType>();
+    k_duCond.template sync<LMPHostType>();
+    k_duMech.template modify<DeviceType>();
+    k_duMech.template sync<LMPHostType>();
+    comm->reverse_comm_pair(this);
+  }
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyZero, const int &ii) const {
+  // reset both per-atom accumulators for entry ii
+  d_duCond[ii] = d_duMech[ii] = 0.0;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int &ii, EV_FLOAT& ev) const {
+  // Conservative (a0) pair force for list entry ii; tallies go into ev when EVFLAG.
+  // The f array is atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+
+  int i,j,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double rsq,r,rinv,wd,wr,factor_dpd;
+  double evdwl = 0.0;  // stays defined when ev_tally runs without energy tallying
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+
+  for (jj = 0; jj < jnum; jj++) {
+    j = d_neighbors(i,jj);
+    factor_dpd = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    double cutsq_ij = STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype);
+    if (rsq < cutsq_ij) {
+      r = sqrt(rsq);
+      if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+      rinv = 1.0/r;
+      double cut_ij = STACKPARAMS?m_params[itype][jtype].cut:params(itype,jtype).cut;
+      wr = 1.0 - r/cut_ij;
+      wd = wr*wr;
+
+      // conservative force = a0 * wr
+      double a0_ij = STACKPARAMS?m_params[itype][jtype].a0:params(itype,jtype).a0;
+      fpair = a0_ij*wr;
+      fpair *= factor_dpd*rinv;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
+      }
+
+      if (eflag) {
+        // unshifted eng of conservative term:
+        // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/d_cut(itype,jtype));
+        // eng shifted to 0.0 at cutoff
+        evdwl = 0.5*a0_ij*cut_ij * wd;
+        evdwl *= factor_dpd;
+        if (EVFLAG)
+          ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      }
+
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS> tag, const int &ii) const {
+  EV_FLOAT scratch;  // overload without a reducer: forward with a local EV_FLOAT
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(tag, ii, scratch);
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int &ii, EV_FLOAT& ev) const {
+  // Full pair term for list entry ii: force plus duMech/duCond increments.
+  // These array are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_duCond = d_duCond;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_duMech = d_duMech;
+
+  int i,j,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,fpair;
+  double vxtmp,vytmp,vztmp,delvx,delvy,delvz;
+  double rsq,r,rinv,wd,wr,factor_dpd,uTmp;
+  double dot,randnum;
+  double evdwl = 0.0;  // stays defined when ev_tally runs without energy tallying
+  double kappa_ij, alpha_ij, theta_ij, gamma_ij;
+  double mass_i, mass_j;
+  double massinv_i, massinv_j;
+  double randPair, mu_ij;
+  // generator state taken from rand_pool; handed back by free_state() below
+  rand_type rand_gen = rand_pool.get_state();
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  vxtmp = v(i,0);
+  vytmp = v(i,1);
+  vztmp = v(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+
+  for (jj = 0; jj < jnum; jj++) {
+    j = d_neighbors(i,jj);
+    factor_dpd = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    double cutsq_ij = STACKPARAMS?m_cutsq[itype][jtype]:d_cutsq(itype,jtype);
+    if (rsq < cutsq_ij) {
+      r = sqrt(rsq);
+      if (r < EPSILON) continue;     // r can be 0.0 in DPD systems
+      rinv = 1.0/r;
+      double cut_ij = STACKPARAMS?m_params[itype][jtype].cut:params(itype,jtype).cut;
+      wr = 1.0 - r/cut_ij;
+      wd = wr*wr;
+
+      delvx = vxtmp - v(j,0);
+      delvy = vytmp - v(j,1);
+      delvz = vztmp - v(j,2);
+      dot = delx*delvx + dely*delvy + delz*delvz;
+      randnum = rand_gen.normal();
+
+      // Compute the current temperature
+      theta_ij = 0.5*(1.0/dpdTheta[i] + 1.0/dpdTheta[j]);
+      theta_ij = 1.0/theta_ij;
+
+      double sigma_ij = STACKPARAMS?m_params[itype][jtype].sigma:params(itype,jtype).sigma;
+      gamma_ij = sigma_ij*sigma_ij
+                 / (2.0*boltz*theta_ij);
+
+      // conservative force = a0 * wr
+      // drag force = -gamma * wr^2 * (delx dot delv) / r
+      // random force = sigma * wr * rnd * dtinvsqrt;
+
+      double a0_ij = STACKPARAMS?m_params[itype][jtype].a0:params(itype,jtype).a0;
+      fpair = a0_ij*wr;
+      fpair -= gamma_ij*wd*dot*rinv;
+      fpair += sigma_ij*wr*randnum*dtinvsqrt;
+      fpair *= factor_dpd*rinv;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
+      }
+
+      if (rmass) {
+        mass_i = rmass[i];
+        mass_j = rmass[j];
+      } else {
+        mass_i = mass[itype];
+        mass_j = mass[jtype];
+      }
+      massinv_i = 1.0 / mass_i;
+      massinv_j = 1.0 / mass_j;
+
+      // Compute the mechanical and conductive energy, uMech and uCond
+      mu_ij = massinv_i + massinv_j;
+      mu_ij *= ftm2v;
+
+      uTmp = gamma_ij*wd*rinv*rinv*dot*dot
+             - 0.5*sigma_ij*sigma_ij*mu_ij*wd;
+      uTmp -= sigma_ij*wr*rinv*dot*randnum*dtinvsqrt;
+      uTmp *= 0.5;
+
+      a_duMech[i] += uTmp;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        a_duMech[j] += uTmp;
+      }
+
+      // Compute uCond
+      randnum = rand_gen.normal();
+      kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa;
+      alpha_ij = sqrt(2.0*boltz*kappa_ij);
+      randPair = alpha_ij*wr*randnum*dtinvsqrt;
+
+      uTmp = kappa_ij*(1.0/dpdTheta[i] - 1.0/dpdTheta[j])*wd;
+      uTmp += randPair;
+
+      a_duCond[i] += uTmp;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        a_duCond[j] -= uTmp;
+      }
+
+      if (eflag) {
+        // unshifted eng of conservative term:
+        // evdwl = -a0[itype][jtype]*r * (1.0-0.5*r/d_cut(itype,jtype));
+        // eng shifted to 0.0 at cutoff
+        evdwl = 0.5*a0_ij*cut_ij * wd;
+        evdwl *= factor_dpd;
+        if (EVFLAG)
+          ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      }
+
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+
+  rand_pool.free_state(rand_gen);
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS> tag, const int &ii) const {
+  EV_FLOAT scratch;  // overload without a reducer: forward with a local EV_FLOAT
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(tag, ii, scratch);
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairDPDfdtEnergyKokkos<DeviceType>::allocate()
+{
+  PairDPDfdtEnergy::allocate();
+
+  const int ntypes1 = atom->ntypes + 1;   // coefficient tables are ntypes1 x ntypes1
+
+  // replace cutsq with storage backed by the k_cutsq dual view
+  memory->destroy(cutsq);
+  memory->create_kokkos(k_cutsq,cutsq,ntypes1,ntypes1,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();
+
+  k_params = Kokkos::DualView<params_dpd**,Kokkos::LayoutRight,DeviceType>("PairDPDfdtEnergy::params",ntypes1,ntypes1);
+  params = k_params.template view<DeviceType>();
+
+  // duCond/duMech are only (re)created here when splitFDT_flag is off
+  if (splitFDT_flag) return;
+  const int natoms1 = atom->nlocal + atom->nghost + 1;
+  memory->destroy(duCond);
+  memory->create_kokkos(k_duCond,duCond,natoms1,"pair:duCond");
+  d_duCond = k_duCond.view<DeviceType>();
+  h_duCond = k_duCond.h_view;
+  memory->destroy(duMech);
+  memory->create_kokkos(k_duMech,duMech,natoms1,"pair:duMech");
+  d_duMech = k_duMech.view<DeviceType>();
+  h_duMech = k_duMech.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairDPDfdtEnergyKokkos<DeviceType>::init_one(int i, int j)
+{
+  const double cutone = PairDPDfdtEnergy::init_one(i,j);
+  const double cutsq_ij = cutone*cutone;
+  // fill the host copy of the (i,j) coefficients and mirror them to (j,i)
+  params_dpd &pij = k_params.h_view(i,j);
+  pij.cut = cut[i][j];
+  pij.a0 = a0[i][j];
+  pij.sigma = sigma[i][j];
+  pij.kappa = kappa[i][j];
+  k_params.h_view(j,i) = pij;
+  if (i <= MAX_TYPES_STACKPARAMS && j <= MAX_TYPES_STACKPARAMS) {  // fits the member tables
+    m_params[i][j] = m_params[j][i] = pij;
+    m_cutsq[j][i] = m_cutsq[i][j] = cutsq_ij;
+  }
+  k_cutsq.h_view(i,j) = cutsq_ij;
+  k_cutsq.h_view(j,i) = k_cutsq.h_view(i,j);
+  k_cutsq.template modify<LMPHostType>();   // host copies were just written
+  k_params.template modify<LMPHostType>();
+  return cutone;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairDPDfdtEnergyKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  // The eatom and vatom arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_eatom = k_eatom.view<DeviceType>();
+  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_vatom = k_vatom.view<DeviceType>();
+
+  // for half lists an atom is tallied when NEWTON_PAIR is on or its index is below nlocal
+  const bool tally_i = NEWTON_PAIR || i < nlocal;
+  const bool tally_j = NEWTON_PAIR || j < nlocal;
+
+  if (eflag && eflag_atom) {
+    const E_FLOAT epairhalf = 0.5 * epair;
+    if (NEIGHFLAG==FULL) {
+      a_eatom[i] += epairhalf;
+    } else {
+      if (tally_i) a_eatom[i] += epairhalf;
+      if (tally_j) a_eatom[j] += epairhalf;
+    }
+  }
+
+  // nothing else to do unless a virial tally was requested
+  if (!vflag_either) return;
+
+  const E_FLOAT v0 = delx*delx*fpair;
+  const E_FLOAT v1 = dely*dely*fpair;
+  const E_FLOAT v2 = delz*delz*fpair;
+  const E_FLOAT v3 = delx*dely*fpair;
+  const E_FLOAT v4 = delx*delz*fpair;
+  const E_FLOAT v5 = dely*delz*fpair;
+
+  if (vflag_global) {
+    if (NEIGHFLAG==FULL) {
+      ev.v[0] += 0.5*v0;
+      ev.v[1] += 0.5*v1;
+      ev.v[2] += 0.5*v2;
+      ev.v[3] += 0.5*v3;
+      ev.v[4] += 0.5*v4;
+      ev.v[5] += 0.5*v5;
+    } else {
+      if (tally_i) {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+      if (tally_j) {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+  }
+
+  if (vflag_atom) {
+    if (NEIGHFLAG==FULL) {
+      a_vatom(i,0) += 0.5*v0;
+      a_vatom(i,1) += 0.5*v1;
+      a_vatom(i,2) += 0.5*v2;
+      a_vatom(i,3) += 0.5*v3;
+      a_vatom(i,4) += 0.5*v4;
+      a_vatom(i,5) += 0.5*v5;
+    } else {
+      if (tally_i) {
+        a_vatom(i,0) += 0.5*v0;
+        a_vatom(i,1) += 0.5*v1;
+        a_vatom(i,2) += 0.5*v2;
+        a_vatom(i,3) += 0.5*v3;
+        a_vatom(i,4) += 0.5*v4;
+        a_vatom(i,5) += 0.5*v5;
+      }
+      if (tally_j) {
+        a_vatom(j,0) += 0.5*v0;
+        a_vatom(j,1) += 0.5*v1;
+        a_vatom(j,2) += 0.5*v2;
+        a_vatom(j,3) += 0.5*v3;
+        a_vatom(j,4) += 0.5*v4;
+        a_vatom(j,5) += 0.5*v5;
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int PairDPDfdtEnergyKokkos<DeviceType>::sbmask(const int& j) const {
+  return (j >> SBBITS) & 3;  // the two bits above SBBITS; callers use it to index special_lj
+}
+
+namespace LAMMPS_NS {
+template class PairDPDfdtEnergyKokkos<LMPDeviceType>;
+#if defined(KOKKOS_HAVE_CUDA) || defined(KOKKOS_ENABLE_CUDA)  // either spelling of the CUDA macro
+template class PairDPDfdtEnergyKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
new file mode 100644
index 0000000000..424779f839
--- /dev/null
+++ b/src/KOKKOS/pair_dpd_fdt_energy_kokkos.h
@@ -0,0 +1,182 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(dpd/fdt/energy/kk,PairDPDfdtEnergyKokkos<LMPDeviceType>)  // plain kk suffix: device type
+PairStyle(dpd/fdt/energy/kk/device,PairDPDfdtEnergyKokkos<LMPDeviceType>)  // explicit device suffix
+PairStyle(dpd/fdt/energy/kk/host,PairDPDfdtEnergyKokkos<LMPHostType>)  // explicit host suffix
+
+#else
+
+#ifndef LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
+#define LMP_PAIR_DPD_FDT_ENERGY_KOKKOS_H
+
+#if !defined(DPD_USE_RAN_MARS) && !defined(DPD_USE_Random_XorShift64) && !defined(DPD_USE_Random_XorShift1024)
+#define DPD_USE_Random_XorShift64
+#endif  // XorShift64 is the default only when none of the three DPD_USE_* generators was requested
+
+#include "pair_dpd_fdt_energy.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
+#ifdef DPD_USE_RAN_MARS
+#include "rand_pool_wrap_kokkos.h"
+#else
+#include "Kokkos_Random.hpp"
+#endif
+
+namespace LAMMPS_NS {
+// empty tag types; each one appears as the first argument of an operator() overload below
+struct TagPairDPDfdtEnergyZero{};
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+struct TagPairDPDfdtEnergyComputeSplit{};
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+struct TagPairDPDfdtEnergyComputeNoSplit{};
+// Kokkos-enabled subclass of PairDPDfdtEnergy, templated on the device type
+template<class DeviceType>
+class PairDPDfdtEnergyKokkos : public PairDPDfdtEnergy {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;
+
+  PairDPDfdtEnergyKokkos(class LAMMPS *);
+  virtual ~PairDPDfdtEnergyKokkos();
+  virtual void compute(int, int);
+  void init_style();
+  double init_one(int, int);
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyZero, const int&) const;
+  // Split/NoSplit operators come in pairs: with an EV_FLOAT& argument and without one
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool STACKPARAMS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairDPDfdtEnergyComputeNoSplit<NEIGHFLAG,NEWTON_PAIR,EVFLAG,STACKPARAMS>, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                  const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int sbmask(const int& j) const;  // (j >> SBBITS) & 3
+  // cut, a0, sigma and kappa; both constructors zero all four (the int argument is unused)
+  struct params_dpd {
+    KOKKOS_INLINE_FUNCTION
+    params_dpd(){cut=0;a0=0;sigma=0;kappa=0;};
+    KOKKOS_INLINE_FUNCTION
+    params_dpd(int i){cut=0;a0=0;sigma=0;kappa=0;};
+    F_FLOAT cut,a0,sigma,kappa;
+  };
+
+  DAT::tdual_efloat_1d k_duCond,k_duMech;
+  // random-number pool and generator type, chosen at compile time by the DPD_USE_* macros
+#ifdef DPD_USE_RAN_MARS
+  RandPoolWrap rand_pool;
+  typedef RandWrap rand_type;
+#elif defined(DPD_USE_Random_XorShift64)
+  Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift64_Pool<DeviceType>::generator_type rand_type;
+#elif defined(DPD_USE_Random_XorShift1024)
+  Kokkos::Random_XorShift1024_Pool<DeviceType> rand_pool;
+  typedef typename Kokkos::Random_XorShift1024_Pool<DeviceType>::generator_type rand_type;
+#endif
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+ protected:
+  int eflag,vflag;
+  int nlocal,neighflag;
+  double dtinvsqrt;
+  double boltz,ftm2v;
+  double special_lj[4];
+
+  virtual void allocate();
+
+  Kokkos::DualView<params_dpd**,Kokkos::LayoutRight,DeviceType> k_params;
+  typename Kokkos::DualView<params_dpd**,
+    Kokkos::LayoutRight,DeviceType>::t_dev_const_um params;
+  // hardwired to space for MAX_TYPES_STACKPARAMS (12) atom types
+  params_dpd m_params[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
+  typename ArrayTypes<DeviceType>::t_x_array c_x;
+  typename ArrayTypes<DeviceType>::t_v_array_randomread v;
+  typename ArrayTypes<DeviceType>::t_f_array f;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  typename ArrayTypes<DeviceType>::t_float_1d_randomread mass;
+  double *rmass;
+  typename AT::t_efloat_1d dpdTheta;
+  typename AT::t_efloat_1d d_duCond,d_duMech;
+  HAT::t_efloat_1d h_duCond,h_duMech;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  friend void pair_virial_fdotr_compute<PairDPDfdtEnergyKokkos>(PairDPDfdtEnergyKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair dpd/fdt/energy requires ghost atoms store velocity
+
+Use the comm_modify vel yes command to enable this.
+
+E: Pair dpd/fdt/energy requires newton pair on
+
+Self-explanatory.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+*/
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.cpp b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
new file mode 100644
index 0000000000..8d65be23af
--- /dev/null
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.cpp
@@ -0,0 +1,2644 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Stan Moore (Sandia)
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_exp6_rx_kokkos.h"
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "neigh_list.h"
+#include "math_const.h"
+#include "math_special_kokkos.h"
+#include "memory.h"
+#include "error.h"
+#include "modify.h"
+#include "fix.h"
+#include <float.h>
+#include "atom_masks.h"
+#include "neigh_request.h"
+
+using namespace LAMMPS_NS;
+using namespace MathConst;
+using namespace MathSpecialKokkos;
+
+#define MAXLINE 1024
+#define DELTA 4
+// MY_EPSILON is ten times DBL_EPSILON; a literal is used if <float.h> did not define it
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+// sentinel value -1 and the predicate that compares a site value against it
+#define oneFluidApproxParameter (-1)
+#define isOneFluidApprox(_site) ( (_site) == oneFluidApproxParameter )
+// potential-type code 1 and the predicate that compares a type value against it
+#define exp6PotentialType (1)
+#define isExp6PotentialType(_type) ( (_type) == exp6PotentialType )
+
+namespace /* anonymous */
+{
+// Wall-clock helpers built on clock_gettime() with CLOCK_MONOTONIC.
+// (MPI alternative: typedef double TimerType; getTimeStamp() returns
+//  MPI_Wtime(); getElapsedTime(t0,t1) returns t1-t0.)
+typedef struct timespec TimerType;
+
+TimerType getTimeStamp(void) { TimerType now; clock_gettime( CLOCK_MONOTONIC, &now); return now; }
+// difference t1 - t0 in seconds
+double getElapsedTime( const TimerType &t0, const TimerType &t1)
+{
+   const double whole_sec = t1.tv_sec - t0.tv_sec;
+   return whole_sec + 1e-9*(t1.tv_nsec - t0.tv_nsec);
+}
+} // end namespace
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairExp6rxKokkos<DeviceType>::PairExp6rxKokkos(LAMMPS *lmp) : PairExp6rx(lmp)
+{
+  // scalar dual view that compute() checks on the host to report errors
+  k_error_flag = DAT::tdual_int_scalar("pair:error_flag");
+  datamask_modify = EMPTY_MASK;
+  datamask_read = EMPTY_MASK;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  atomKK = (AtomKokkos *) atom;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairExp6rxKokkos<DeviceType>::~PairExp6rxKokkos()
+{
+  // nothing is released while copymode is set
+  if (copymode) return;
+
+  // free the strings held by each parameter entry, then the array itself
+  for (int p = 0; p < nparams; p++) {
+    delete[] params[p].name;
+    delete[] params[p].potential;
+  }
+  memory->destroy_kokkos(k_params,params);
+  memory->destroy_kokkos(k_mol2param,mol2param);
+
+  memory->destroy_kokkos(k_cutsq,cutsq);
+  memory->destroy_kokkos(k_vatom,vatom);
+  memory->destroy_kokkos(k_eatom,eatom);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::init_style()
+{
+  PairExp6rx::init_style();
+
+  // the parent class made the most recent neighbor request; adjust it here
+
+  neighflag = lmp->kokkos->neighflag;
+  const int last = neighbor->nrequest - 1;
+
+  // flag the request as host- or device-side according to DeviceType
+  const bool on_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  const bool on_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value;
+  neighbor->requests[last]->kokkos_host = on_host && !on_device;
+  neighbor->requests[last]->kokkos_device = on_device;
+
+  // full list for FULL, half list for HALF/HALFTHREAD, anything else is an error
+  const bool want_full = (neighflag == FULL);
+  const bool want_half = (neighflag == HALF || neighflag == HALFTHREAD);
+  if (!want_full && !want_half) {
+    error->all(FLERR,"Cannot use chosen neighbor list style with exp6/rx/kk");
+  } else {
+    neighbor->requests[last]->full = want_full ? 1 : 0;
+    neighbor->requests[last]->half = want_full ? 0 : 1;
+  }
+}
+/* ----------------------------------------------------------------------
+   set up flags and views, build the mixing weights, run the pair kernels, tally results
+------------------------------------------------------------------------- */
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  TimerType t_start = getTimeStamp();  // read only by the commented-out printf at the end
+
+  copymode = 1;  // the destructor returns early while this is set
+
+  eflag = eflag_in;
+  vflag = vflag_in;
+
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
+  else evflag = vflag_fdotr = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.template view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.template view<DeviceType>();
+  }
+  // views of the atom data for this DeviceType
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+  nlocal = atom->nlocal;
+  special_lj[0] = force->special_lj[0];
+  special_lj[1] = force->special_lj[1];
+  special_lj[2] = force->special_lj[2];
+  special_lj[3] = force->special_lj[3];
+  newton_pair = force->newton_pair;
+
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
+  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
+  k_cutsq.template sync<DeviceType>();
+
+  // Initialize the Exp6 parameter data for both the local
+  // and ghost atoms. Make the parameter data persistent
+  // and exchange like any other atom property later.
+
+  TimerType t_mix_start = getTimeStamp();
+  {
+     const int np_total = nlocal + atom->nghost;
+
+     if (np_total > PairExp6ParamData.epsilon1.dimension_0()) {  // capacity too small: re-create every array
+       PairExp6ParamData.epsilon1      = typename AT::t_float_1d("PairExp6ParamData.epsilon1"     ,np_total);
+       PairExp6ParamData.alpha1        = typename AT::t_float_1d("PairExp6ParamData.alpha1"       ,np_total);
+       PairExp6ParamData.rm1           = typename AT::t_float_1d("PairExp6ParamData.rm1"          ,np_total);
+       PairExp6ParamData.mixWtSite1    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1"   ,np_total);
+       PairExp6ParamData.epsilon2      = typename AT::t_float_1d("PairExp6ParamData.epsilon2"     ,np_total);
+       PairExp6ParamData.alpha2        = typename AT::t_float_1d("PairExp6ParamData.alpha2"       ,np_total);
+       PairExp6ParamData.rm2           = typename AT::t_float_1d("PairExp6ParamData.rm2"          ,np_total);
+       PairExp6ParamData.mixWtSite2    = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2"   ,np_total);
+       PairExp6ParamData.epsilonOld1   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld1"  ,np_total);
+       PairExp6ParamData.alphaOld1     = typename AT::t_float_1d("PairExp6ParamData.alphaOld1"    ,np_total);
+       PairExp6ParamData.rmOld1        = typename AT::t_float_1d("PairExp6ParamData.rmOld1"       ,np_total);
+       PairExp6ParamData.mixWtSite1old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite1old",np_total);
+       PairExp6ParamData.epsilonOld2   = typename AT::t_float_1d("PairExp6ParamData.epsilonOld2"  ,np_total);
+       PairExp6ParamData.alphaOld2     = typename AT::t_float_1d("PairExp6ParamData.alphaOld2"    ,np_total);
+       PairExp6ParamData.rmOld2        = typename AT::t_float_1d("PairExp6ParamData.rmOld2"       ,np_total);
+       PairExp6ParamData.mixWtSite2old = typename AT::t_float_1d("PairExp6ParamData.mixWtSite2old",np_total);
+
+       PairExp6ParamDataVect.epsilon          = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon"         ,np_total);
+       PairExp6ParamDataVect.rm3              = typename AT::t_float_1d("PairExp6ParamDataVect.rm3"             ,np_total);
+       PairExp6ParamDataVect.alpha            = typename AT::t_float_1d("PairExp6ParamDataVect.alpha"           ,np_total);
+       PairExp6ParamDataVect.xMolei           = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei"          ,np_total);
+       PairExp6ParamDataVect.epsilon_old      = typename AT::t_float_1d("PairExp6ParamDataVect.epsilon_old"     ,np_total);
+       PairExp6ParamDataVect.rm3_old          = typename AT::t_float_1d("PairExp6ParamDataVect.rm3_old"         ,np_total);
+       PairExp6ParamDataVect.alpha_old        = typename AT::t_float_1d("PairExp6ParamDataVect.alpha_old"       ,np_total);
+       PairExp6ParamDataVect.xMolei_old       = typename AT::t_float_1d("PairExp6ParamDataVect.xMolei_old"      ,np_total);
+       PairExp6ParamDataVect.fractionOFA      = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFA"     ,np_total);
+       PairExp6ParamDataVect.fraction1        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction1"       ,np_total);
+       PairExp6ParamDataVect.fraction2        = typename AT::t_float_1d("PairExp6ParamDataVect.fraction2"       ,np_total);
+       PairExp6ParamDataVect.nMoleculesOFA    = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFA"   ,np_total);
+       PairExp6ParamDataVect.nMolecules1      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules1"     ,np_total);
+       PairExp6ParamDataVect.nMolecules2      = typename AT::t_float_1d("PairExp6ParamDataVect.nMolecules2"     ,np_total);
+       PairExp6ParamDataVect.nTotal           = typename AT::t_float_1d("PairExp6ParamDataVect.nTotal"          ,np_total);
+       PairExp6ParamDataVect.fractionOFAold   = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOFAold"  ,np_total);
+       PairExp6ParamDataVect.fractionOld1     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld1"    ,np_total);
+       PairExp6ParamDataVect.fractionOld2     = typename AT::t_float_1d("PairExp6ParamDataVect.fractionOld2"    ,np_total);
+       PairExp6ParamDataVect.nMoleculesOFAold = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOFAold",np_total);
+       PairExp6ParamDataVect.nMoleculesOld1   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld1"  ,np_total);
+       PairExp6ParamDataVect.nMoleculesOld2   = typename AT::t_float_1d("PairExp6ParamDataVect.nMoleculesOld2"  ,np_total);
+       PairExp6ParamDataVect.nTotalold        = typename AT::t_float_1d("PairExp6ParamDataVect.nTotalold"       ,np_total);
+     } else  // otherwise only zero the PairExp6ParamData entries
+       Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroMixingWeights>(0,np_total),*this);
+     // CUDA builds run getMixingWeights as a kernel; other builds call getMixingWeightsVect
+#ifdef KOKKOS_HAVE_CUDA
+     Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxgetMixingWeights>(0,np_total),*this);
+#else
+     int errorFlag = 0;
+     getMixingWeightsVect (np_total, errorFlag, PairExp6ParamData.epsilon1,
+                                                PairExp6ParamData.alpha1,
+                                                PairExp6ParamData.rm1,
+                                                PairExp6ParamData.mixWtSite1,
+                                                PairExp6ParamData.epsilon2,
+                                                PairExp6ParamData.alpha2,
+                                                PairExp6ParamData.rm2,
+                                                PairExp6ParamData.mixWtSite2,
+                                                PairExp6ParamData.epsilonOld1,
+                                                PairExp6ParamData.alphaOld1,
+                                                PairExp6ParamData.rmOld1,
+                                                PairExp6ParamData.mixWtSite1old,
+                                                PairExp6ParamData.epsilonOld2,
+                                                PairExp6ParamData.alphaOld2,
+                                                PairExp6ParamData.rmOld2,
+                                                PairExp6ParamData.mixWtSite2old);
+     if (errorFlag == 1)
+       error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
+     else if (errorFlag == 2)
+       error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
+#endif
+  }
+  TimerType t_mix_stop = getTimeStamp();
+  // sync the error flag to the host view and stop on a reported mixing-weight error
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view() == 1)
+    error->all(FLERR,"The number of molecules in CG particle is less than 10*DBL_EPSILON.");
+  else if (k_error_flag.h_view() == 2)
+    error->all(FLERR,"Computed fraction less than -10*DBL_EPSILON");
+
+  int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev;
+
+#ifdef KOKKOS_HAVE_CUDA  // Use atomics
+
+  if (neighflag == HALF) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALF,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<HALFTHREAD,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == FULL) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCompute<FULL,0,0> >(0,inum),*this);
+    }
+  }
+
+#else // No atomics
+  // arrays sized (num_threads, nmax); presumably per-thread copies of f/uCG/uCGnew -- confirm
+  num_threads = lmp->kokkos->num_threads;
+  int nmax = f.dimension_0();
+  if (nmax > t_f.dimension_1()) {
+    t_f = t_f_array_thread("pair_exp6_rx:t_f",num_threads,nmax);
+    t_uCG = t_efloat_1d_thread("pair_exp6_rx:t_uCG",num_threads,nmax);
+    t_uCGnew = t_efloat_1d_thread("pair_exp6_rx:t_UCGnew",num_threads,nmax);
+  }
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxZeroDupViews>(0,nmax),*this);
+
+  if (neighflag == HALF) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALF,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<HALFTHREAD,0,0> >(0,inum),*this);
+    }
+  } else if (neighflag == FULL) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,1,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,1,0> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,0,1> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxComputeNoAtomics<FULL,0,0> >(0,inum),*this);
+    }
+  }
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairExp6rxCollapseDupViews>(0,nmax),*this);
+
+#endif
+  // any non-zero flag at this point is reported as alpha_ij == 6.0
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view())
+    error->all(FLERR,"alpha_ij is 6.0 in pair exp6");
+
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+
+  copymode = 0;
+
+  //TimerType t_stop = getTimeStamp();
+  //printf("PairExp6rxKokkos::compute %f %f\n", getElapsedTime(t_start, t_stop), getElapsedTime(t_mix_start, t_mix_stop));
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroMixingWeights, const int &i) const {
+  PairExp6ParamData.epsilon1(i) = 0.0;       // entry i, site 1
+  PairExp6ParamData.alpha1(i) = 0.0;
+  PairExp6ParamData.rm1(i) = 0.0;
+  PairExp6ParamData.mixWtSite1(i) = 0.0;
+  PairExp6ParamData.epsilon2(i) = 0.0;       // entry i, site 2
+  PairExp6ParamData.alpha2(i) = 0.0;
+  PairExp6ParamData.rm2(i) = 0.0;
+  PairExp6ParamData.mixWtSite2(i) = 0.0;
+  PairExp6ParamData.epsilonOld1(i) = 0.0;    // entry i, "old" values for site 1
+  PairExp6ParamData.alphaOld1(i) = 0.0;
+  PairExp6ParamData.rmOld1(i) = 0.0;
+  PairExp6ParamData.mixWtSite1old(i) = 0.0;
+  PairExp6ParamData.epsilonOld2(i) = 0.0;    // entry i, "old" values for site 2
+  PairExp6ParamData.alphaOld2(i) = 0.0;
+  PairExp6ParamData.rmOld2(i) = 0.0;
+  PairExp6ParamData.mixWtSite2old(i) = 0.0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxgetMixingWeights, const int &i) const {
+  getMixingWeights (i, PairExp6ParamData.epsilon1(i),   // entry i of each array is passed on
+                    PairExp6ParamData.alpha1(i),
+                    PairExp6ParamData.rm1(i),
+                    PairExp6ParamData.mixWtSite1(i),
+                    PairExp6ParamData.epsilon2(i),
+                    PairExp6ParamData.alpha2(i),
+                    PairExp6ParamData.rm2(i),
+                    PairExp6ParamData.mixWtSite2(i),
+                    PairExp6ParamData.epsilonOld1(i),
+                    PairExp6ParamData.alphaOld1(i),
+                    PairExp6ParamData.rmOld1(i),
+                    PairExp6ParamData.mixWtSite1old(i),
+                    PairExp6ParamData.epsilonOld2(i),
+                    PairExp6ParamData.alphaOld2(i),
+                    PairExp6ParamData.rmOld2(i),
+                    PairExp6ParamData.mixWtSite2old(i));
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+
+  {
+    const bool one_type = (ntypes == 1);
+    if (isite1 == isite2)
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, true,false>(ii, ev);
+    else
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,true, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,true,false>(ii, ev);
+    return;
+  }
+
+  // These arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCG = uCG;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCGnew = uCGnew;
+
+  int i,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
+  double rsq,r2inv,r6inv,forceExp6,factor_lj;
+  double rCut,rCutInv,rCut2inv,rCut6inv,rCutExp,urc,durc;
+  double rm2ij,rm6ij;
+  double r,rexp;
+
+  double alphaOld12_ij, rmOld12_ij, epsilonOld12_ij;
+  double alphaOld21_ij, rmOld21_ij, epsilonOld21_ij;
+  double alpha12_ij, rm12_ij, epsilon12_ij;
+  double alpha21_ij, rm21_ij, epsilon21_ij;
+  double rminv, buck1, buck2;
+  double epsilonOld1_i,alphaOld1_i,rmOld1_i;
+  double epsilonOld1_j,alphaOld1_j,rmOld1_j;
+  double epsilonOld2_i,alphaOld2_i,rmOld2_i;
+  double epsilonOld2_j,alphaOld2_j,rmOld2_j;
+  double epsilon1_i,alpha1_i,rm1_i;
+  double epsilon1_j,alpha1_j,rm1_j;
+  double epsilon2_i,alpha2_i,rm2_i;
+  double epsilon2_j,alpha2_j,rm2_j;
+  double evdwlOldEXP6_12, evdwlOldEXP6_21, fpairOldEXP6_12, fpairOldEXP6_21;
+  double evdwlEXP6_12, evdwlEXP6_21;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
+
+  const int nRep = 12;
+  const double shift = 1.05;
+  double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;
+
+  evdwlOld = 0.0;
+  evdwl = 0.0;
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+
+  {
+     epsilon1_i     = PairExp6ParamData.epsilon1[i];
+     alpha1_i       = PairExp6ParamData.alpha1[i];
+     rm1_i          = PairExp6ParamData.rm1[i];
+     mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
+     epsilon2_i     = PairExp6ParamData.epsilon2[i];
+     alpha2_i       = PairExp6ParamData.alpha2[i];
+     rm2_i          = PairExp6ParamData.rm2[i];
+     mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
+     epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
+     alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
+     rmOld1_i       = PairExp6ParamData.rmOld1[i];
+     mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
+     epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
+     alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
+     rmOld2_i       = PairExp6ParamData.rmOld2[i];
+     mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
+  }
+
+  for (jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    factor_lj = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    if (rsq < d_cutsq(itype,jtype)) { // optimize
+      r2inv = 1.0/rsq;
+      r6inv = r2inv*r2inv*r2inv;
+
+      r = sqrt(rsq);
+      rCut2inv = 1.0/d_cutsq(itype,jtype);
+      rCut6inv = rCut2inv*rCut2inv*rCut2inv;
+      rCut = sqrt(d_cutsq(itype,jtype));
+      rCutInv = 1.0/rCut;
+
+      //
+      // A. Compute the exp-6 potential
+      //
+
+      // A1.  Get alpha, epsilon and rm for particle j
+
+      {
+         epsilon1_j     = PairExp6ParamData.epsilon1[j];
+         alpha1_j       = PairExp6ParamData.alpha1[j];
+         rm1_j          = PairExp6ParamData.rm1[j];
+         mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
+         epsilon2_j     = PairExp6ParamData.epsilon2[j];
+         alpha2_j       = PairExp6ParamData.alpha2[j];
+         rm2_j          = PairExp6ParamData.rm2[j];
+         mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
+         epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
+         alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
+         rmOld1_j       = PairExp6ParamData.rmOld1[j];
+         mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
+         epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
+         alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
+         rmOld2_j       = PairExp6ParamData.rmOld2[j];
+         mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
+      }
+
+      // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
+      alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
+      rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
+      epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
+      alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);
+      rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
+      epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
+
+      alpha12_ij = sqrt(alpha1_i*alpha2_j);
+      rm12_ij = 0.5*(rm1_i + rm2_j);
+      epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
+      alpha21_ij = sqrt(alpha2_i*alpha1_j);
+      rm21_ij = 0.5*(rm2_i + rm1_j);
+      epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
+
+      evdwlOldEXP6_12 = 0.0;
+      evdwlOldEXP6_21 = 0.0;
+      evdwlEXP6_12 = 0.0;
+      evdwlEXP6_21 = 0.0;
+      fpairOldEXP6_12 = 0.0;
+      fpairOldEXP6_21 = 0.0;
+
+      if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
+        if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
+          k_error_flag.template view<DeviceType>()() = 1;
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld12_ij;
+        buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
+        rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
+        rm2ij = rmOld12_ij*rmOld12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld12_ij;
+        urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld21_ij;
+        buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
+        buck2 = 6.0*alphaOld21_ij;
+        rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
+        rm2ij = rmOld21_ij*rmOld21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld21_ij;
+        urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        if (isite1 == isite2)
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
+        else
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
+
+        evdwlOld *= factor_lj;
+
+        uCG_i += 0.5*evdwlOld;
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+          a_uCG[j] += 0.5*evdwlOld;
+      }
+
+      if(rm12_ij!=0.0 && rm21_ij!=0.0){
+        if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
+          k_error_flag.template view<DeviceType>()() = 1;
+
+        // A3.  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rm12_ij;
+        buck1 = epsilon12_ij / (alpha12_ij - 6.0);
+        buck2 = 6.0*alpha12_ij;
+        rexp = expValue(alpha12_ij*(1.0-r*rminv));
+        rm2ij = rm12_ij*rm12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm12_ij*func_rin(alpha12_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        rminv = 1.0/rm21_ij;
+        buck1 = epsilon21_ij / (alpha21_ij - 6.0);
+        buck2 = 6.0*alpha21_ij;
+        rexp = expValue(alpha21_ij*(1.0-r*rminv));
+        rm2ij = rm21_ij*rm21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm21_ij*func_rin(alpha21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+      }
+
+      //
+      // Apply Mixing Rule to get the overall force for the CG pair
+      //
+      if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+      else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
+      }
+
+      if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+      else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
+      evdwl *= factor_lj;
+
+      uCGnew_i   += 0.5*evdwl;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        a_uCGnew[j] += 0.5*evdwl;
+      evdwl = evdwlOld;
+      if (EVFLAG)
+        ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      //if (vflag_either || eflag_atom) 
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+  a_uCG[i] += uCG_i;
+  a_uCGnew[i] += uCGnew_i;
+}
+
+// Overload without an EV_FLOAT argument: forwards to the three-argument
+// overload using a local EV_FLOAT that is discarded when this call returns.
+template<class DeviceType> template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const {
+  EV_FLOAT discarded_ev;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(), ii, discarded_ev);
+}
+
+// Experimental thread-safety using duplicated data instead of atomics
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii, EV_FLOAT& ev) const {
+  // Kernel for neighbor-list entry ii.  As written, it only dispatches to vectorized_operator and returns.
+  {
+    const bool one_type = (ntypes == 1);  // picks the OneType template flag
+    if (isite1 == isite2)                 // picks the Site1EqSite2 template flag; UseAtomics is always false here
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,true, false,false>(ii, ev);
+    else
+      if (one_type)
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,false, true>(ii, ev);
+      else
+        this->vectorized_operator<NEIGHFLAG,NEWTON_PAIR,EVFLAG,false,false,false>(ii, ev);
+    return;  // unconditional: nothing below this block is ever executed
+  }
+  // Scalar version of the same computation; unreachable while the block above returns.
+  int tid = 0;
+#ifndef KOKKOS_HAVE_CUDA
+  tid = DeviceType::hardware_thread_id();
+#endif
+  // tid is the first index of every t_f / t_uCG / t_uCGnew write below.
+  int i,jj,jnum,itype,jtype;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
+  double rsq,r2inv,r6inv,forceExp6,factor_lj;
+  double rCut,rCutInv,rCut2inv,rCut6inv,rCutExp,urc,durc;
+  double rm2ij,rm6ij;
+  double r,rexp;
+
+  double alphaOld12_ij, rmOld12_ij, epsilonOld12_ij;
+  double alphaOld21_ij, rmOld21_ij, epsilonOld21_ij;
+  double alpha12_ij, rm12_ij, epsilon12_ij;
+  double alpha21_ij, rm21_ij, epsilon21_ij;
+  double rminv, buck1, buck2;
+  double epsilonOld1_i,alphaOld1_i,rmOld1_i;
+  double epsilonOld1_j,alphaOld1_j,rmOld1_j;
+  double epsilonOld2_i,alphaOld2_i,rmOld2_i;
+  double epsilonOld2_j,alphaOld2_j,rmOld2_j;
+  double epsilon1_i,alpha1_i,rm1_i;
+  double epsilon1_j,alpha1_j,rm1_j;
+  double epsilon2_i,alpha2_i,rm2_i;
+  double epsilon2_j,alpha2_j,rm2_j;
+  double evdwlOldEXP6_12, evdwlOldEXP6_21, fpairOldEXP6_12, fpairOldEXP6_21;
+  double evdwlEXP6_12, evdwlEXP6_21;
+  double mixWtSite1old_i, mixWtSite1old_j;
+  double mixWtSite2old_i, mixWtSite2old_j;
+  double mixWtSite1_i, mixWtSite1_j;
+  double mixWtSite2_i, mixWtSite2_j;
+
+  const int nRep = 12;        // exponent of the aRep/r^nRep term used when r < rin1
+  const double shift = 1.05;  // multiplies rm*func_rin(alpha) to give rin1
+  double rin1, aRep, uin1, win1, uin1rep, rin1exp, rin6, rin6inv;
+
+  evdwlOld = 0.0;  // zeroed once here, not per neighbor
+  evdwl = 0.0;
+  // Atom i for this list entry: position, type and neighbor count.
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+  // Contributions to atom i are summed locally and stored once after the neighbor loop.
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+  // Load atom i's new and Old exp-6 parameters and mixing weights for sites 1 and 2.
+  {
+     epsilon1_i     = PairExp6ParamData.epsilon1[i];
+     alpha1_i       = PairExp6ParamData.alpha1[i];
+     rm1_i          = PairExp6ParamData.rm1[i];
+     mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
+     epsilon2_i     = PairExp6ParamData.epsilon2[i];
+     alpha2_i       = PairExp6ParamData.alpha2[i];
+     rm2_i          = PairExp6ParamData.rm2[i];
+     mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
+     epsilonOld1_i  = PairExp6ParamData.epsilonOld1[i];
+     alphaOld1_i    = PairExp6ParamData.alphaOld1[i];
+     rmOld1_i       = PairExp6ParamData.rmOld1[i];
+     mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
+     epsilonOld2_i  = PairExp6ParamData.epsilonOld2[i];
+     alphaOld2_i    = PairExp6ParamData.alphaOld2[i];
+     rmOld2_i       = PairExp6ParamData.rmOld2[i];
+     mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
+  }
+
+  for (jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    factor_lj = special_lj[sbmask(j)];  // looked up from the raw neighbor entry, before masking
+    j &= NEIGHMASK;                     // from here on j is used as an atom index
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    if (rsq < d_cutsq(itype,jtype)) { // optimize
+      r2inv = 1.0/rsq;
+      r6inv = r2inv*r2inv*r2inv;
+
+      r = sqrt(rsq);
+      rCut2inv = 1.0/d_cutsq(itype,jtype);
+      rCut6inv = rCut2inv*rCut2inv*rCut2inv;
+      rCut = sqrt(d_cutsq(itype,jtype));
+      rCutInv = 1.0/rCut;
+
+      //
+      // A. Compute the exp-6 potential
+      //
+
+      // A1.  Get alpha, epsilon and rm for particle j
+
+      {
+         epsilon1_j     = PairExp6ParamData.epsilon1[j];
+         alpha1_j       = PairExp6ParamData.alpha1[j];
+         rm1_j          = PairExp6ParamData.rm1[j];
+         mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
+         epsilon2_j     = PairExp6ParamData.epsilon2[j];
+         alpha2_j       = PairExp6ParamData.alpha2[j];
+         rm2_j          = PairExp6ParamData.rm2[j];
+         mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
+         epsilonOld1_j  = PairExp6ParamData.epsilonOld1[j];
+         alphaOld1_j    = PairExp6ParamData.alphaOld1[j];
+         rmOld1_j       = PairExp6ParamData.rmOld1[j];
+         mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
+         epsilonOld2_j  = PairExp6ParamData.epsilonOld2[j];
+         alphaOld2_j    = PairExp6ParamData.alphaOld2[j];
+         rmOld2_j       = PairExp6ParamData.rmOld2[j];
+         mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
+      }
+
+      // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
+      alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
+      rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
+      epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
+      alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);
+      rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
+      epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
+
+      alpha12_ij = sqrt(alpha1_i*alpha2_j);
+      rm12_ij = 0.5*(rm1_i + rm2_j);
+      epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
+      alpha21_ij = sqrt(alpha2_i*alpha1_j);
+      rm21_ij = 0.5*(rm2_i + rm1_j);
+      epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
+
+      evdwlOldEXP6_12 = 0.0;
+      evdwlOldEXP6_21 = 0.0;
+      evdwlEXP6_12 = 0.0;
+      evdwlEXP6_21 = 0.0;
+      fpairOldEXP6_12 = 0.0;
+      fpairOldEXP6_21 = 0.0;
+
+      if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0){
+        if(alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0)
+          k_error_flag.template view<DeviceType>()() = 1;  // alpha == 6 makes the buck1 denominator (alpha - 6) zero
+
+        // A3 (Old parameters, 1-2 site pairing).  Compute some convenient quantities for evaluating the force
+        rminv = 1.0/rmOld12_ij;
+        buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
+        rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
+        rm2ij = rmOld12_ij*rmOld12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential: urc and durc are the same expressions evaluated at rCut
+        rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld12_ij;
+        urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);
+        if(r < rin1){  // inside rin1: aRep/r^nRep form, offset so the energy equals uin1 at r == rin1
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+
+        // A3 (Old parameters, 2-1 site pairing).  Same evaluation with the roles of the two sites swapped
+        rminv = 1.0/rmOld21_ij;
+        buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
+        buck2 = 6.0*alphaOld21_ij;
+        rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
+        rm2ij = rmOld21_ij*rmOld21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
+        buck2 = 6.0*alphaOld21_ij;  // same value as already assigned a few lines above
+        urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          forceExp6 = double(nRep)*aRep/powint(r,nRep);
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+          fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+          evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+        // Mix the two Old site-pair energies; only the 1-2 term is used when isite1 == isite2.
+        if (isite1 == isite2)
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
+        else
+          evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
+
+        evdwlOld *= factor_lj;
+
+        uCG_i += 0.5*evdwlOld;  // half of the pair energy goes to i; j receives the other half below for half lists
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+          t_uCG(tid,j) += 0.5*evdwlOld;
+      }
+      // New (non-Old) parameters: only energies are evaluated in this branch, no force terms.
+      if(rm12_ij!=0.0 && rm21_ij!=0.0){
+        if(alpha21_ij == 6.0 || alpha12_ij == 6.0)
+          k_error_flag.template view<DeviceType>()() = 1;  // alpha == 6 makes the buck1 denominator (alpha - 6) zero
+
+        // A3 (new parameters, 1-2 site pairing).  Compute some convenient quantities for evaluating the energy
+        rminv = 1.0/rm12_ij;
+        buck1 = epsilon12_ij / (alpha12_ij - 6.0);
+        buck2 = 6.0*alpha12_ij;
+        rexp = expValue(alpha12_ij*(1.0-r*rminv));
+        rm2ij = rm12_ij*rm12_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm12_ij*func_rin(alpha12_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_12 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+        // Same evaluation for the new-parameter 2-1 site pairing.
+        rminv = 1.0/rm21_ij;
+        buck1 = epsilon21_ij / (alpha21_ij - 6.0);
+        buck2 = 6.0*alpha21_ij;
+        rexp = expValue(alpha21_ij*(1.0-r*rminv));
+        rm2ij = rm21_ij*rm21_ij;
+        rm6ij = rm2ij*rm2ij*rm2ij;
+
+        // Compute the shifted potential
+        rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
+        urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
+        durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+        rin1 = shift*rm21_ij*func_rin(alpha21_ij);
+
+        if(r < rin1){
+          rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+          rin6inv = 1.0/rin6;
+
+          rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
+
+          uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+          win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+          aRep = win1*powint(rin1,nRep)/nRep;
+
+          uin1rep = aRep/powint(rin1,nRep);
+
+          evdwlEXP6_21 = uin1 - uin1rep + aRep/powint(r,nRep);
+        } else {
+          evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+        }
+      }
+
+      //
+      // Apply Mixing Rule to get the overall force for the CG pair (built from the Old-parameter terms only)
+      //
+      if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+      else fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        t_f(tid,j,0) -= delx*fpair;
+        t_f(tid,j,1) -= dely*fpair;
+        t_f(tid,j,2) -= delz*fpair;
+      }
+      // Mixed energy from the new parameters; it is accumulated into uCGnew only.
+      if (isite1 == isite2) evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+      else evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
+      evdwl *= factor_lj;
+
+      uCGnew_i += 0.5*evdwl;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        t_uCGnew(tid,j) += 0.5*evdwl;
+      evdwl = evdwlOld;  // the tally below uses the Old-parameter energy. NOTE(review): evdwlOld is only reassigned inside the rmOld branch above, so it can carry over from an earlier neighbor -- confirm intended
+      if (EVFLAG)
+        ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      //if (vflag_either || eflag_atom)
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair,delx,dely,delz);
+    }
+  }
+  // Add atom i's accumulated totals to the tid-indexed copies.
+  t_f(tid,i,0) += fx_i;
+  t_f(tid,i,1) += fy_i;
+  t_f(tid,i,2) += fz_i;
+  t_uCG(tid,i) += uCG_i;
+  t_uCGnew(tid,i) += uCGnew_i;
+}
+
+// Experimental thread-safe approach using duplicated data instead of atomics and
+// temporary local short vector arrays for the inner j-loop to increase vectorization.
+
+// Integer power selected at compile time: returns x^n.  Only n == 12 is
+// implemented (enforced by the static_assert); the unnamed int argument is ignored.
+template<int n>
+  KOKKOS_INLINE_FUNCTION
+double __powint(const double& x, const int)
+{
+   static_assert(n == 12, "__powint<> only supports specific integer powers.");
+
+   // x^12 = (x^3)^4.  The result is returned unconditionally (not from inside
+   // "if (n == 12)") so no path can fall off the end of this non-void function.
+   const double x3 = x*x*x;
+   return x3*x3*x3*x3;
+}
+
+template<class DeviceType>
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics, bool OneType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::vectorized_operator(const int &ii, EV_FLOAT& ev) const
+{
+  // These arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCG = uCG;
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_uCGnew = uCGnew;
+
+  int tid = 0;
+#ifndef KOKKOS_HAVE_CUDA
+  tid = DeviceType::hardware_thread_id();
+#endif
+
+  const int nRep = 12;
+  const double shift = 1.05;
+
+  const int i = d_ilist[ii];
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+  const int itype = type[i];
+  const int jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+
+  // Constant values for this atom.
+  const double epsilon1_i      = PairExp6ParamData.epsilon1[i];
+  const double alpha1_i        = PairExp6ParamData.alpha1[i];
+  const double rm1_i           = PairExp6ParamData.rm1[i];
+  const double mixWtSite1_i    = PairExp6ParamData.mixWtSite1[i];
+  const double epsilon2_i      = PairExp6ParamData.epsilon2[i];
+  const double alpha2_i        = PairExp6ParamData.alpha2[i];
+  const double rm2_i           = PairExp6ParamData.rm2[i];
+  const double mixWtSite2_i    = PairExp6ParamData.mixWtSite2[i];
+  const double epsilonOld1_i   = PairExp6ParamData.epsilonOld1[i];
+  const double alphaOld1_i     = PairExp6ParamData.alphaOld1[i];
+  const double rmOld1_i        = PairExp6ParamData.rmOld1[i];
+  const double mixWtSite1old_i = PairExp6ParamData.mixWtSite1old[i];
+  const double epsilonOld2_i   = PairExp6ParamData.epsilonOld2[i];
+  const double alphaOld2_i     = PairExp6ParamData.alphaOld2[i];
+  const double rmOld2_i        = PairExp6ParamData.rmOld2[i];
+  const double mixWtSite2old_i = PairExp6ParamData.mixWtSite2old[i];
+
+  const double cutsq_type11 = d_cutsq(1,1);
+  const double rCut2inv_type11 = 1.0/ cutsq_type11;
+  const double rCut6inv_type11 = rCut2inv_type11*rCut2inv_type11*rCut2inv_type11;
+  const double rCut_type11 = sqrt( cutsq_type11 );
+  const double rCutInv_type11 = 1.0/rCut_type11;
+
+  // Do error testing locally.
+  bool hasError = false;
+
+  // Process this many neighbors concurrently -- if possible.
+  const int batchSize = 8;
+
+  int neigh_j[batchSize];
+  double evdwlOld_j[batchSize];
+  double uCGnew_j[batchSize];
+  double fpair_j[batchSize];
+  double delx_j[batchSize];
+  double dely_j[batchSize];
+  double delz_j[batchSize];
+  double cutsq_j[batchSize];
+
+  for (int jptr = 0; jptr < jnum; )
+  {
+    // The core computation here is very expensive so let's only bother with
+    // those that pass rsq < cutsq.
+
+    for (int j = 0; j < batchSize; ++j)
+    {
+      evdwlOld_j[j] = 0.0;
+      uCGnew_j[j] = 0.0;
+      fpair_j[j] = 0.0;
+      //delx_j[j] = 0.0;
+      //dely_j[j] = 0.0;
+      //delz_j[j] = 0.0;
+      //cutsq_j[j] = 0.0;
+    }
+
+    int niters = 0;
+
+    for (; (jptr < jnum) && (niters < batchSize); ++jptr)
+    {
+      const int j = d_neighbors(i,jptr) & NEIGHMASK;
+
+      const double delx = xtmp - x(j,0);
+      const double dely = ytmp - x(j,1);
+      const double delz = ztmp - x(j,2);
+
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      const int jtype = type[j];
+
+      const double cutsq_ij = (OneType) ? cutsq_type11 : d_cutsq(itype,jtype);
+
+      if (rsq < cutsq_ij)
+      {
+        delx_j [niters] = delx;
+        dely_j [niters] = dely;
+        delz_j [niters] = delz;
+        if (OneType == false)
+          cutsq_j[niters] = cutsq_ij;
+
+        neigh_j[niters] = d_neighbors(i,jptr);
+
+        ++niters;
+      }
+    }
+
+    // reduction here.
+    #pragma simd reduction(+: fx_i, fy_i, fz_i, uCG_i, uCGnew_i) reduction(|: hasError)
+    for (int jlane = 0; jlane < niters; jlane++)
+    {
+      int j = neigh_j[jlane];
+      const double factor_lj = special_lj[sbmask(j)];
+      j &= NEIGHMASK;
+
+      const double delx = delx_j[jlane];
+      const double dely = dely_j[jlane];
+      const double delz = delz_j[jlane];
+
+      const double rsq = delx*delx + dely*dely + delz*delz;
+      // const int jtype = type[j];
+
+      // if (rsq < d_cutsq(itype,jtype)) // optimize
+      {
+        const double r2inv = 1.0/rsq;
+        const double r6inv = r2inv*r2inv*r2inv;
+
+        const double r = sqrt(rsq);
+        const double rCut2inv = (OneType) ? rCut2inv_type11 : (1.0/ cutsq_j[jlane]);
+        const double rCut6inv = (OneType) ? rCut6inv_type11 : (rCut2inv*rCut2inv*rCut2inv);
+        const double rCut =     (OneType) ? rCut_type11     : (sqrt( cutsq_j[jlane] ));
+        const double rCutInv =  (OneType) ? rCutInv_type11  : (1.0/rCut);
+
+        //
+        // A. Compute the exp-6 potential
+        //
+
+        // A1.  Get alpha, epsilon and rm for particle j
+
+        const double epsilon1_j      = PairExp6ParamData.epsilon1[j];
+        const double alpha1_j        = PairExp6ParamData.alpha1[j];
+        const double rm1_j           = PairExp6ParamData.rm1[j];
+        const double mixWtSite1_j    = PairExp6ParamData.mixWtSite1[j];
+        const double epsilon2_j      = PairExp6ParamData.epsilon2[j];
+        const double alpha2_j        = PairExp6ParamData.alpha2[j];
+        const double rm2_j           = PairExp6ParamData.rm2[j];
+        const double mixWtSite2_j    = PairExp6ParamData.mixWtSite2[j];
+        const double epsilonOld1_j   = PairExp6ParamData.epsilonOld1[j];
+        const double alphaOld1_j     = PairExp6ParamData.alphaOld1[j];
+        const double rmOld1_j        = PairExp6ParamData.rmOld1[j];
+        const double mixWtSite1old_j = PairExp6ParamData.mixWtSite1old[j];
+        const double epsilonOld2_j   = PairExp6ParamData.epsilonOld2[j];
+        const double alphaOld2_j     = PairExp6ParamData.alphaOld2[j];
+        const double rmOld2_j        = PairExp6ParamData.rmOld2[j];
+        const double mixWtSite2old_j = PairExp6ParamData.mixWtSite2old[j];
+
+        // A2.  Apply Lorentz-Berthelot mixing rules for the i-j pair
+        const double alphaOld12_ij = sqrt(alphaOld1_i*alphaOld2_j);
+        const double rmOld12_ij = 0.5*(rmOld1_i + rmOld2_j);
+        const double epsilonOld12_ij = sqrt(epsilonOld1_i*epsilonOld2_j);
+        const double alphaOld21_ij = sqrt(alphaOld2_i*alphaOld1_j);
+        const double rmOld21_ij = 0.5*(rmOld2_i + rmOld1_j);
+        const double epsilonOld21_ij = sqrt(epsilonOld2_i*epsilonOld1_j);
+
+        const double alpha12_ij = sqrt(alpha1_i*alpha2_j);
+        const double rm12_ij = 0.5*(rm1_i + rm2_j);
+        const double epsilon12_ij = sqrt(epsilon1_i*epsilon2_j);
+        const double alpha21_ij = sqrt(alpha2_i*alpha1_j);
+        const double rm21_ij = 0.5*(rm2_i + rm1_j);
+        const double epsilon21_ij = sqrt(epsilon2_i*epsilon1_j);
+
+        double evdwlOldEXP6_12 = 0.0;
+        double evdwlOldEXP6_21 = 0.0;
+        double evdwlEXP6_12 = 0.0;
+        double evdwlEXP6_21 = 0.0;
+        double fpairOldEXP6_12 = 0.0;
+        double fpairOldEXP6_21 = 0.0;
+
+        if(rmOld12_ij!=0.0 && rmOld21_ij!=0.0)
+        {
+          hasError |= (alphaOld21_ij == 6.0 || alphaOld12_ij == 6.0);
+
+          // A3.  Compute some convenient quantities for evaluating the force
+          double rminv = 1.0/rmOld12_ij;
+          double buck1 = epsilonOld12_ij / (alphaOld12_ij - 6.0);
+          double rexp = expValue(alphaOld12_ij*(1.0-r*rminv));
+          double rm2ij = rmOld12_ij*rmOld12_ij;
+          double rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          double rCutExp = expValue(alphaOld12_ij*(1.0-rCut*rminv));
+          double buck2 = 6.0*alphaOld12_ij;
+          double urc = buck1*(6.0*rCutExp - alphaOld12_ij*rm6ij*rCut6inv);
+          double durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+          double rin1 = shift*rmOld12_ij*func_rin(alphaOld12_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alphaOld12_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alphaOld12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            const double forceExp6 = double(nRep)*aRep/__powint<12>(r,nRep);
+            fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_12 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            const double forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+            fpairOldEXP6_12 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_12 = buck1*(6.0*rexp - alphaOld12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+
+          // A3.  Compute some convenient quantities for evaluating the force
+          rminv = 1.0/rmOld21_ij;
+          buck1 = epsilonOld21_ij / (alphaOld21_ij - 6.0);
+          buck2 = 6.0*alphaOld21_ij;
+          rexp = expValue(alphaOld21_ij*(1.0-r*rminv));
+          rm2ij = rmOld21_ij*rmOld21_ij;
+          rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          rCutExp = expValue(alphaOld21_ij*(1.0-rCut*rminv));
+          buck2 = 6.0*alphaOld21_ij;
+          urc = buck1*(6.0*rCutExp - alphaOld21_ij*rm6ij*rCut6inv);
+          durc = -buck1*buck2*(rCutExp* rminv - rCutInv*rm6ij*rCut6inv);
+          rin1 = shift*rmOld21_ij*func_rin(alphaOld21_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alphaOld21_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alphaOld21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            const double forceExp6 = double(nRep)*aRep/__powint<12>(r,nRep);
+            fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_21 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            const double forceExp6 = buck1*buck2*(r*rexp*rminv - rm6ij*r6inv) + r*durc;
+            fpairOldEXP6_21 = factor_lj*forceExp6*r2inv;
+
+            evdwlOldEXP6_21 = buck1*(6.0*rexp - alphaOld21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+
+          double evdwlOld;
+          if (Site1EqSite2)
+            evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12;
+          else
+            evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwlOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*evdwlOldEXP6_21;
+
+          evdwlOld *= factor_lj;
+
+          uCG_i += 0.5*evdwlOld;
+
+          evdwlOld_j[jlane] = evdwlOld;
+        }
+
+        if(rm12_ij!=0.0 && rm21_ij!=0.0)
+        {
+          hasError |= (alpha21_ij == 6.0 || alpha12_ij == 6.0);
+
+          // A3.  Compute some convenient quantities for evaluating the force
+          double rminv = 1.0/rm12_ij;
+          double buck1 = epsilon12_ij / (alpha12_ij - 6.0);
+          double buck2 = 6.0*alpha12_ij;
+          double rexp = expValue(alpha12_ij*(1.0-r*rminv));
+          double rm2ij = rm12_ij*rm12_ij;
+          double rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          double rCutExp = expValue(alpha12_ij*(1.0-rCut*rminv));
+          double urc = buck1*(6.0*rCutExp - alpha12_ij*rm6ij*rCut6inv);
+          double durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+          double rin1 = shift*rm12_ij*func_rin(alpha12_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alpha12_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alpha12_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            evdwlEXP6_12 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            evdwlEXP6_12 = buck1*(6.0*rexp - alpha12_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+
+          rminv = 1.0/rm21_ij;
+          buck1 = epsilon21_ij / (alpha21_ij - 6.0);
+          buck2 = 6.0*alpha21_ij;
+          rexp = expValue(alpha21_ij*(1.0-r*rminv));
+          rm2ij = rm21_ij*rm21_ij;
+          rm6ij = rm2ij*rm2ij*rm2ij;
+
+          // Compute the shifted potential
+          rCutExp = expValue(alpha21_ij*(1.0-rCut*rminv));
+          urc = buck1*(6.0*rCutExp - alpha21_ij*rm6ij*rCut6inv);
+          durc = -buck1*buck2*(rCutExp*rminv - rCutInv*rm6ij*rCut6inv);
+          rin1 = shift*rm21_ij*func_rin(alpha21_ij);
+
+          if(r < rin1){
+            const double rin6 = rin1*rin1*rin1*rin1*rin1*rin1;
+            const double rin6inv = 1.0/rin6;
+
+            const double rin1exp = expValue(alpha21_ij*(1.0-rin1*rminv));
+
+            const double uin1 = buck1*(6.0*rin1exp - alpha21_ij*rm6ij*rin6inv) - urc - durc*(rin1-rCut);
+
+            const double win1 = buck1*buck2*(rin1*rin1exp*rminv - rm6ij*rin6inv) + rin1*durc;
+
+            const double aRep = win1*__powint<12>(rin1,nRep)/nRep;
+
+            const double uin1rep = aRep/__powint<12>(rin1,nRep);
+
+            evdwlEXP6_21 = uin1 - uin1rep + aRep/__powint<12>(r,nRep);
+          } else {
+            evdwlEXP6_21 = buck1*(6.0*rexp - alpha21_ij*rm6ij*r6inv) - urc - durc*(r-rCut);
+          }
+        }
+
+        //
+        // Apply Mixing Rule to get the overall force for the CG pair
+        //
+        double fpair;
+        if (Site1EqSite2)
+          fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12;
+        else
+          fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpairOldEXP6_12 + sqrt(mixWtSite2old_i*mixWtSite1old_j)*fpairOldEXP6_21;
+
+        double evdwl;
+        if (Site1EqSite2)
+          evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12;
+        else
+          evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwlEXP6_12 + sqrt(mixWtSite2_i*mixWtSite1_j)*evdwlEXP6_21;
+
+        evdwl *= factor_lj;
+
+        fpair_j[jlane] = fpair;
+
+        fx_i += delx*fpair;
+        fy_i += dely*fpair;
+        fz_i += delz*fpair;
+
+        uCGnew_i += 0.5*evdwl;
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD))
+          uCGnew_j[jlane] = 0.5*evdwl;
+
+      } // if rsq < cutsq
+
+    } // end jlane loop.
+
+    for (int jlane = 0; jlane < niters; jlane++)
+    {
+      const int j = neigh_j[jlane] & NEIGHMASK;
+
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        if (UseAtomics)
+          a_uCG(j) += 0.5*evdwlOld_j[jlane];
+        else
+          t_uCG(tid,j) += 0.5*evdwlOld_j[jlane];
+
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        if (UseAtomics)
+          a_uCGnew(j) += uCGnew_j[jlane];
+        else
+          t_uCGnew(tid,j) += uCGnew_j[jlane];
+
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        if (UseAtomics)
+        {
+          a_f(j,0) -= delx_j[jlane]*fpair_j[jlane];
+          a_f(j,1) -= dely_j[jlane]*fpair_j[jlane];
+          a_f(j,2) -= delz_j[jlane]*fpair_j[jlane];
+        }
+        else
+        {
+          t_f(tid,j,0) -= delx_j[jlane]*fpair_j[jlane];
+          t_f(tid,j,1) -= dely_j[jlane]*fpair_j[jlane];
+          t_f(tid,j,2) -= delz_j[jlane]*fpair_j[jlane];
+        }
+      }
+
+      double evdwl = evdwlOld_j[jlane];
+      if (EVFLAG)
+        ev.evdwl += (((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR||(j<nlocal)))?1.0:0.5)*evdwl;
+      //if (vflag_either || eflag_atom) 
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,evdwl,fpair_j[jlane],delx_j[jlane],dely_j[jlane],delz_j[jlane]);
+    }
+  }
+
+  if (hasError)
+    k_error_flag.template view<DeviceType>()() = 1;
+
+  if (UseAtomics)
+  {
+    a_f(i,0) += fx_i;
+    a_f(i,1) += fy_i;
+    a_f(i,2) += fz_i;
+    a_uCG(i) += uCG_i;
+    a_uCGnew(i) += uCGnew_i;
+  }
+  else
+  {
+    t_f(tid,i,0) += fx_i;
+    t_f(tid,i,1) += fy_i;
+    t_f(tid,i,2) += fz_i;
+    t_uCG(tid,i) += uCG_i;
+    t_uCGnew(tid,i) += uCGnew_i;
+  }
+}
+
+// Overload of the NoAtomics compute operator that takes no reduction argument.
+// It forwards to the three-argument overload, handing it a local EV_FLOAT
+// whose contents are not used after the call returns.
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int &ii) const
+{
+  typedef TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG> ComputeTag;
+
+  EV_FLOAT ev_scratch;
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG>(ComputeTag(), ii, ev_scratch);
+}
+
+// For index i, adds every per-thread slot (0 .. num_threads-1) of the
+// duplicated views t_f, t_uCG and t_uCGnew into f, uCG and uCGnew.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxCollapseDupViews, const int &i) const
+{
+  for (int ithread = 0; ithread < num_threads; ++ithread) {
+    // three force components
+    for (int dim = 0; dim < 3; ++dim)
+      f(i,dim) += t_f(ithread,i,dim);
+
+    uCG(i) += t_uCG(ithread,i);
+    uCGnew(i) += t_uCGnew(ithread,i);
+  }
+}
+
+// For index i, resets every per-thread slot (0 .. num_threads-1) of the
+// duplicated views t_f, t_uCG and t_uCGnew to zero.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::operator()(TagPairExp6rxZeroDupViews, const int &i) const
+{
+  for (int ithread = 0; ithread < num_threads; ++ithread) {
+    // three force components
+    for (int dim = 0; dim < 3; ++dim)
+      t_f(ithread,i,dim) = 0.0;
+
+    t_uCG(ithread,i) = 0.0;
+    t_uCGnew(ithread,i) = 0.0;
+  }
+}
+
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+// Creates the per-type-pair arrays setflag, cutsq and cut, each with
+// (ntypes+1) x (ntypes+1) entries, and clears setflag for 1 <= i <= j <= ntypes.
+// cutsq is created together with k_cutsq via memory->create_kokkos; d_cutsq is
+// the DeviceType view of k_cutsq, which is then marked modified on the host.
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::allocate()
+{
+  allocated = 1;
+  ntypes = atom->ntypes;
+
+  const int ndim = ntypes + 1;
+
+  memory->create(setflag,ndim,ndim,"pair:setflag");
+  for (int itype = 1; itype <= ntypes; ++itype) {
+    for (int jtype = itype; jtype <= ntypes; ++jtype) {
+      setflag[itype][jtype] = 0;
+    }
+  }
+
+  memory->create_kokkos(k_cutsq,cutsq,ndim,ndim,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();
+  k_cutsq.template modify<LMPHostType>();
+
+  memory->create(cut,ndim,ndim,"pair:cut_lj");
+}
+
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+// Delegates argument handling to PairExp6rx::coeff, then, when polynomial
+// scaling is selected, copies the six alpha/epsilon/rm coefficients into the
+// s_coeff* arrays. Finally k_params is marked modified on the host, synced to
+// DeviceType, and d_params is refreshed from it.
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  PairExp6rx::coeff(narg,arg);
+
+  if (scalingFlag == POLYNOMIAL) {
+    for (int k = 0; k < 6; ++k) {
+      s_coeffAlpha[k] = coeffAlpha[k];
+      s_coeffEps[k] = coeffEps[k];
+      s_coeffRm[k] = coeffRm[k];
+    }
+  }
+
+  k_params.template modify<LMPHostType>();
+  k_params.template sync<DeviceType>();
+  d_params = k_params.template view<DeviceType>();
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Reads the exp6/rx potential file and fills params[] / k_params.
+//
+// The file is opened and read on proc 0 only; every line is broadcast to all
+// procs so that each one parses the same text. An entry consists of exactly
+// five whitespace-separated words and may span several lines:
+//   <species name> <potential name> <alpha> <epsilon> <rm>
+// Text after '#' is ignored. Entries whose species name does not match any
+// atom->dname[] are skipped. Only the "exp6" potential name is accepted.
+//
+// Errors (via error->one / error->all): file cannot be opened, an entry does
+// not have five words, unknown potential name, epsilon <= 0, rm <= 0 or
+// alpha < 0.
+//
+// NOTE(review): params[].potentialType is not assigned in this function even
+// though getMixingWeights() compares it against exp6PotentialType; presumably
+// it is set elsewhere -- confirm.
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::read_file(char *file)
+{
+  int params_per_line = 5;
+  // one extra slot for the terminating NULL written by the strtok loop below
+  char **words = new char*[params_per_line+1];
+
+  // discard any previously read parameter table
+  memory->destroy_kokkos(k_params,params);
+  params = NULL;
+  nparams = maxparam = 0;
+
+  // open file on proc 0
+
+  FILE *fp;
+  fp = NULL;
+  if (comm->me == 0) {
+    fp = force->open_potential(file);
+    if (fp == NULL) {
+      char str[128];
+      // bounded formatting: a long path is truncated instead of overrunning str
+      snprintf(str,sizeof(str),"Cannot open exp6/rx potential file %s",file);
+      error->one(FLERR,str);
+    }
+  }
+
+  // read each set of params from potential file
+  // one set of params can span multiple lines
+
+  int n,nwords,ispecies;
+  char line[MAXLINE],*ptr;
+  int eof = 0;
+
+  while (1) {
+    if (comm->me == 0) {
+      ptr = fgets(line,MAXLINE,fp);
+      if (ptr == NULL) {
+        eof = 1;
+        fclose(fp);
+      } else n = strlen(line) + 1;
+    }
+    // eof is broadcast first so all procs leave the loop together
+    MPI_Bcast(&eof,1,MPI_INT,0,world);
+    if (eof) break;
+    MPI_Bcast(&n,1,MPI_INT,0,world);
+    MPI_Bcast(line,n,MPI_CHAR,0,world);
+
+    // strip comment, skip line if blank
+
+    if ((ptr = strchr(line,'#'))) *ptr = '\0';
+    nwords = atom->count_words(line);
+    if (nwords == 0) continue;
+
+    // concatenate additional lines until have params_per_line words
+
+    while (nwords < params_per_line) {
+      n = strlen(line);
+      if (comm->me == 0) {
+        ptr = fgets(&line[n],MAXLINE-n,fp);
+        if (ptr == NULL) {
+          eof = 1;
+          fclose(fp);
+        } else n = strlen(line) + 1;
+      }
+      MPI_Bcast(&eof,1,MPI_INT,0,world);
+      if (eof) break;
+      MPI_Bcast(&n,1,MPI_INT,0,world);
+      MPI_Bcast(line,n,MPI_CHAR,0,world);
+      if ((ptr = strchr(line,'#'))) *ptr = '\0';
+      nwords = atom->count_words(line);
+    }
+
+    // also reached when the file ended in the middle of an entry
+    if (nwords != params_per_line)
+      error->all(FLERR,"Incorrect format in exp6/rx potential file");
+
+    // words = ptrs to all words in line
+
+    nwords = 0;
+    words[nwords++] = strtok(line," \t\n\r\f");
+    while ((words[nwords++] = strtok(NULL," \t\n\r\f"))) continue;
+
+    // skip entries for species that are not present in atom->dname
+    for (ispecies = 0; ispecies < nspecies; ispecies++)
+      if (strcmp(words[0],&atom->dname[ispecies][0]) == 0) break;
+    if (ispecies == nspecies) continue;
+
+    // load up parameter settings and error check their values
+
+    if (nparams == maxparam) {
+      k_params.template modify<LMPHostType>();
+      maxparam += DELTA;
+      memory->grow_kokkos(k_params,params,maxparam,
+                          "pair:params");
+    }
+
+    params[nparams].ispecies = ispecies;
+
+    n = strlen(&atom->dname[ispecies][0]) + 1;
+    params[nparams].name = new char[n];
+    strcpy(params[nparams].name,&atom->dname[ispecies][0]);
+
+    n = strlen(words[1]) + 1;
+    params[nparams].potential = new char[n];
+    strcpy(params[nparams].potential,words[1]);
+    if (strcmp(params[nparams].potential,"exp6") == 0){
+      params[nparams].alpha = atof(words[2]);
+      params[nparams].epsilon = atof(words[3]);
+      params[nparams].rm = atof(words[4]);
+      if (params[nparams].epsilon <= 0.0 || params[nparams].rm <= 0.0 ||
+          params[nparams].alpha < 0.0)
+        error->all(FLERR,"Illegal exp6/rx parameters.  Rm and Epsilon must be greater than zero.  Alpha cannot be negative.");
+    } else {
+      error->all(FLERR,"Illegal exp6/rx parameters.  Interaction potential does not exist.");
+    }
+    nparams++;
+  }
+
+  delete [] words;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Rebuilds mol2param: for every species index, stores the index of the single
+// params[] entry whose ispecies equals it, or -1 when there is none. A second
+// matching entry is reported as a duplicate via error->all. The table is then
+// synced to DeviceType and neighflag is taken from the Kokkos settings.
+template<class DeviceType>
+void PairExp6rxKokkos<DeviceType>::setup()
+{
+  // start from a freshly allocated table with one slot per species
+  memory->destroy_kokkos(k_mol2param,mol2param);
+  memory->create_kokkos(k_mol2param,mol2param,nspecies,"pair:mol2param");
+
+  for (int ispecies = 0; ispecies < nspecies; ++ispecies) {
+    int match = -1;
+    for (int iparam = 0; iparam < nparams; ++iparam) {
+      if (params[iparam].ispecies != ispecies) continue;
+      if (match >= 0) error->all(FLERR,"Potential file has duplicate entry");
+      match = iparam;
+    }
+    mol2param[ispecies] = match;
+  }
+
+  k_mol2param.template modify<LMPHostType>();
+  k_mol2param.template sync<DeviceType>();
+  d_mol2param = k_mol2param.template view<DeviceType>();
+
+  neighflag = lmp->kokkos->neighflag;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Computes, for particle `id`, the exp-6 parameters (epsilon, alpha, rm) and
+// the mixing weight of site 1 and site 2, for both the new species
+// concentrations (dvector rows 0..nspecies-1) and the old ones (dvector rows
+// nspecies..2*nspecies-1).
+//
+// A site that equals a species listed in d_params takes that species'
+// parameters directly. A site for which isOneFluidApprox() is true gets
+// parameters mixed over all exp6 species other than isite1/isite2, followed
+// by the optional exponent or polynomial scaling.
+//
+// The mixing weights are mole fractions when fractionalWeighting is set and
+// molecule counts otherwise. Negative values are clamped to zero.
+//
+// k_error_flag is set to 1 when the new or old total is below MY_EPSILON and
+// to 2 when a fraction or count is below -MY_EPSILON.
+//
+// The epsilon/alpha/rm output arguments of a site are left untouched when the
+// site neither matches a species nor is a one-fluid site; its mixing weights
+// are then zero.
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::getMixingWeights(int id,double &epsilon1,double &alpha1,double &rm1, double &mixWtSite1,double &epsilon2,double &alpha2,double &rm2,double &mixWtSite2,double &epsilon1_old,double &alpha1_old,double &rm1_old, double &mixWtSite1old,double &epsilon2_old,double &alpha2_old,double &rm2_old,double &mixWtSite2old) const
+{
+  int iparam, jparam;
+  double rmi, rmj, rmij, rm3ij;
+  double epsiloni, epsilonj, epsilonij;
+  double alphai, alphaj, alphaij;
+  double epsilon_old, rm3_old, alpha_old;
+  double epsilon, rm3, alpha;
+  double xMolei, xMolej, xMolei_old, xMolej_old;
+
+  double fractionOFAold, fractionOFA;
+  // The per-site fractions and counts are only assigned below when the site
+  // matches a species or is a one-fluid site, but they are always read by the
+  // range checks at the end, so they start from a defined value.
+  double fractionOld1 = 0.0, fraction1 = 0.0;
+  double fractionOld2 = 0.0, fraction2 = 0.0;
+  double nMoleculesOFAold, nMoleculesOFA;
+  double nMoleculesOld1 = 0.0, nMolecules1 = 0.0;
+  double nMoleculesOld2 = 0.0, nMolecules2 = 0.0;
+  double nTotal, nTotalold;
+
+  rm3 = 0.0;
+  epsilon = 0.0;
+  alpha = 0.0;
+  epsilon_old = 0.0;
+  rm3_old = 0.0;
+  alpha_old = 0.0;
+  fractionOFA = 0.0;
+  fractionOFAold = 0.0;
+  nMoleculesOFA = 0.0;
+  nMoleculesOFAold = 0.0;
+  nTotal = 0.0;
+  nTotalold = 0.0;
+
+  // Compute the total number of molecules in the old and new CG particle as well as the total number of molecules in the fluid portion of the old and new CG particle
+  for (int ispecies = 0; ispecies < nspecies; ispecies++){
+    nTotal += dvector(ispecies,id);
+    nTotalold += dvector(ispecies+nspecies,id);
+
+    iparam = d_mol2param[ispecies];
+
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      // species that are themselves site 1 or site 2 are not part of the fluid
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+      nMoleculesOFAold += dvector(ispecies+nspecies,id);
+      nMoleculesOFA += dvector(ispecies,id);
+    }
+  }
+  if(nTotal < MY_EPSILON || nTotalold < MY_EPSILON)
+    k_error_flag.template view<DeviceType>()() = 1;
+
+  // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
+  fractionOFAold = nMoleculesOFAold / nTotalold;
+  fractionOFA = nMoleculesOFA / nTotal;
+
+  for (int ispecies = 0; ispecies < nspecies; ispecies++) {
+    iparam = d_mol2param[ispecies];
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+
+    // If Site1 matches a pure species, then grab the parameters
+    if (isite1 == d_params[iparam].ispecies){
+      rm1_old = d_params[iparam].rm;
+      rm1 = d_params[iparam].rm;
+      epsilon1_old = d_params[iparam].epsilon;
+      epsilon1 = d_params[iparam].epsilon;
+      alpha1_old = d_params[iparam].alpha;
+      alpha1 = d_params[iparam].alpha;
+
+      // Compute the mole fraction of Site1
+      nMoleculesOld1 = dvector(ispecies+nspecies,id);
+      nMolecules1 = dvector(ispecies,id);
+      fractionOld1 = nMoleculesOld1/nTotalold;
+      fraction1 = nMolecules1/nTotal;
+    }
+
+    // If Site2 matches a pure species, then grab the parameters
+    if (isite2 == d_params[iparam].ispecies){
+      rm2_old = d_params[iparam].rm;
+      rm2 = d_params[iparam].rm;
+      epsilon2_old = d_params[iparam].epsilon;
+      epsilon2 = d_params[iparam].epsilon;
+      alpha2_old = d_params[iparam].alpha;
+      alpha2 = d_params[iparam].alpha;
+
+      // Compute the mole fraction of Site2
+      nMoleculesOld2 = dvector(ispecies+nspecies,id);
+      nMolecules2 = dvector(ispecies,id);
+      fractionOld2 = dvector(ispecies+nspecies,id)/nTotalold;
+      fraction2 = nMolecules2/nTotal;
+    }
+
+    // If Site1 or Site2 is a fluid, then compute the parameters
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+      rmi = d_params[iparam].rm;
+      epsiloni = d_params[iparam].epsilon;
+      alphai = d_params[iparam].alpha;
+      // mole fraction of species i within the fluid portion (0 if the fluid is empty)
+      if(nMoleculesOFA<MY_EPSILON) xMolei = 0.0;
+      else xMolei = dvector(ispecies,id)/nMoleculesOFA;
+      if(nMoleculesOFAold<MY_EPSILON) xMolei_old = 0.0;
+      else xMolei_old = dvector(ispecies+nspecies,id)/nMoleculesOFAold;
+
+      for (int jspecies = 0; jspecies < nspecies; jspecies++) {
+        jparam = d_mol2param[jspecies];
+        if (jparam < 0 || d_params[jparam].potentialType != exp6PotentialType ) continue;
+        if (isite1 == d_params[jparam].ispecies || isite2 == d_params[jparam].ispecies) continue;
+        rmj = d_params[jparam].rm;
+        epsilonj = d_params[jparam].epsilon;
+        alphaj = d_params[jparam].alpha;
+        if(nMoleculesOFA<MY_EPSILON) xMolej = 0.0;
+        else xMolej = dvector(jspecies,id)/nMoleculesOFA;
+        if(nMoleculesOFAold<MY_EPSILON) xMolej_old = 0.0;
+        else xMolej_old = dvector(jspecies+nspecies,id)/nMoleculesOFAold;
+
+        // pair (i,j) parameters: arithmetic mean for rm, geometric mean for epsilon and alpha
+        rmij = (rmi+rmj)/2.0;
+        rm3ij = rmij*rmij*rmij;
+        epsilonij = sqrt(epsiloni*epsilonj);
+        alphaij = sqrt(alphai*alphaj);
+
+        if(fractionOFAold > 0.0){
+          rm3_old += xMolei_old*xMolej_old*rm3ij;
+          epsilon_old += xMolei_old*xMolej_old*rm3ij*epsilonij;
+          alpha_old += xMolei_old*xMolej_old*rm3ij*epsilonij*alphaij;
+        }
+        if(fractionOFA > 0.0){
+          rm3 += xMolei*xMolej*rm3ij;
+          epsilon += xMolei*xMolej*rm3ij*epsilonij;
+          alpha += xMolei*xMolej*rm3ij*epsilonij*alphaij;
+        }
+      }
+    }
+  }
+
+  // Site1 is a one-fluid site: derive its parameters from the accumulated sums
+  if (isOneFluidApprox(isite1)){
+    rm1 = cbrt(rm3);
+    if(rm1 < MY_EPSILON) {
+      rm1 = 0.0;
+      epsilon1 = 0.0;
+      alpha1 = 0.0;
+    } else {
+      epsilon1 = epsilon / rm3;
+      alpha1 = alpha / epsilon1 / rm3;
+    }
+    nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+    fraction1 = fractionOFA;
+
+    rm1_old = cbrt(rm3_old);
+    if(rm1_old < MY_EPSILON) {
+      rm1_old = 0.0;
+      epsilon1_old = 0.0;
+      alpha1_old = 0.0;
+    } else {
+      epsilon1_old = epsilon_old / rm3_old;
+      alpha1_old = alpha_old / epsilon1_old / rm3_old;
+    }
+    nMoleculesOld1 = 1.0-(nTotalold-nMoleculesOFAold);
+    fractionOld1 = fractionOFAold;
+
+    if(scalingFlag == EXPONENT){
+      exponentScaling(nMoleculesOFA,epsilon1,rm1);
+      exponentScaling(nMoleculesOFAold,epsilon1_old,rm1_old);
+    } else if(scalingFlag == POLYNOMIAL){
+      polynomialScaling(nMoleculesOFA,alpha1,epsilon1,rm1);
+      polynomialScaling(nMoleculesOFAold,alpha1_old,epsilon1_old,rm1_old);
+    }
+  }
+
+  // Site2 is a one-fluid site: same treatment as for Site1
+  if (isOneFluidApprox(isite2)){
+    rm2 = cbrt(rm3);
+    if(rm2 < MY_EPSILON) {
+      rm2 = 0.0;
+      epsilon2 = 0.0;
+      alpha2 = 0.0;
+    } else {
+      epsilon2 = epsilon / rm3;
+      alpha2 = alpha / epsilon2 / rm3;
+    }
+    nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+    fraction2 = fractionOFA;
+
+    rm2_old = cbrt(rm3_old);
+    if(rm2_old < MY_EPSILON) {
+      rm2_old = 0.0;
+      epsilon2_old = 0.0;
+      alpha2_old = 0.0;
+    } else {
+      epsilon2_old = epsilon_old / rm3_old;
+      alpha2_old = alpha_old / epsilon2_old / rm3_old;
+    }
+    nMoleculesOld2 = 1.0-(nTotalold-nMoleculesOFAold);
+    fractionOld2 = fractionOFAold;
+
+    if(scalingFlag == EXPONENT){
+      exponentScaling(nMoleculesOFA,epsilon2,rm2);
+      exponentScaling(nMoleculesOFAold,epsilon2_old,rm2_old);
+    } else if(scalingFlag == POLYNOMIAL){
+      polynomialScaling(nMoleculesOFA,alpha2,epsilon2,rm2);
+      polynomialScaling(nMoleculesOFAold,alpha2_old,epsilon2_old,rm2_old);
+    }
+  }
+
+  // Check that no fractions are less than zero
+  // (values in [-MY_EPSILON, 0) are silently clamped; anything lower also raises error flag 2)
+  if(fraction1 < 0.0 || nMolecules1 < 0.0){
+    if(fraction1 < -MY_EPSILON || nMolecules1 < -MY_EPSILON){
+      k_error_flag.template view<DeviceType>()() = 2;
+    }
+    nMolecules1 = 0.0;
+    fraction1 = 0.0;
+  }
+  if(fraction2 < 0.0 || nMolecules2 < 0.0){
+    if(fraction2 < -MY_EPSILON || nMolecules2 < -MY_EPSILON){
+      k_error_flag.template view<DeviceType>()() = 2;
+    }
+    nMolecules2 = 0.0;
+    fraction2 = 0.0;
+  }
+  if(fractionOld1 < 0.0 || nMoleculesOld1 < 0.0){
+    if(fractionOld1 < -MY_EPSILON || nMoleculesOld1 < -MY_EPSILON){
+      k_error_flag.template view<DeviceType>()() = 2;
+    }
+    nMoleculesOld1 = 0.0;
+    fractionOld1 = 0.0;
+  }
+  if(fractionOld2 < 0.0 || nMoleculesOld2 < 0.0){
+    if(fractionOld2 < -MY_EPSILON || nMoleculesOld2 < -MY_EPSILON){
+      k_error_flag.template view<DeviceType>()() = 2;
+    }
+    nMoleculesOld2 = 0.0;
+    fractionOld2 = 0.0;
+  }
+
+  // Report either mole fractions or molecule counts as the mixing weights
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
+
+#ifdef _OPENMP
+// Computes the half-open sub-range [thread_begin, thread_end) of [begin, end)
+// that belongs to the calling OpenMP thread. The range is cut into blocks of
+// chunkSize elements (the last one may be partial); every thread receives
+// nBlocks/nThreads consecutive blocks and the lowest-numbered threads take one
+// extra block each until the remainder is used up. Both bounds are clamped to
+// `end`.
+void partition_range( const int begin, const int end, int &thread_begin, int &thread_end, const int chunkSize = 1)
+{
+   const int myThread = omp_get_thread_num();
+   const int numThreads = omp_get_num_threads();
+
+   const int numBlocks = ((end - begin) + (chunkSize - 1)) / chunkSize;
+   const int baseBlocks = numBlocks / numThreads;
+   const int extraBlocks = numBlocks - baseBlocks * numThreads;
+
+   // threads with id < extraBlocks own one more block than the others
+   const bool ownsExtra = (myThread < extraBlocks);
+   const int firstBlock = myThread * baseBlocks + (ownsExtra ? myThread : extraBlocks);
+   const int lastBlock = firstBlock + baseBlocks + (ownsExtra ? 1 : 0);
+
+   thread_begin = std::min(begin + firstBlock * chunkSize, end);
+   thread_end   = std::min(begin + lastBlock * chunkSize, end);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+#ifndef KOKKOS_HAVE_CUDA
+template<class DeviceType>
+  template<class ArrayT>
+void PairExp6rxKokkos<DeviceType>::getMixingWeightsVect(const int np_total, int errorFlag, 
+                          ArrayT &epsilon1, ArrayT &alpha1, ArrayT &rm1,  ArrayT &mixWtSite1, ArrayT &epsilon2, ArrayT &alpha2, ArrayT &rm2, ArrayT &mixWtSite2, ArrayT &epsilon1_old, ArrayT &alpha1_old, ArrayT &rm1_old,  ArrayT &mixWtSite1old, ArrayT &epsilon2_old, ArrayT &alpha2_old, ArrayT &rm2_old, ArrayT &mixWtSite2old) const
+{
+  ArrayT epsilon          = PairExp6ParamDataVect.epsilon         ;
+  ArrayT rm3              = PairExp6ParamDataVect.rm3             ;
+  ArrayT alpha            = PairExp6ParamDataVect.alpha           ;
+  ArrayT xMolei           = PairExp6ParamDataVect.xMolei          ;
+
+  ArrayT epsilon_old      = PairExp6ParamDataVect.epsilon_old     ;
+  ArrayT rm3_old          = PairExp6ParamDataVect.rm3_old         ;
+  ArrayT alpha_old        = PairExp6ParamDataVect.alpha_old       ;
+  ArrayT xMolei_old       = PairExp6ParamDataVect.xMolei_old      ;
+
+  ArrayT fractionOFA      = PairExp6ParamDataVect.fractionOFA     ;
+  ArrayT fraction1        = PairExp6ParamDataVect.fraction1       ;
+  ArrayT fraction2        = PairExp6ParamDataVect.fraction2       ;
+  ArrayT nMoleculesOFA    = PairExp6ParamDataVect.nMoleculesOFA   ;
+  ArrayT nMolecules1      = PairExp6ParamDataVect.nMolecules1     ;
+  ArrayT nMolecules2      = PairExp6ParamDataVect.nMolecules2     ;
+  ArrayT nTotal           = PairExp6ParamDataVect.nTotal          ;
+
+  ArrayT fractionOFAold   = PairExp6ParamDataVect.fractionOFAold  ;
+  ArrayT fractionOld1     = PairExp6ParamDataVect.fractionOld1    ;
+  ArrayT fractionOld2     = PairExp6ParamDataVect.fractionOld2    ;
+  ArrayT nMoleculesOFAold = PairExp6ParamDataVect.nMoleculesOFAold;
+  ArrayT nMoleculesOld1   = PairExp6ParamDataVect.nMoleculesOld1  ;
+  ArrayT nMoleculesOld2   = PairExp6ParamDataVect.nMoleculesOld2  ;
+  ArrayT nTotalold        = PairExp6ParamDataVect.nTotalold       ;
+
+  int errorFlag1 = 0, errorFlag2 = 0;
+
+#ifdef _OPENMP
+  #pragma omp parallel reduction(+: errorFlag1, errorFlag2)
+#endif
+  {
+    int idx_begin = 0, idx_end = np_total;
+#ifdef _OPENMP
+    partition_range( 0, np_total, idx_begin, idx_end, 16 );
+#endif
+
+  // Zero out all of the terms first.
+  #pragma ivdep
+  for (int id = idx_begin; id < idx_end; ++id)
+  {
+     rm3[id] = 0.0;
+     epsilon[id] = 0.0;
+     alpha[id] = 0.0;
+     epsilon_old[id] = 0.0;
+     rm3_old[id] = 0.0;
+     alpha_old[id] = 0.0;
+     fractionOFA[id] = 0.0;
+     fractionOFAold[id] = 0.0;
+     nMoleculesOFA[id] = 0.0;
+     nMoleculesOFAold[id] = 0.0;
+     nTotal[id] = 0.0;
+     nTotalold[id] = 0.0;
+  }
+
+  // Compute the total number of molecules in the old and new CG particle as well as the total number of molecules in the fluid portion of the old and new CG particle
+  for (int ispecies = 0; ispecies < nspecies; ispecies++)
+  {
+    #pragma ivdep
+    for (int id = idx_begin; id < idx_end; ++id)
+    {
+      nTotal[id] += dvector(ispecies,id);
+      nTotalold[id] += dvector(ispecies+nspecies,id);
+    }
+
+    const int iparam = d_mol2param[ispecies];
+
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        nMoleculesOFAold[id] += dvector(ispecies+nspecies,id);
+        nMoleculesOFA[id] += dvector(ispecies,id);
+      }
+    }
+  }
+
+  // Make a reduction.
+  #pragma omp simd reduction(+:errorFlag1)
+  for (int id = idx_begin; id < idx_end; ++id)
+  {
+    if ( nTotal[id] < MY_EPSILON || nTotalold[id] < MY_EPSILON )
+      errorFlag1 = 1;
+
+    // Compute the mole fraction of molecules within the fluid portion of the particle (One Fluid Approximation)
+    fractionOFAold[id] = nMoleculesOFAold[id] / nTotalold[id];
+    fractionOFA[id] = nMoleculesOFA[id] / nTotal[id];
+  }
+
+  for (int ispecies = 0; ispecies < nspecies; ispecies++) {
+    const int iparam = d_mol2param[ispecies];
+    if (iparam < 0 || d_params[iparam].potentialType != exp6PotentialType ) continue;
+
+    // If Site1 matches a pure species, then grab the parameters
+    if (isite1 == d_params[iparam].ispecies)
+    {
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        rm1_old[id] = d_params[iparam].rm;
+        rm1[id] = d_params[iparam].rm;
+        epsilon1_old[id] = d_params[iparam].epsilon;
+        epsilon1[id] = d_params[iparam].epsilon;
+        alpha1_old[id] = d_params[iparam].alpha;
+        alpha1[id] = d_params[iparam].alpha;
+
+        // Compute the mole fraction of Site1
+        nMoleculesOld1[id] = dvector(ispecies+nspecies,id);
+        nMolecules1[id] = dvector(ispecies,id);
+        fractionOld1[id] = nMoleculesOld1[id]/nTotalold[id];
+        fraction1[id] = nMolecules1[id]/nTotal[id];
+      }
+    }
+
+    // If Site2 matches a pure species, then grab the parameters
+    if (isite2 == d_params[iparam].ispecies)
+    {
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        rm2_old[id] = d_params[iparam].rm;
+        rm2[id] = d_params[iparam].rm;
+        epsilon2_old[id] = d_params[iparam].epsilon;
+        epsilon2[id] = d_params[iparam].epsilon;
+        alpha2_old[id] = d_params[iparam].alpha;
+        alpha2[id] = d_params[iparam].alpha;
+
+        // Compute the mole fraction of Site2
+        nMoleculesOld2[id] = dvector(ispecies+nspecies,id);
+        nMolecules2[id] = dvector(ispecies,id);
+        fractionOld2[id] = nMoleculesOld2[id]/nTotalold[id];
+        fraction2[id] = nMolecules2[id]/nTotal[id];
+      }
+    }
+
+    // If Site1 or Site2 matches is a fluid, then compute the paramters
+    if (isOneFluidApprox(isite1) || isOneFluidApprox(isite2)) {
+      if (isite1 == d_params[iparam].ispecies || isite2 == d_params[iparam].ispecies) continue;
+
+      const double rmi = d_params[iparam].rm;
+      const double epsiloni = d_params[iparam].epsilon;
+      const double alphai = d_params[iparam].alpha;
+
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        if(nMoleculesOFA[id]<MY_EPSILON) xMolei[id] = 0.0;
+        else xMolei[id] = dvector(ispecies,id)/nMoleculesOFA[id];
+        if(nMoleculesOFAold[id]<MY_EPSILON) xMolei_old[id] = 0.0;
+        else xMolei_old[id] = dvector(ispecies+nspecies,id)/nMoleculesOFAold[id];
+      }
+
+      for (int jspecies = 0; jspecies < nspecies; jspecies++) {
+        const int jparam = d_mol2param[jspecies];
+        if (jparam < 0 || d_params[jparam].potentialType != exp6PotentialType ) continue;
+        if (isite1 == d_params[jparam].ispecies || isite2 == d_params[jparam].ispecies) continue;
+
+        const double rmj = d_params[jparam].rm;
+        const double epsilonj = d_params[jparam].epsilon;
+        const double alphaj = d_params[jparam].alpha;
+
+        const double rmij = (rmi+rmj)/2.0;
+        const double rm3ij = rmij*rmij*rmij;
+        const double epsilonij = sqrt(epsiloni*epsilonj);
+        const double alphaij = sqrt(alphai*alphaj);
+
+        #pragma ivdep
+        for (int id = idx_begin; id < idx_end; ++id)
+        {
+          double xMolej, xMolej_old;
+          if(nMoleculesOFA[id]<MY_EPSILON) xMolej = 0.0;
+          else xMolej = dvector(jspecies,id)/nMoleculesOFA[id];
+          if(nMoleculesOFAold[id]<MY_EPSILON) xMolej_old = 0.0;
+          else xMolej_old = dvector(jspecies+nspecies,id)/nMoleculesOFAold[id];
+
+          if(fractionOFAold[id] > 0.0){
+            rm3_old[id] += xMolei_old[id]*xMolej_old*rm3ij;
+            epsilon_old[id] += xMolei_old[id]*xMolej_old*rm3ij*epsilonij;
+            alpha_old[id] += xMolei_old[id]*xMolej_old*rm3ij*epsilonij*alphaij;
+          }
+          if(fractionOFA[id] > 0.0){
+            rm3[id] += xMolei[id]*xMolej*rm3ij;
+            epsilon[id] += xMolei[id]*xMolej*rm3ij*epsilonij;
+            alpha[id] += xMolei[id]*xMolej*rm3ij*epsilonij*alphaij;
+          }
+        }
+      }
+    }
+  }
+
+  if (isOneFluidApprox(isite1))
+  {
+    #pragma ivdep
+    for (int id = idx_begin; id < idx_end; ++id)
+    {
+      rm1[id] = cbrt(rm3[id]);
+      if(rm1[id] < MY_EPSILON) {
+        rm1[id] = 0.0;
+        epsilon1[id] = 0.0;
+        alpha1[id] = 0.0;
+      } else {
+        epsilon1[id] = epsilon[id] / rm3[id];
+        alpha1[id] = alpha[id] / epsilon1[id] / rm3[id];
+      }
+      nMolecules1[id] = 1.0-(nTotal[id]-nMoleculesOFA[id]);
+      fraction1[id] = fractionOFA[id];
+
+      rm1_old[id] = cbrt(rm3_old[id]);
+      if(rm1_old[id] < MY_EPSILON) {
+        rm1_old[id] = 0.0;
+        epsilon1_old[id] = 0.0;
+        alpha1_old[id] = 0.0;
+      } else {
+        epsilon1_old[id] = epsilon_old[id] / rm3_old[id];
+        alpha1_old[id] = alpha_old[id] / epsilon1_old[id] / rm3_old[id];
+      }
+      nMoleculesOld1[id] = 1.0-(nTotalold[id]-nMoleculesOFAold[id]);
+      fractionOld1[id] = fractionOFAold[id];
+    }
+
+    if(scalingFlag == EXPONENT) {
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        exponentScaling(nMoleculesOFA[id],epsilon1[id],rm1[id]);
+        exponentScaling(nMoleculesOFAold[id],epsilon1_old[id],rm1_old[id]);
+      }
+    }
+    else if(scalingFlag == POLYNOMIAL){
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        polynomialScaling(nMoleculesOFA[id],alpha1[id],epsilon1[id],rm1[id]);
+        polynomialScaling(nMoleculesOFAold[id],alpha1_old[id],epsilon1_old[id],rm1_old[id]);
+      }
+    }
+  }
+
+  if (isOneFluidApprox(isite2))
+  {
+    #pragma ivdep
+    for (int id = idx_begin; id < idx_end; ++id)
+    {
+      rm2[id] = cbrt(rm3[id]);
+      if(rm2[id] < MY_EPSILON) {
+        rm2[id] = 0.0;
+        epsilon2[id] = 0.0;
+        alpha2[id] = 0.0;
+      } else {
+        epsilon2[id] = epsilon[id] / rm3[id];
+        alpha2[id] = alpha[id] / epsilon2[id] / rm3[id];
+      }
+      nMolecules2[id] = 1.0-(nTotal[id]-nMoleculesOFA[id]);
+      fraction2[id] = fractionOFA[id];
+
+      rm2_old[id] = cbrt(rm3_old[id]);
+      if(rm2_old[id] < MY_EPSILON) {
+        rm2_old[id] = 0.0;
+        epsilon2_old[id] = 0.0;
+        alpha2_old[id] = 0.0;
+      } else {
+        epsilon2_old[id] = epsilon_old[id] / rm3_old[id];
+        alpha2_old[id] = alpha_old[id] / epsilon2_old[id] / rm3_old[id];
+      }
+      nMoleculesOld2[id] = 1.0-(nTotalold[id]-nMoleculesOFAold[id]);
+      fractionOld2[id] = fractionOFAold[id];
+    }
+
+    if(scalingFlag == EXPONENT){
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        exponentScaling(nMoleculesOFA[id],epsilon2[id],rm2[id]);
+        exponentScaling(nMoleculesOFAold[id],epsilon2_old[id],rm2_old[id]);
+      }
+    }
+    else if(scalingFlag == POLYNOMIAL){
+      #pragma ivdep
+      for (int id = idx_begin; id < idx_end; ++id)
+      {
+        polynomialScaling(nMoleculesOFA[id],alpha2[id],epsilon2[id],rm2[id]);
+        polynomialScaling(nMoleculesOFAold[id],alpha2_old[id],epsilon2_old[id],rm2_old[id]);
+      }
+    }
+  }
+
+  // Check that no fractions are less than zero
+  #pragma omp simd reduction(+:errorFlag2)
+  for (int id = idx_begin; id < idx_end; ++id)
+  {
+    if(fraction1[id] < 0.0 || nMolecules1[id] < 0.0){
+      if(fraction1[id] < -MY_EPSILON || nMolecules1[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMolecules1[id] = 0.0;
+      fraction1[id] = 0.0;
+    }
+    if(fraction2[id] < 0.0 || nMolecules2[id] < 0.0){
+      if(fraction2[id] < -MY_EPSILON || nMolecules2[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMolecules2[id] = 0.0;
+      fraction2[id] = 0.0;
+    }
+    if(fractionOld1[id] < 0.0 || nMoleculesOld1[id] < 0.0){
+      if(fractionOld1[id] < -MY_EPSILON || nMoleculesOld1[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMoleculesOld1[id] = 0.0;
+      fractionOld1[id] = 0.0;
+    }
+    if(fractionOld2[id] < 0.0 || nMoleculesOld2[id] < 0.0){
+      if(fractionOld2[id] < -MY_EPSILON || nMoleculesOld2[id] < -MY_EPSILON){
+        errorFlag2 = 2;
+      }
+      nMoleculesOld2[id] = 0.0;
+      fractionOld2[id] = 0.0;
+    }
+
+    if(fractionalWeighting){
+      mixWtSite1old[id] = fractionOld1[id];
+      mixWtSite1[id] = fraction1[id];
+      mixWtSite2old[id] = fractionOld2[id];
+      mixWtSite2[id] = fraction2[id];
+    } else {
+      mixWtSite1old[id] = nMoleculesOld1[id];
+      mixWtSite1[id] = nMolecules1[id];
+      mixWtSite2old[id] = nMoleculesOld2[id];
+      mixWtSite2[id] = nMolecules2[id];
+    }
+  }
+
+  } // end parallel region
+
+  if (errorFlag1 > 0)
+    errorFlag = 1;
+
+  if (errorFlag2 > 0)
+    errorFlag = 2;
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::exponentScaling(double phi, double &epsilon, double &rm) const
+{
+  // Scale epsilon by phi^exponentEpsilon and rm by phi^exponentR, in place.
+  // For a negative exponent a divisor below MY_EPSILON zeroes the parameter.
+  if (exponentEpsilon < 0.0) {
+    const double divisorEps = pow(phi,-exponentEpsilon);
+    if (divisorEps < MY_EPSILON) epsilon = 0.0;
+    else epsilon *= 1.0/divisorEps;
+  } else {
+    epsilon *= pow(phi,exponentEpsilon);
+  }
+
+  if (exponentR < 0.0) {
+    const double divisorRm = pow(phi,-exponentR);
+    if (divisorRm < MY_EPSILON) rm = 0.0;
+    else rm *= 1.0/divisorRm;
+  } else {
+    rm *= pow(phi,exponentR);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::polynomialScaling(double phi, double &alpha, double &epsilon, double &rm) const
+{
+  // Evaluate three fifth-order polynomials in phi (coefficient [0] multiplies phi^5).
+  // alpha is overwritten by its polynomial; epsilon and rm are multiplied by theirs.
+  const double p2 = phi*phi, p3 = p2*phi, p4 = p2*p2, p5 = p2*p3;
+  const double *cA = s_coeffAlpha, *cE = s_coeffEps, *cR = s_coeffRm;
+
+  alpha = cA[0]*p5 + cA[1]*p4 + cA[2]*p3 + cA[3]*p2 + cA[4]*phi + cA[5];
+  epsilon *= cE[0]*p5 + cE[1]*p4 + cE[2]*p3 + cE[3]*p2 + cE[4]*phi + cE[5];
+  rm *= cR[0]*p5 + cR[1]*p4 + cR[2]*p3 + cR[3]*p2 + cR[4]*phi + cR[5];
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double PairExp6rxKokkos<DeviceType>::func_rin(const double &alpha) const
+{
+  // Returns expValue(offset + slope*sqrt(alpha)); the exponential is taken
+  // through expValue() rather than by calling exp() directly.
+
+  const double offset = 3.7682065;
+  const double slope = -1.4308614;
+
+  const double exponent = offset+slope*sqrt(alpha);
+
+  return expValue(exponent);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+double PairExp6rxKokkos<DeviceType>::expValue(double value) const
+{
+  // Guarded exponential: arguments below DBL_MIN_EXP yield exactly 0.0,
+  // every other argument is passed on to exp().
+  // NOTE(review): DBL_MIN_EXP is a binary-exponent limit from <float.h>;
+  // confirm it is the intended cutoff for the argument of exp().
+  return (value < DBL_MIN_EXP) ? 0.0 : exp(value);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairExp6rxKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  const int EFLAG = eflag;
+  const int VFLAG = vflag_either;
+  // Tallies one i-j pair: per-atom energy, global virial (into ev.v) and per-atom virial.
+  // The eatom and vatom arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>();
+  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>();
+
+  if (EFLAG) {  // only the per-atom energy is tallied here; no global energy term is written
+    if (eflag_atom) {
+      const E_FLOAT epairhalf = 0.5 * epair;
+      if (NEIGHFLAG!=FULL) {  // i and j each receive half, if NEWTON_PAIR or index < nlocal
+        if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf;
+        if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf;
+      } else {  // FULL list: only atom i receives its half
+        v_eatom[i] += epairhalf;
+      }
+    }
+  }
+
+  if (VFLAG) {
+    const E_FLOAT v0 = delx*delx*fpair;  // six components, in the order xx, yy, zz, xy, xz, yz
+    const E_FLOAT v1 = dely*dely*fpair;
+    const E_FLOAT v2 = delz*delz*fpair;
+    const E_FLOAT v3 = delx*dely*fpair;
+    const E_FLOAT v4 = delx*delz*fpair;
+    const E_FLOAT v5 = dely*delz*fpair;
+
+    if (vflag_global) {  // global virial: half of each component per tallied atom
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          ev.v[0] += 0.5*v0;
+          ev.v[1] += 0.5*v1;
+          ev.v[2] += 0.5*v2;
+          ev.v[3] += 0.5*v3;
+          ev.v[4] += 0.5*v4;
+          ev.v[5] += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+        }
+      } else {  // FULL list: a single half contribution
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+
+    if (vflag_atom) {  // per-atom virial: same splitting rule, written to v_vatom
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          v_vatom(i,0) += 0.5*v0;
+          v_vatom(i,1) += 0.5*v1;
+          v_vatom(i,2) += 0.5*v2;
+          v_vatom(i,3) += 0.5*v3;
+          v_vatom(i,4) += 0.5*v4;
+          v_vatom(i,5) += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        v_vatom(j,0) += 0.5*v0;
+        v_vatom(j,1) += 0.5*v1;
+        v_vatom(j,2) += 0.5*v2;
+        v_vatom(j,3) += 0.5*v3;
+        v_vatom(j,4) += 0.5*v4;
+        v_vatom(j,5) += 0.5*v5;
+        }
+      } else {  // FULL list: only row i is updated
+        v_vatom(i,0) += 0.5*v0;
+        v_vatom(i,1) += 0.5*v1;
+        v_vatom(i,2) += 0.5*v2;
+        v_vatom(i,3) += 0.5*v3;
+        v_vatom(i,4) += 0.5*v4;
+        v_vatom(i,5) += 0.5*v5;
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+int PairExp6rxKokkos<DeviceType>::sbmask(const int& j) const {
+  return (j >> SBBITS) & 3;  // keep the two bits that start at bit position SBBITS
+}
+
+namespace LAMMPS_NS {
+template class PairExp6rxKokkos<LMPDeviceType>;  // explicit instantiation for the device type
+#ifdef KOKKOS_HAVE_CUDA
+template class PairExp6rxKokkos<LMPHostType>;  // host-type instantiation, only if KOKKOS_HAVE_CUDA is defined
+#endif
+}
diff --git a/src/KOKKOS/pair_exp6_rx_kokkos.h b/src/KOKKOS/pair_exp6_rx_kokkos.h
new file mode 100644
index 0000000000..5e44048ae2
--- /dev/null
+++ b/src/KOKKOS/pair_exp6_rx_kokkos.h
@@ -0,0 +1,280 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(exp6/rx/kk,PairExp6rxKokkos<LMPDeviceType>)
+PairStyle(exp6/rx/kk/device,PairExp6rxKokkos<LMPDeviceType>)
+PairStyle(exp6/rx/kk/host,PairExp6rxKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_EXP6_RX_KOKKOS_H
+#define LMP_PAIR_EXP6_RX_KOKKOS_H
+
+#include "pair_exp6_rx.h"
+#include "kokkos_type.h"
+#include "pair_kokkos.h"
+
+namespace LAMMPS_NS {
+
+// Create a structure to hold the parameter data for all
+// local and neighbor particles. Pack inside this struct
+// to avoid any name clashes.
+
+template<class DeviceType>
+struct PairExp6ParamDataTypeKokkos
+{
+  typedef ArrayTypes<DeviceType> AT;
+
+   int n;  // NOTE(review): presumably the number of entries held by the views -- confirm
+   typename AT::t_float_1d epsilon1, alpha1, rm1, mixWtSite1,
+          epsilon2, alpha2, rm2, mixWtSite2,
+          epsilonOld1, alphaOld1, rmOld1, mixWtSite1old,
+          epsilonOld2, alphaOld2, rmOld2, mixWtSite2old;
+   // Default constructor -- nullify everything.  It is named without
+   // <DeviceType>: a template-id is not a valid constructor name in C++20.
+   PairExp6ParamDataTypeKokkos(void)
+      : n(0), epsilon1(NULL), alpha1(NULL), rm1(NULL), mixWtSite1(NULL),
+              epsilon2(NULL), alpha2(NULL), rm2(NULL), mixWtSite2(NULL),
+              epsilonOld1(NULL), alphaOld1(NULL), rmOld1(NULL), mixWtSite1old(NULL),
+              epsilonOld2(NULL), alphaOld2(NULL), rmOld2(NULL), mixWtSite2old(NULL)
+   {}
+};
+
+template<class DeviceType>
+struct PairExp6ParamDataTypeKokkosVect
+{
+  typedef ArrayTypes<DeviceType> AT;
+
+   typename AT::t_float_1d epsilon, rm3, alpha, xMolei, epsilon_old, rm3_old,
+                           alpha_old, xMolei_old, fractionOFA, fraction1,
+                           fraction2, nMoleculesOFA, nMolecules1, nMolecules2,
+                           nTotal, fractionOFAold, fractionOld1, fractionOld2,
+                           nMoleculesOFAold, nMoleculesOld1, nMoleculesOld2,
+                           nTotalold;
+   // Default constructor -- nullify everything.  It is named without
+   // <DeviceType>: a template-id is not a valid constructor name in C++20.
+   PairExp6ParamDataTypeKokkosVect(void)
+      : epsilon(NULL), rm3(NULL), alpha(NULL), xMolei(NULL), epsilon_old(NULL), rm3_old(NULL),
+        alpha_old(NULL), xMolei_old(NULL), fractionOFA(NULL), fraction1(NULL),
+        fraction2(NULL), nMoleculesOFA(NULL), nMolecules1(NULL), nMolecules2(NULL),
+        nTotal(NULL), fractionOFAold(NULL), fractionOld1(NULL), fractionOld2(NULL),
+        nMoleculesOFAold(NULL), nMoleculesOld1(NULL), nMoleculesOld2(NULL),
+        nTotalold(NULL)
+   {}
+};
+
+struct TagPairExp6rxZeroMixingWeights{};  // empty tag types: each one selects the matching
+struct TagPairExp6rxgetMixingWeights{};   // PairExp6rxKokkos::operator() overload
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairExp6rxCompute{};  // has overloads with and without an EV_FLOAT& argument
+
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+struct TagPairExp6rxComputeNoAtomics{};  // has overloads with and without an EV_FLOAT& argument
+
+struct TagPairExp6rxCollapseDupViews{};
+struct TagPairExp6rxZeroDupViews{};
+
+template<class DeviceType>
+class PairExp6rxKokkos : public PairExp6rx {  // PairExp6rx subclass templated on a Kokkos device type
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;  // same type as the EV_FLOAT& argument of the operator() overloads
+
+  PairExp6rxKokkos(class LAMMPS *);
+  virtual ~PairExp6rxKokkos();
+  void compute(int, int);
+  void coeff(int, char **);
+  void init_style();
+  // the operator() overloads below are distinguished by their leading tag argument
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxZeroMixingWeights, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxgetMixingWeights, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, bool Site1EqSite2, bool UseAtomics, bool OneType>
+  KOKKOS_INLINE_FUNCTION
+  void vectorized_operator(const int&, EV_FLOAT&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxComputeNoAtomics<NEIGHFLAG,NEWTON_PAIR,EVFLAG>, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxCollapseDupViews, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairExp6rxZeroDupViews, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                  const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int sbmask(const int& j) const;  // returns (j >> SBBITS) & 3
+
+ protected:
+  int eflag,vflag;
+  int nlocal,newton_pair,neighflag;
+  double special_lj[4];
+  int num_threads,ntypes;
+
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_efloat_1d uCG, uCGnew;
+  typename AT::t_float_2d dvector;  // indexed as dvector(species,id) by getMixingWeightsVect
+
+  typedef Kokkos::View<F_FLOAT**[3],Kokkos::LayoutRight,DeviceType> t_f_array_thread;
+  typedef Kokkos::View<E_FLOAT**,Kokkos::LayoutRight,DeviceType> t_efloat_1d_thread;
+  // NOTE(review): the *_thread views presumably hold one copy per thread -- confirm
+  t_f_array_thread t_f;
+  t_efloat_1d_thread t_uCG, t_uCGnew;
+
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  DAT::tdual_int_scalar k_error_flag;
+
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  PairExp6ParamDataTypeKokkos<DeviceType> PairExp6ParamData;
+  PairExp6ParamDataTypeKokkosVect<DeviceType> PairExp6ParamDataVect;
+
+  void allocate();
+  DAT::tdual_int_1d k_mol2param;               // mapping from molecule to parameters
+  typename AT::t_int_1d_randomread d_mol2param;  // random-read view type for the same mapping
+
+  typedef Kokkos::DualView<Param*,Kokkos::LayoutRight,DeviceType> tdual_param_1d;
+  typedef typename tdual_param_1d::t_dev_const_randomread t_param_1d_randomread;
+
+  tdual_param_1d k_params;                // dual view of Param entries
+  t_param_1d_randomread d_params;                // const random-read device view type of k_params
+
+  typename ArrayTypes<DeviceType>::tdual_ffloat_2d k_cutsq;
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  void read_file(char *);
+  void setup();
+
+  KOKKOS_INLINE_FUNCTION
+  void getMixingWeights(int, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &, double &) const;
+
+  template <class ArrayT>
+  void getMixingWeightsVect(const int, int, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &, ArrayT &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void exponentScaling(double, double &, double &) const;  // (phi, epsilon, rm)
+
+  KOKKOS_INLINE_FUNCTION
+  void polynomialScaling(double, double &, double &, double &) const;  // (phi, alpha, epsilon, rm)
+
+  double s_coeffAlpha[6],s_coeffEps[6],s_coeffRm[6];  // polynomial coefficients read by polynomialScaling()
+
+  KOKKOS_INLINE_FUNCTION
+  double func_rin(const double &) const;
+
+  KOKKOS_INLINE_FUNCTION
+  double expValue(const double) const;  // exp() that returns 0.0 for arguments below DBL_MIN_EXP
+
+  friend void pair_virial_fdotr_compute<PairExp6rxKokkos>(PairExp6rxKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E:  alpha_ij is 6.0 in pair exp6
+
+Self-explanatory
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: PairExp6rxKokkos requires a fix rx command
+
+The fix rx command must come before the pair style command in the input file
+
+E:  There are no rx species specified
+
+There must be at least one species specified through the fix rx command
+
+E:  Site1 name not recognized in pair coefficients
+
+The site1 keyword does not match the species keywords specified through the fix rx command
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E:  Cannot open exp6/rx potential file %s
+
+Self-explanatory
+
+E:  Incorrect format in exp6/rx potential file
+
+Self-explanatory
+
+E:  Illegal exp6/rx parameters.  Rm and Epsilon must be greater than zero.  Alpha cannot be negative.
+
+Self-explanatory
+
+E:  Illegal exp6/rx parameters.  Interaction potential does not exist.
+
+Self-explanatory
+
+E:  Potential file has duplicate entry.
+
+Self-explanatory
+
+E:  The number of molecules in CG particle is less than 10*DBL_EPSILON.
+
+Self-explanatory.  Check that the species concentrations have been properly
+set and check that the reaction kinetic solver parameters in fix rx provide
+sufficient accuracy.
+
+
+*/
diff --git a/src/KOKKOS/pair_hybrid_kokkos.cpp b/src/KOKKOS/pair_hybrid_kokkos.cpp
new file mode 100644
index 0000000000..337b56c6ce
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_kokkos.cpp
@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "pair_hybrid_kokkos.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "pair.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "update.h"
+#include "comm.h"
+#include "memory.h"
+#include "error.h"
+#include "respa.h"
+#include "atom_masks.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairHybridKokkos::PairHybridKokkos(LAMMPS *lmp) : PairHybrid(lmp)
+{
+  // both data masks start out empty
+  datamask_read = EMPTY_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  // prevent overlapping host/device computation, which isn't
+  // yet supported by pair_hybrid_kokkos
+  execution_space = Device;
+  atomKK = (AtomKokkos *) atom;
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairHybridKokkos::~PairHybridKokkos()
+{
+  // intentionally empty: no explicit cleanup is performed here
+}
+
+/* ----------------------------------------------------------------------
+  call each sub-style's compute() or compute_outer() function
+  accumulate sub-style global/peratom energy/virial in hybrid
+  for global vflag = 1:
+    each sub-style computes own virial[6]
+    sum sub-style virial[6] to hybrid's virial[6]
+  for global vflag = 2:
+    call sub-style with adjusted vflag to prevent it calling
+      virial_fdotr_compute()
+    hybrid calls virial_fdotr_compute() on final accumulated f
+------------------------------------------------------------------------- */
+
+void PairHybridKokkos::compute(int eflag, int vflag)
+{
+  // if no_virial_fdotr_compute is set and global component of
+  //   incoming vflag = 2, then
+  // reset vflag as if global component were 1
+  // necessary since one or more sub-styles cannot compute virial as F dot r
+
+  int neighflag = lmp->kokkos->neighflag;
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  if (no_virial_fdotr_compute && vflag % 4 == 2) vflag = 1 + vflag/4 * 4;
+
+  if (eflag || vflag) ev_setup(eflag,vflag);
+  else evflag = vflag_fdotr = eflag_global = vflag_global =
+         eflag_atom = vflag_atom = 0;
+
+  // check if global component of incoming vflag = 2
+  // if so, reset vflag passed to substyle as if it were 0
+  // necessary so substyle will not invoke virial_fdotr_compute()
+
+  int vflag_substyle;
+  if (vflag % 4 == 2) vflag_substyle = vflag/4 * 4;
+  else vflag_substyle = vflag;
+
+  double *saved_special = save_special();
+
+  // check if we are running with r-RESPA using the hybrid keyword
+
+  Respa *respa = NULL;
+  respaflag = 0;
+  if (strstr(update->integrate_style,"respa")) {
+    respa = (Respa *) update->integrate;
+    if (respa->nhybrid_styles > 0) respaflag = 1;
+  }
+
+  for (int m = 0; m < nstyles; m++) {
+    set_special(m);
+
+    if (!respaflag || respa->hybrid_compute[m]) {
+      // sub-style with compute flag turned off: undo set_special(m) so its
+      // special settings cannot leak into later sub-styles, then skip it
+      if (styles[m]->compute_flag == 0) {
+        restore_special(saved_special);
+        continue;
+      }
+
+      // call compute_outer() if outerflag is set and the sub-style enables it
+      atomKK->sync(styles[m]->execution_space,styles[m]->datamask_read);
+      if (outerflag && styles[m]->respa_enable)
+        styles[m]->compute_outer(eflag,vflag_substyle);
+      else styles[m]->compute(eflag,vflag_substyle);
+      atomKK->modified(styles[m]->execution_space,styles[m]->datamask_modify);
+    }
+
+    restore_special(saved_special);
+
+    // jump to next sub-style if r-RESPA does not want global accumulated data
+
+    if (respaflag && !respa->tally_global) continue;
+
+    if (eflag_global) {
+      eng_vdwl += styles[m]->eng_vdwl;
+      eng_coul += styles[m]->eng_coul;
+    }
+    if (vflag_global) {
+      for (int n = 0; n < 6; n++) virial[n] += styles[m]->virial[n];
+    }
+    if (eflag_atom) {
+      int n = atom->nlocal;
+      if (force->newton_pair) n += atom->nghost;
+      double *eatom_substyle = styles[m]->eatom;
+      for (int i = 0; i < n; i++) eatom[i] += eatom_substyle[i];
+    }
+    if (vflag_atom) {
+      int n = atom->nlocal;
+      if (force->newton_pair) n += atom->nghost;
+      double **vatom_substyle = styles[m]->vatom;
+      for (int i = 0; i < n; i++)
+        for (int j = 0; j < 6; j++)
+          vatom[i][j] += vatom_substyle[i][j];
+    }
+  }
+
+  delete [] saved_special;
+
+  // perform virial_fdotr on device
+
+  atomKK->sync(Device,X_MASK|F_MASK);
+  x = atomKK->k_x.view<LMPDeviceType>();
+  f = atomKK->k_f.view<LMPDeviceType>();
+
+  if (vflag_fdotr)
+    pair_virial_fdotr_compute(this);
+}
diff --git a/src/KOKKOS/pair_hybrid_kokkos.h b/src/KOKKOS/pair_hybrid_kokkos.h
new file mode 100644
index 0000000000..62d325925b
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_kokkos.h
@@ -0,0 +1,118 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(hybrid/kk,PairHybridKokkos)
+
+#else
+
+#ifndef LMP_PAIR_HYBRID_KOKKOS_H
+#define LMP_PAIR_HYBRID_KOKKOS_H
+
+#include <stdio.h>
+#include "pair_hybrid.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class PairHybridKokkos : public PairHybrid {  // PairHybrid subclass that overrides compute()
+  friend class FixGPU;
+  friend class FixIntel;
+  friend class FixOMP;
+  friend class Force;
+  friend class Respa;
+  friend class Info;
+ public:
+  typedef LMPDeviceType device_type;
+
+  PairHybridKokkos(class LAMMPS *);
+  virtual ~PairHybridKokkos();
+  void compute(int, int);  // runs each sub-style and accumulates its energy/virial
+
+ private:
+  DAT::t_x_array_randomread x;  // set in compute() from atomKK->k_x just before the fdotr virial call
+  DAT::t_f_array f;  // set in compute() from atomKK->k_f just before the fdotr virial call
+  friend void pair_virial_fdotr_compute<PairHybridKokkos>(PairHybridKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E: Pair style hybrid cannot have hybrid as an argument
+
+Self-explanatory.
+
+E: Pair style hybrid cannot have none as an argument
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair coeff for hybrid has invalid style
+
+Style in pair coeff must have been listed in pair_style command.
+
+E: Pair hybrid sub-style is not used
+
+No pair_coeff command used a sub-style specified in the pair_style
+command.
+
+E: Pair_modify special setting for pair hybrid incompatible with global special_bonds setting
+
+Cannot override a setting of 0.0 or 1.0 or change a setting between
+0.0 and 1.0.
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Invoked pair single on pair style none
+
+A command (e.g. a dump) attempted to invoke the single() function on a
+pair style none, which is illegal.  You are probably attempting to
+compute per-atom quantities with an undefined pair style.
+
+E: Pair hybrid sub-style does not support single call
+
+You are attempting to invoke a single() call on a pair style
+that doesn't support it.
+
+E: Pair hybrid single calls do not support per sub-style special bond values
+
+Self-explanatory.
+
+E: Unknown pair_modify hybrid sub-style
+
+The choice of sub-style is unknown.
+
+E: Coulomb cutoffs of pair hybrid sub-styles do not match
+
+If using a Kspace solver, all Coulomb cutoffs of long pair styles must
+be the same.
+
+*/
diff --git a/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
new file mode 100644
index 0000000000..aa5d895155
--- /dev/null
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.cpp
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include "pair_hybrid_overlay_kokkos.h"
+#include "atom.h"
+#include "force.h"
+#include "neighbor.h"
+#include "neigh_request.h"
+#include "error.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairHybridOverlayKokkos::PairHybridOverlayKokkos(LAMMPS *lmp) : PairHybridKokkos(lmp) {} // only forwards lmp to the base class
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairHybridOverlayKokkos::coeff(int narg, char **arg)
+{
+  if (narg < 3) error->all(FLERR,"Incorrect args for pair coefficients");
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  // 3rd arg = pair sub-style name
+  // 4th arg = pair sub-style index if name used multiple times
+  // allow for "none" as valid sub-style name
+
+  int multflag = 0;  // initialized so it is defined even if the loop body never runs
+  int m;
+
+  for (m = 0; m < nstyles; m++) {
+    multflag = 0;
+    if (strcmp(arg[2],keywords[m]) == 0) {
+      if (multiple[m]) {
+        multflag = 1;
+        // cast to unsigned char: isdigit() is undefined for negative char values
+        if (narg < 4 || !isdigit((unsigned char) arg[3][0]))
+          error->all(FLERR,"Incorrect args for pair coefficients");
+        int index = force->inumeric(FLERR,arg[3]);
+        if (index == multiple[m]) break;
+        else continue;
+      } else break;
+    }
+  }
+
+  int none = 0;
+  if (m == nstyles) {
+    if (strcmp(arg[2],"none") == 0) none = 1;
+    else error->all(FLERR,"Pair coeff for hybrid has invalid style");
+  }
+
+  // move 1st/2nd args to 2nd/3rd args
+  // if multflag: move 1st/2nd args to 3rd/4th args
+  // just copy ptrs, since arg[] points into original input line
+
+  arg[2+multflag] = arg[1];
+  arg[1+multflag] = arg[0];
+
+  // invoke sub-style coeff() starting with 1st remaining arg
+
+  if (!none) styles[m]->coeff(narg-1-multflag,&arg[1+multflag]);
+
+  // set setflag and which type pairs map to which sub-style
+  // if sub-style is none: set hybrid subflag, wipe out map
+  // else: set hybrid setflag & map only if substyle setflag is set
+  //       if sub-style is new for type pair, add as multiple mapping
+  //       if sub-style exists for type pair, don't add, just update coeffs
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      if (none) {
+        setflag[i][j] = 1;
+        nmap[i][j] = 0;
+        count++;
+      } else if (styles[m]->setflag[i][j]) {
+        int k;
+        for (k = 0; k < nmap[i][j]; k++)
+          if (map[i][j][k] == m) break;
+        if (k == nmap[i][j]) map[i][j][nmap[i][j]++] = m;
+        setflag[i][j] = 1;
+        count++;
+      }
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
diff --git a/src/USER-DPD/npair_halffull_newton_ssa.h b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
similarity index 56%
rename from src/USER-DPD/npair_halffull_newton_ssa.h
rename to src/KOKKOS/pair_hybrid_overlay_kokkos.h
index 03903815b1..6bec57c453 100644
--- a/src/USER-DPD/npair_halffull_newton_ssa.h
+++ b/src/KOKKOS/pair_hybrid_overlay_kokkos.h
@@ -11,27 +11,24 @@
    See the README file in the top-level LAMMPS directory.
 ------------------------------------------------------------------------- */
 
-#ifdef NPAIR_CLASS
+#ifdef PAIR_CLASS
 
-NPairStyle(halffull/newton/ssa,
-           NPairHalffullNewtonSSA,
-           NP_HALF_FULL | NP_NSQ | NP_BIN | NP_MULTI | NP_NEWTON |
-           NP_ORTHO | NP_TRI | NP_SSA)
+PairStyle(hybrid/overlay/kk,PairHybridOverlayKokkos)
 
 #else
 
-#ifndef LMP_NPAIR_HALFFULL_NEWTON_SSA_H
-#define LMP_NPAIR_HALFFULL_NEWTON_SSA_H
+#ifndef LMP_PAIR_HYBRID_OVERLAY_KOKKOS_H
+#define LMP_PAIR_HYBRID_OVERLAY_KOKKOS_H
 
-#include "npair.h"
+#include "pair_hybrid_kokkos.h"
 
 namespace LAMMPS_NS {
 
-class NPairHalffullNewtonSSA : public NPair {
+class PairHybridOverlayKokkos : public PairHybridKokkos {
  public:
-  NPairHalffullNewtonSSA(class LAMMPS *);
-  ~NPairHalffullNewtonSSA() {}
-  void build(class NeighList *);
+  PairHybridOverlayKokkos(class LAMMPS *);
+  virtual ~PairHybridOverlayKokkos() {}
+  void coeff(int, char **);
 };
 
 }
@@ -41,4 +38,12 @@ class NPairHalffullNewtonSSA : public NPair {
 
 /* ERROR/WARNING messages:
 
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair coeff for hybrid has invalid style
+
+Style in pair coeff must have been listed in pair_style command.
+
 */
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
new file mode 100644
index 0000000000..d9a4f1ab83
--- /dev/null
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.cpp
@@ -0,0 +1,998 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------------------------
+   Contributing authors:
+   Stan Moore (Sandia)
+
+   Please cite the related publications:
+   J.D. Moore, B.C. Barnes, S. Izvekov, M. Lisal, M.S. Sellers, D.E. Taylor & J.K. Brennan
+   "A coarse-grain force field for RDX: Density dependent and energy conserving"
+   The Journal of Chemical Physics, 2016, 144, 104501.
+------------------------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include "math_const.h"
+#include <stdlib.h>
+#include <string.h>
+#include "pair_multi_lucy_rx_kokkos.h"
+#include "atom_kokkos.h"
+#include "force.h"
+#include "comm.h"
+#include "neigh_list.h"
+#include "memory.h"
+#include "error.h"
+#include "citeme.h"
+#include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "neigh_request.h"
+
+using namespace LAMMPS_NS;
+
+enum{NONE,RLINEAR,RSQ};
+
+#define MAXLINE 1024
+// ten times DBL_EPSILON; the literal stands in when that macro is not defined
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+// sentinel site index (-1) and the predicate that tests a site index against it
+#define oneFluidParameter (-1)
+#define isOneFluid(_site) ( (_site) == oneFluidParameter )
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMultiLucyRXKokkos<DeviceType>::PairMultiLucyRXKokkos(LAMMPS *lmp) : PairMultiLucyRX(lmp)
+{
+  // Kokkos bookkeeping: atom container, execution space, empty data masks
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_modify = EMPTY_MASK;
+  datamask_read = EMPTY_MASK;
+  respa_enable = 0;
+
+  // table containers start empty; update_table makes compute() build them
+  d_table = new TableDevice();
+  h_table = new TableHost();
+  update_table = 1;
+  k_error_flag = DAT::tdual_int_scalar("pair:error_flag");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairMultiLucyRXKokkos<DeviceType>::~PairMultiLucyRXKokkos()
+{
+  // while copymode is set (compute() sets it around its kernels) nothing is released
+  if (!copymode) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->destroy_kokkos(k_cutsq,cutsq);
+
+    delete h_table;
+    delete d_table;
+    tabindex = NULL;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::init_style()
+{
+  // the parent class makes the neighbor request, so it is the newest one
+
+  PairMultiLucyRX::init_style();
+  const int irequest = neighbor->nrequest - 1;
+
+  // flag whether the list is wanted for the host or the device instantiation
+
+  const bool on_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  const bool on_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value;
+  neighbor->requests[irequest]->kokkos_host = on_host && !on_device;
+  neighbor->requests[irequest]->kokkos_device = on_device;
+
+  // translate the neighflag stored on lmp->kokkos into a full or half request
+
+  neighflag = lmp->kokkos->neighflag;
+  if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->half = 1;
+    neighbor->requests[irequest]->full = 0;
+  } else if (neighflag == FULL) {
+    neighbor->requests[irequest]->half = 0;
+    neighbor->requests[irequest]->full = 1;
+  } else error->all(FLERR,"Cannot use chosen neighbor list style with multi/lucy/rx/kk");
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  // copymode is held at 1 for the whole call and cleared at the end
+  copymode = 1;
+  if (update_table) create_kokkos_tables();
+
+  // pick the kernel instantiation matching the table style; other styles do nothing
+  switch (tabstyle) {
+    case LOOKUP: compute_style<LOOKUP>(eflag_in,vflag_in); break;
+    case LINEAR: compute_style<LINEAR>(eflag_in,vflag_in); break;
+    default: break;
+  }
+  copymode = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int TABSTYLE>
+void PairMultiLucyRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
+{
+  eflag = eflag_in;
+  vflag = vflag_in;
+  // host-side driver for one force evaluation; TABSTYLE selects the kernel variant
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
+  else evflag = vflag_fdotr = 0;
+
+  // reallocate per-atom arrays if necessary
+
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.template view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.template view<DeviceType>();
+  }
+  // DeviceType views of the per-atom data used by the kernels
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  rho = atomKK->k_rho.view<DeviceType>();
+  uCG = atomKK->k_uCG.view<DeviceType>();
+  uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  dvector = atomKK->k_dvector.view<DeviceType>();
+  // sync the listed atom data and cutsq to execution_space before any kernel runs
+  atomKK->sync(execution_space,X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK | DPDRHO_MASK | UCG_MASK | UCGNEW_MASK | DVECTOR_MASK);
+  k_cutsq.template sync<DeviceType>();
+
+  nlocal = atom->nlocal;
+  int nghost = atom->nghost;
+  int newton_pair = force->newton_pair;
+  // grow the mixing-weight buffers if needed, then fill them for owned + ghost atoms
+  {
+    const int ntotal = nlocal + nghost;
+    if (ntotal > d_mixWtSite1.dimension_0()) {
+      d_mixWtSite1old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1old",ntotal);
+      d_mixWtSite2old = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2old",ntotal);
+      d_mixWtSite1 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite1",ntotal);
+      d_mixWtSite2 = typename AT::t_float_1d("PairMultiLucyRX::mixWtSite2",ntotal);
+    }
+
+    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXgetMixingWeights>(0,ntotal),*this);
+  }
+
+  const int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+  // rho is recomputed here, before the force kernel reads it
+  computeLocalDensity();
+
+  // loop over neighbors of my atoms
+
+  EV_FLOAT ev;
+  // one instantiation per (neighflag, newton_pair, evflag); a reduction into ev only when evflag is set
+  if (neighflag == HALF) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,1,0,TABSTYLE> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,0,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALF,0,0,TABSTYLE> >(0,inum),*this);
+    }
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,1,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,1,0,TABSTYLE> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,0,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<HALFTHREAD,0,0,TABSTYLE> >(0,inum),*this);
+    }
+  } else if (neighflag == FULL) {
+    if (newton_pair) {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,1,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,1,0,TABSTYLE> >(0,inum),*this);
+    } else {
+      if (evflag) Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,0,1,TABSTYLE> >(0,inum),*this,ev);
+      else Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXCompute<FULL,0,0,TABSTYLE> >(0,inum),*this);
+    }
+  }
+  // mark the atom data written on execution_space as modified
+  if (evflag) atomKK->modified(execution_space,F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK);
+  else atomKK->modified(execution_space,F_MASK | UCG_MASK | UCGNEW_MASK);
+  // error codes written by the kernel are copied to the host and reported here
+  k_error_flag.template modify<DeviceType>();
+  k_error_flag.template sync<LMPHostType>();
+  if (k_error_flag.h_view() == 1)
+    error->one(FLERR,"Density < table inner cutoff");
+  else if (k_error_flag.h_view() == 2)
+    error->one(FLERR,"Density > table outer cutoff");
+  else if (k_error_flag.h_view() == 3)
+    error->one(FLERR,"Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx");
+  // add the reduced energy and virial to the global accumulators
+  if (eflag_global) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+  // per-atom energy/virial written through the DeviceType views are synced to the host
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION // kernel body: stores the four mixing weights of atom i in the d_mixWtSite* buffers
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXgetMixingWeights, const int &i) const {
+  getMixingWeights(i, d_mixWtSite1old[i], d_mixWtSite2old[i], d_mixWtSite1[i], d_mixWtSite2[i]);
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int &ii, EV_FLOAT& ev) const {
+  // Pair-force and self-energy kernel for atom d_ilist[ii]; an index that trips an error flag is kept inside the tables.
+  // The f array is atomic for Half/Thread neighbor style
+  Kokkos::View<F_FLOAT*[3], typename DAT::t_f_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_f = f;
+
+  int i,jj,jnum,itype,jtype,itable;
+  double xtmp,ytmp,ztmp,delx,dely,delz,evdwl,evdwlOld,fpair;
+  double rsq;
+
+  double mixWtSite1old_i,mixWtSite1old_j;
+  double mixWtSite2old_i,mixWtSite2old_j;
+  double mixWtSite1_i;
+
+  double pi = MathConst::MY_PI;
+  double A_i, A_j;
+  double fraction_i,fraction_j;
+  int jtable;
+
+  int tlm1 = tablength - 1;
+
+  i = d_ilist[ii];
+  xtmp = x(i,0);
+  ytmp = x(i,1);
+  ztmp = x(i,2);
+  itype = type[i];
+  jnum = d_numneigh[i];
+
+  double fx_i = 0.0;
+  double fy_i = 0.0;
+  double fz_i = 0.0;
+
+  mixWtSite1old_i = d_mixWtSite1old[i];
+  mixWtSite2old_i = d_mixWtSite2old[i];
+  mixWtSite1_i = d_mixWtSite1[i];
+
+  for (jj = 0; jj < jnum; jj++) {
+    int j = d_neighbors(i,jj);
+    j &= NEIGHMASK;
+
+    delx = xtmp - x(j,0);
+    dely = ytmp - x(j,1);
+    delz = ztmp - x(j,2);
+    rsq = delx*delx + dely*dely + delz*delz;
+    jtype = type[j];
+
+    if (rsq < d_cutsq(itype,jtype)) { // optimize
+      fpair = 0.0;
+
+      mixWtSite1old_j = d_mixWtSite1old[j];
+      mixWtSite2old_j = d_mixWtSite2old[j];
+
+      //tb = &tables[tabindex[itype][jtype]];
+      const int tidx = d_table_const.tabindex(itype,jtype);
+
+      //if (rho[i]*rho[i] < tb->innersq || rho[j]*rho[j] < tb->innersq){
+      if (rho[i]*rho[i] < d_table_const.innersq(tidx) || rho[j]*rho[j] < d_table_const.innersq(tidx)){
+        k_error_flag.template view<DeviceType>()() = 1;
+      }
+
+      if (TABSTYLE == LOOKUP) {
+        //itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
+        itable = static_cast<int> (((rho[i]*rho[i]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+        //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+        jtable = static_cast<int> (((rho[j]*rho[j]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+        if (itable >= tlm1 || jtable >= tlm1){
+          k_error_flag.template view<DeviceType>()() = 2;
+        }
+        if (itable < 0) itable = 0; else if (itable >= tlm1) itable = tlm1-1; // flagged indices: keep the reads in bounds
+        if (jtable < 0) jtable = 0; else if (jtable >= tlm1) jtable = tlm1-1;
+        A_i = d_table_const.f(tidx,itable);
+        A_j = d_table_const.f(tidx,jtable);
+
+        const double rfactor = 1.0-sqrt(rsq/d_cutsq(itype,jtype));
+        fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
+        fpair /= sqrt(rsq);
+
+      } else if (TABSTYLE == LINEAR) {
+
+        //itable = static_cast<int> ((rho[i]*rho[i] - tb->innersq) * tb->invdelta);
+        itable = static_cast<int> ((rho[i]*rho[i] - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+        //jtable = static_cast<int> (((rho[j]*rho[j]) - tb->innersq) * tb->invdelta);
+        jtable = static_cast<int> ((rho[j]*rho[j] - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+        if (itable >= tlm1 || jtable >= tlm1){
+          k_error_flag.template view<DeviceType>()() = 2;
+        }
+        if(itable<0) itable=0;
+        if(itable>=tlm1) itable=tlm1;
+        if(jtable<0) jtable=0;
+        if(jtable>=tlm1)jtable=tlm1;
+
+        //fraction_i = (((rho[i]*rho[i]) - tb->rsq[itable]) * tb->invdelta);
+        fraction_i = (((rho[i]*rho[i]) - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx));
+        //fraction_j = (((rho[j]*rho[j]) - tb->rsq[jtable]) * tb->invdelta);
+        fraction_j = (((rho[j]*rho[j]) - d_table_const.rsq(tidx,jtable)) * d_table_const.invdelta(tidx));
+        if(itable==0) fraction_i=0.0;
+        if(itable==tlm1) fraction_i=0.0;
+        if(jtable==0) fraction_j=0.0;
+        if(jtable==tlm1) fraction_j=0.0;
+
+        // df holds only tlm1 entries per table, so the clamped index tlm1 must not be used to read it
+        A_i = d_table_const.f(tidx,itable) + ((itable < tlm1) ? fraction_i*d_table_const.df(tidx,itable) : 0.0);
+        //A_j = tb->f[jtable] + fraction_j*tb->df[jtable];
+        A_j = d_table_const.f(tidx,jtable) + ((jtable < tlm1) ? fraction_j*d_table_const.df(tidx,jtable) : 0.0);
+
+        const double rfactor = 1.0-sqrt(rsq/d_cutsq(itype,jtype));
+        fpair = 0.5*(A_i + A_j)*(4.0-3.0*rfactor)*rfactor*rfactor*rfactor;
+        fpair /= sqrt(rsq);
+
+      } else k_error_flag.template view<DeviceType>()() = 3;
+
+      if (isite1 == isite2) fpair = sqrt(mixWtSite1old_i*mixWtSite2old_j)*fpair;
+      else fpair = (sqrt(mixWtSite1old_i*mixWtSite2old_j) + sqrt(mixWtSite2old_i*mixWtSite1old_j))*fpair;
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal)) {
+        a_f(j,0) -= delx*fpair;
+        a_f(j,1) -= dely*fpair;
+        a_f(j,2) -= delz*fpair;
+      }
+      //if (evflag) ev_tally(i,j,nlocal,newton_pair,0.0,0.0,fpair,delx,dely,delz);
+      if (EVFLAG) this->template ev_tally<NEIGHFLAG,NEWTON_PAIR>(ev,i,j,0.0,fpair,delx,dely,delz);
+    }
+  }
+
+  a_f(i,0) += fx_i;
+  a_f(i,1) += fy_i;
+  a_f(i,2) += fz_i;
+
+  //tb = &tables[tabindex[itype][itype]];
+  const int tidx = d_table_const.tabindex(itype,itype);
+  //itable = static_cast<int> (((rho[i]*rho[i]) - tb->innersq) * tb->invdelta);
+  itable = static_cast<int> (((rho[i]*rho[i]) - d_table_const.innersq(tidx)) * d_table_const.invdelta(tidx));
+  // NOTE(review): itable is not range-checked here for LOOKUP, nor for negative values in either style - confirm
+  if (TABSTYLE == LOOKUP) {
+    evdwl = d_table_const.e(tidx,itable);
+  } else if (TABSTYLE == LINEAR) {
+    if (itable >= tlm1){
+      k_error_flag.template view<DeviceType>()() = 2;
+      itable = tlm1-1; // the host aborts on this flag; the clamp only keeps the reads below in bounds
+    }
+    if(itable==0) fraction_i=0.0;
+    else fraction_i = (((rho[i]*rho[i]) - d_table_const.rsq(tidx,itable)) * d_table_const.invdelta(tidx));
+    //evdwl = tb->e[itable] + fraction_i*tb->de[itable];
+    evdwl = d_table_const.e(tidx,itable) + fraction_i*d_table_const.de(tidx,itable);
+  } else k_error_flag.template view<DeviceType>()() = 3;
+
+  evdwl *=(pi*d_cutsq(itype,itype)*d_cutsq(itype,itype))/84.0;
+  evdwlOld = mixWtSite1old_i*evdwl;
+  evdwl = mixWtSite1_i*evdwl;
+
+  uCG[i] += evdwlOld;
+  uCGnew[i] += evdwl;
+
+  evdwl = evdwlOld;
+
+  //if (evflag) ev_tally(0,0,nlocal,newton_pair,evdwl,0.0,0.0,0.0,0.0,0.0);
+  if (EVFLAG)
+    ev.evdwl += ((/*FIXME??? (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && */ NEWTON_PAIR)?1.0:0.5)*evdwl;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int &ii) const {
+  EV_FLOAT ev; // scratch accumulator, discarded: this overload forwards to the three-argument kernel
+  this->template operator()<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>(), ii, ev);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::computeLocalDensity()
+{
+  x = atomKK->k_x.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  rho = atomKK->k_rho.view<DeviceType>();
+  h_rho = atomKK->k_rho.h_view;
+  nlocal = atom->nlocal;
+  // zeroes rho, accumulates it per atom from the neighbor list, then runs the pair communication
+  atomKK->sync(execution_space,X_MASK | TYPE_MASK | DPDRHO_MASK);
+
+  const int inum = list->inum;
+  NeighListKokkos<DeviceType>* k_list = static_cast<NeighListKokkos<DeviceType>*>(list);
+  d_numneigh = k_list->d_numneigh;
+  d_neighbors = k_list->d_neighbors;
+  d_ilist = k_list->d_ilist;
+
+  const double pi = MathConst::MY_PI;
+
+  const bool newton_pair = force->newton_pair;
+  const bool one_type = (atom->ntypes == 1);
+
+  // Special cut-off values for when there's only one type.
+  cutsq_type11 = cutsq[1][1];
+  rcut_type11 = sqrt(cutsq_type11);
+  factor_type11 = 84.0/(5.0*pi*rcut_type11*rcut_type11*rcut_type11);
+
+  // zero out density
+  int m = nlocal;
+  if (newton_pair) m += atom->nghost;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXZero>(0,m),*this);
+
+  // rho = density at each atom
+  // loop over neighbors of my atoms
+  // one kernel instantiation per (neighflag, newton_pair, one_type) combination
+  if (neighflag == HALF) {
+    if (newton_pair)
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,1,false> >(0,inum),*this);
+    else
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALF,0,false> >(0,inum),*this);
+  } else if (neighflag == HALFTHREAD) {
+    if (newton_pair)
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,1,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,1,false> >(0,inum),*this);
+    else
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<HALFTHREAD,0,false> >(0,inum),*this);
+  } else if (neighflag == FULL) {
+    if (newton_pair)
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,1,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,1,false> >(0,inum),*this);
+    else
+      if (one_type)
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,0,true> >(0,inum),*this);
+      else
+        Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPairMultiLucyRXComputeLocalDensity<FULL,0,false> >(0,inum),*this);
+  }
+  // rho was written on execution_space by the kernels above
+  atomKK->modified(execution_space,DPDRHO_MASK);
+
+  // communicate and sum densities (on the host)
+
+  if (newton_pair)
+    comm->reverse_comm_pair(this);
+
+  comm->forward_comm_pair(this);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION // kernel body: resets the density of atom i
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXZero, const int &i) const {
+  rho[i] = 0.0;
+}
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR, bool ONE_TYPE>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR,ONE_TYPE>, const int &ii) const {
+  // Density kernel for atom d_ilist[ii]: sums a weight per in-range neighbor into rho[i] and, for half lists, into rho[j].
+
+  // The rho array is atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > a_rho = rho;
+
+  const int i = d_ilist[ii];
+
+  const double xtmp = x(i,0);
+  const double ytmp = x(i,1);
+  const double ztmp = x(i,2);
+
+  double rho_i_contrib = 0.0;
+
+  const int itype = type[i];
+  const int jnum = d_numneigh[i];
+
+  const double pi = MathConst::MY_PI;
+
+  for (int jj = 0; jj < jnum; jj++){
+    const int j = (d_neighbors(i,jj) & NEIGHMASK);
+    const int jtype = type[j];
+
+    const double delx = xtmp - x(j,0);
+    const double dely = ytmp - x(j,1);
+    const double delz = ztmp - x(j,2);
+    const double rsq = delx*delx + dely*dely + delz*delz;
+
+    if (ONE_TYPE) { // single atom type: use the cutoff constants cached by computeLocalDensity()
+      if (rsq < cutsq_type11) {
+        const double rcut = rcut_type11;
+        const double r_over_rcut = sqrt(rsq) / rcut;
+        const double tmpFactor = 1.0 - r_over_rcut;
+        const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
+        const double factor = factor_type11*(1.0 + 1.5*r_over_rcut)*tmpFactor4;
+        rho_i_contrib += factor;
+        if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+          a_rho[j] += factor;
+      }
+    } else if (rsq < d_cutsq(itype,jtype)) {
+      const double rcut = sqrt(d_cutsq(itype,jtype));
+      const double tmpFactor = 1.0-sqrt(rsq)/rcut;
+      const double tmpFactor4 = tmpFactor*tmpFactor*tmpFactor*tmpFactor;
+      const double factor = (84.0/(5.0*pi*rcut*rcut*rcut))*(1.0+3.0*sqrt(rsq)/(2.0*rcut))*tmpFactor4;
+      rho_i_contrib += factor;
+      if ((NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) && (NEWTON_PAIR || j < nlocal))
+        a_rho[j] += factor;
+    }
+  }
+  // atom i's own sum is added once, after the neighbor loop
+  a_rho[i] += rho_i_contrib;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::getMixingWeights(int id, double &mixWtSite1old, double &mixWtSite2old, double &mixWtSite1, double &mixWtSite2) const
+{
+  // Mixing weights of particle id for the two sites isite1 and isite2.
+  // Rows [0,nspecies) of dvector feed the plain outputs, rows
+  // [nspecies,2*nspecies) feed the "old" outputs.  A site flagged as
+  // one-fluid is built from every species other than isite1 and isite2.
+
+  const bool site1IsOneFluid = isOneFluid(isite1);
+  const bool site2IsOneFluid = isOneFluid(isite2);
+
+  double fractionOld1, fraction1, nMoleculesOld1, nMolecules1;
+  double fractionOld2, fraction2, nMoleculesOld2, nMolecules2;
+
+  // totals over all species
+
+  double nTotal = 0.0;
+  double nTotalOld = 0.0;
+  for (int s = 0; s < nspecies; s++){
+    nTotalOld += dvector(s+nspecies,id);
+    nTotal += dvector(s,id);
+  }
+
+  // a site tied to one species reads that species' rows directly
+
+  if (!site1IsOneFluid){
+    nMolecules1 = dvector(isite1,id);
+    nMoleculesOld1 = dvector(isite1+nspecies,id);
+    fraction1 = nMolecules1/nTotal;
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+  }
+  if (!site2IsOneFluid){
+    nMolecules2 = dvector(isite2,id);
+    nMoleculesOld2 = dvector(isite2+nspecies,id);
+    fraction2 = nMolecules2/nTotal;
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+  }
+
+  // one-fluid sites: accumulate the remaining species
+
+  if (site1IsOneFluid || site2IsOneFluid){
+    double nMoleculesOFA = 0.0, nMoleculesOFAold = 0.0;
+    double fractionOFA = 0.0, fractionOFAold = 0.0;
+
+    for (int s = 0; s < nspecies; s++){
+      if (s == isite1 || s == isite2) continue;
+      nMoleculesOFAold += dvector(s+nspecies,id);
+      nMoleculesOFA += dvector(s,id);
+      fractionOFAold += dvector(s+nspecies,id) / nTotalOld;
+      fractionOFA += dvector(s,id) / nTotal;
+    }
+    if (site1IsOneFluid){
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+      fraction1 = fractionOFA;
+      fractionOld1 = fractionOFAold;
+    }
+    if (site2IsOneFluid){
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      fraction2 = fractionOFA;
+      fractionOld2 = fractionOFAold;
+    }
+  }
+
+  // hand back fractions or molecule counts, as selected by fractionalWeighting
+  mixWtSite1old = fractionalWeighting ? fractionOld1 : nMoleculesOld1;
+  mixWtSite1 = fractionalWeighting ? fraction1 : nMolecules1;
+  mixWtSite2old = fractionalWeighting ? fractionOld2 : nMoleculesOld2;
+  mixWtSite2 = fractionalWeighting ? fraction2 : nMolecules2;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf,
+                               int pbc_flag, int *pbc)
+{
+  atomKK->sync(execution_space,DPDRHO_MASK);
+  // NOTE(review): the views are taken for DeviceType but the kernel is launched on LMPDeviceType - confirm this is intended
+  d_sendlist = k_sendlist.view<DeviceType>();
+  iswap = iswap_in;
+  v_buf = buf.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagPairMultiLucyRXPackForwardComm>(0,n),*this);
+  return n; // one buffer entry per packed atom; pbc_flag and pbc are not used
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION // kernel body: copies rho of the i-th atom in row iswap of the send list into the buffer
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXPackForwardComm, const int &i) const {
+  int j = d_sendlist(iswap, i);
+  v_buf[i] = rho[j];
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf)
+{
+  first = first_in;
+  v_buf = buf.view<DeviceType>();
+  Kokkos::parallel_for(Kokkos::RangePolicy<LMPDeviceType, TagPairMultiLucyRXUnpackForwardComm>(0,n),*this);
+  // the kernel wrote rho[first .. first+n-1]; mark rho as modified on execution_space
+  atomKK->modified(execution_space,DPDRHO_MASK);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION // kernel body: stores buffer entry i as the density of atom first+i
+void PairMultiLucyRXKokkos<DeviceType>::operator()(TagPairMultiLucyRXUnpackForwardComm, const int &i) const {
+  rho[i + first] = v_buf[i];
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMultiLucyRXKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc)
+{
+  // host-side packing: sync rho to the host before reading h_rho
+  atomKK->sync(Host,DPDRHO_MASK);
+
+  // one density value per listed atom, in list order
+  int count = 0;
+  while (count < n) {
+    buf[count] = h_rho[list[count]];
+    count++;
+  }
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
+{
+  // received densities overwrite the n consecutive atoms starting at first
+
+  for (int k = 0; k < n; k++)
+    h_rho[first+k] = buf[k];
+
+  // record that the host copy of rho was written
+  atomKK->modified(Host,DPDRHO_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+int PairMultiLucyRXKokkos<DeviceType>::pack_reverse_comm(int n, int first, double *buf)
+{
+  // host-side packing: sync rho to the host before reading h_rho
+  atomKK->sync(Host,DPDRHO_MASK);
+
+  // copy the n consecutive densities starting at index first
+  int m;
+  for (m = 0; m < n; m++)
+    buf[m] = h_rho[first+m];
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::unpack_reverse_comm(int n, int *list, double *buf)
+{
+  // buffered values are added onto the listed atoms, in list order
+
+  for (int k = 0; k < n; k++) {
+    const int atom_idx = list[k];
+    h_rho[atom_idx] += buf[k];
+  }
+
+  // record that the host copy of rho was written
+  atomKK->modified(Host,DPDRHO_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+template<int NEIGHFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void PairMultiLucyRXKokkos<DeviceType>::ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                const F_FLOAT &dely, const F_FLOAT &delz) const
+{
+  const int EFLAG = eflag;
+  const int VFLAG = vflag_either;
+  // Tallies the i-j pair into per-atom energy, per-atom virial and the reduced virial ev.v; ev.evdwl is not touched here.
+  // The eatom and vatom arrays are atomic for Half/Thread neighbor style
+  Kokkos::View<E_FLOAT*, typename DAT::t_efloat_1d::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom = k_eatom.view<DeviceType>();
+  Kokkos::View<F_FLOAT*[6], typename DAT::t_virial_array::array_layout,DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom = k_vatom.view<DeviceType>();
+
+  if (EFLAG) {
+    if (eflag_atom) {
+      const E_FLOAT epairhalf = 0.5 * epair;
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) v_eatom[i] += epairhalf;
+        if (NEWTON_PAIR || j < nlocal) v_eatom[j] += epairhalf;
+      } else {
+        v_eatom[i] += epairhalf;
+      }
+    }
+  }
+
+  if (VFLAG) {
+    const E_FLOAT v0 = delx*delx*fpair;
+    const E_FLOAT v1 = dely*dely*fpair;
+    const E_FLOAT v2 = delz*delz*fpair;
+    const E_FLOAT v3 = delx*dely*fpair;
+    const E_FLOAT v4 = delx*delz*fpair;
+    const E_FLOAT v5 = dely*delz*fpair;
+    // non-full lists add one half per atom passing the NEWTON_PAIR / nlocal test; a full list adds only i's half
+    if (vflag_global) {
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          ev.v[0] += 0.5*v0;
+          ev.v[1] += 0.5*v1;
+          ev.v[2] += 0.5*v2;
+          ev.v[3] += 0.5*v3;
+          ev.v[4] += 0.5*v4;
+          ev.v[5] += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+        }
+      } else {
+        ev.v[0] += 0.5*v0;
+        ev.v[1] += 0.5*v1;
+        ev.v[2] += 0.5*v2;
+        ev.v[3] += 0.5*v3;
+        ev.v[4] += 0.5*v4;
+        ev.v[5] += 0.5*v5;
+      }
+    }
+
+    if (vflag_atom) {
+      if (NEIGHFLAG!=FULL) {
+        if (NEWTON_PAIR || i < nlocal) {
+          v_vatom(i,0) += 0.5*v0;
+          v_vatom(i,1) += 0.5*v1;
+          v_vatom(i,2) += 0.5*v2;
+          v_vatom(i,3) += 0.5*v3;
+          v_vatom(i,4) += 0.5*v4;
+          v_vatom(i,5) += 0.5*v5;
+        }
+        if (NEWTON_PAIR || j < nlocal) {
+        v_vatom(j,0) += 0.5*v0;
+        v_vatom(j,1) += 0.5*v1;
+        v_vatom(j,2) += 0.5*v2;
+        v_vatom(j,3) += 0.5*v3;
+        v_vatom(j,4) += 0.5*v4;
+        v_vatom(j,5) += 0.5*v5;
+        }
+      } else {
+        v_vatom(i,0) += 0.5*v0;
+        v_vatom(i,1) += 0.5*v1;
+        v_vatom(i,2) += 0.5*v2;
+        v_vatom(i,3) += 0.5*v3;
+        v_vatom(i,4) += 0.5*v4;
+        v_vatom(i,5) += 0.5*v5;
+      }
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::create_kokkos_tables()
+{
+  // de/df (and, for LOOKUP, also e/f) get tablength-1 columns
+  const int ncol_short = tablength-1;
+
+  // one scalar per table
+  memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq");
+  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
+
+  // 2d arrays (table index x entry); the set depends on the table style
+  if (tabstyle == LOOKUP) {
+    memory->create_kokkos(d_table->e,h_table->e,ntables,ncol_short,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,ncol_short,"Table::f");
+  } else if (tabstyle == LINEAR) {
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,ncol_short,"Table::de");
+    memory->create_kokkos(d_table->df,h_table->df,ntables,ncol_short,"Table::df");
+  }
+
+  // copy every Table struct into the host views; each loop is bounded by
+  // the second extent of the view it fills
+  for (int itab = 0; itab < ntables; ++itab) {
+    Table* src = &tables[itab];
+
+    h_table->innersq[itab] = src->innersq;
+    h_table->invdelta[itab] = src->invdelta;
+
+    for (int k = 0; k<h_table->rsq.dimension_1(); ++k)
+      h_table->rsq(itab,k) = src->rsq[k];
+    for (int k = 0; k<h_table->e.dimension_1(); ++k)
+      h_table->e(itab,k) = src->e[k];
+    for (int k = 0; k<h_table->de.dimension_1(); ++k)
+      h_table->de(itab,k) = src->de[k];
+    for (int k = 0; k<h_table->f.dimension_1(); ++k)
+      h_table->f(itab,k) = src->f[k];
+    for (int k = 0; k<h_table->df.dimension_1(); ++k)
+      h_table->df(itab,k) = src->df[k];
+  }
+
+  // host -> device transfer, then publish each device view through the
+  // struct handed to the kernels
+  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
+  d_table_const.innersq = d_table->innersq;
+
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
+  d_table_const.invdelta = d_table->invdelta;
+
+  Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+  d_table_const.rsq = d_table->rsq;
+
+  Kokkos::deep_copy(d_table->e,h_table->e);
+  d_table_const.e = d_table->e;
+
+  Kokkos::deep_copy(d_table->de,h_table->de);
+  d_table_const.de = d_table->de;
+
+  Kokkos::deep_copy(d_table->f,h_table->f);
+  d_table_const.f = d_table->f;
+
+  Kokkos::deep_copy(d_table->df,h_table->df);
+  d_table_const.df = d_table->df;
+
+  // tabindex is only transferred here; its const handle is not reassigned
+  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
+
+  // the device tables are now current
+  update_table = 0;
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::allocate()
+{
+  // all per-type-pair arrays are (ntypes+1) x (ntypes+1)
+  const int ntp1 = atom->ntypes + 1;
+  const int ncells = ntp1*ntp1;
+
+  allocated = 1;
+
+  // setflag: plain host array, zero-filled
+  memory->create(setflag,ntp1,ntp1,"pair:setflag");
+  memset(&setflag[0][0],0,ncells*sizeof(int));
+
+  // cutsq: dual view plus host pointer; the host side is marked modified
+  memory->create_kokkos(k_cutsq,cutsq,ntp1,ntp1,"pair:cutsq");
+  d_cutsq = k_cutsq.template view<DeviceType>();
+  k_cutsq.template modify<LMPHostType>();
+  memset(&cutsq[0][0],0,ncells*sizeof(double));
+
+  // tabindex: device view, host view and host pointer, zero-filled
+  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,ntp1,ntp1,"pair:tabindex");
+  d_table_const.tabindex = d_table->tabindex;
+  memset(&tabindex[0][0],0,ncells*sizeof(int));
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairMultiLucyRXKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  // arg[0] = table style, arg[1] = table length, both mandatory
+  if (narg < 2) error->all(FLERR,"Illegal pair_style command");
+
+  const char *stylename = arg[0];
+  if (strcmp(stylename,"lookup") == 0) {
+    tabstyle = LOOKUP;
+  } else if (strcmp(stylename,"linear") == 0) {
+    tabstyle = LINEAR;
+  } else {
+    error->all(FLERR,"Unknown table style in pair_style command");
+  }
+
+  tablength = force->inumeric(FLERR,arg[1]);
+  if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
+
+  // remaining arguments: weighting keywords, a later one overrides an earlier one
+  for (int k = 2; k < narg; ++k) {
+    const char *keyword = arg[k];
+    if (strcmp(keyword,"fractional") == 0) {
+      fractionalWeighting = true;
+    } else if (strcmp(keyword,"molecular") == 0) {
+      fractionalWeighting = false;
+    } else {
+      error->all(FLERR,"Illegal pair_style command");
+    }
+  }
+
+  // existing tables are discarded rather than adapted to the new settings
+  for (int itab = 0; itab < ntables; ++itab) free_table(&tables[itab]);
+  memory->sfree(tables);
+
+  if (allocated) {
+    memory->destroy(setflag);
+
+    // replace the tabindex views by default-constructed ones
+    d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d();
+    d_table_const.tabindex = d_table->tabindex;
+    h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d();
+  }
+
+  // back to the "nothing allocated, no tables" state
+  allocated = 0;
+  tables = NULL;
+  ntables = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Explicit template instantiations: the device-type class is always
+// emitted; the host-type class is emitted in addition only when
+// KOKKOS_HAVE_CUDA is defined.
+// NOTE(review): presumably LMPHostType and LMPDeviceType are the same type
+// without CUDA, which would make a second instantiation a duplicate - confirm.
+namespace LAMMPS_NS {
+template class PairMultiLucyRXKokkos<LMPDeviceType>;
+#ifdef KOKKOS_HAVE_CUDA
+template class PairMultiLucyRXKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/pair_multi_lucy_rx_kokkos.h b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
new file mode 100644
index 0000000000..b8ced4c847
--- /dev/null
+++ b/src/KOKKOS/pair_multi_lucy_rx_kokkos.h
@@ -0,0 +1,266 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+// Style-name entries: "kk" and "kk/device" both name the LMPDeviceType
+// instantiation, "kk/host" names the LMPHostType instantiation.
+PairStyle(multi/lucy/rx/kk,PairMultiLucyRXKokkos<LMPDeviceType>)
+PairStyle(multi/lucy/rx/kk/device,PairMultiLucyRXKokkos<LMPDeviceType>)
+PairStyle(multi/lucy/rx/kk/host,PairMultiLucyRXKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_MULTI_LUCY_RX_KOKKOS_H
+#define LMP_PAIR_MULTI_LUCY_RX_KOKKOS_H
+
+
+#include "pair_multi_lucy_rx.h"
+#include "pair_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+// Empty tag types. Each one is the first parameter of one operator()
+// overload of PairMultiLucyRXKokkos and thereby selects that overload.
+
+// tags for the forward-communication pack / unpack operators
+struct TagPairMultiLucyRXPackForwardComm{};
+struct TagPairMultiLucyRXUnpackForwardComm{};
+
+// tag for the operator named after getMixingWeights
+struct TagPairMultiLucyRXgetMixingWeights{};
+
+// tag for the compute operators; the template arguments carry the
+// neighbor-list flag, newton-pair flag, energy/virial flag and table style
+template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
+struct TagPairMultiLucyRXCompute{};
+
+// tag for the "zero" operator
+struct TagPairMultiLucyRXZero{};
+
+// tag for the local-density operator; ONE_TYPE is a compile-time switch
+// NOTE(review): presumably a fast path for a single atom type - confirm
+template<int NEIGHFLAG, int NEWTON_PAIR, bool ONE_TYPE>
+struct TagPairMultiLucyRXComputeLocalDensity{};
+
+// Kokkos variant of PairMultiLucyRX, templated on the Kokkos device type.
+// The class is its own functor: the tagged operator() overloads below are
+// the kernel bodies, selected through the Tag* types declared above.
+template<class DeviceType>
+class PairMultiLucyRXKokkos : public PairMultiLucyRX {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typedef EV_FLOAT value_type;   // accumulator type of the reducing compute operator
+
+  PairMultiLucyRXKokkos(class LAMMPS *);
+  virtual ~PairMultiLucyRXKokkos();
+
+  void compute(int, int);
+  void settings(int, char **);
+
+  // compute implementation specialised on the table style
+  template<int TABSTYLE>
+  void compute_style(int, int);
+
+  void init_style();
+
+  // communication hooks: Kokkos-buffer versions and plain-buffer versions
+  int pack_forward_comm_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&,
+                               int, int *);
+  void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&);
+  int pack_forward_comm(int, int *, double *, int, int *);
+  void unpack_forward_comm(int, int, double *);
+  int pack_reverse_comm(int, int, double *);
+  void unpack_reverse_comm(int, int *, double *);
+  void computeLocalDensity();
+
+  // ---- tagged kernel bodies ----
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXPackForwardComm, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXUnpackForwardComm, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXgetMixingWeights, const int&) const;
+
+  // compute kernel, reducing form (accumulates into an EV_FLOAT)
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int&, EV_FLOAT&) const;
+
+  // compute kernel, non-reducing form
+  template<int NEIGHFLAG, int NEWTON_PAIR, int EVFLAG, int TABSTYLE>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXCompute<NEIGHFLAG,NEWTON_PAIR,EVFLAG,TABSTYLE>, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXZero, const int&) const;
+
+  template<int NEIGHFLAG, int NEWTON_PAIR, bool ONE_TYPE>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPairMultiLucyRXComputeLocalDensity<NEIGHFLAG,NEWTON_PAIR,ONE_TYPE>, const int&) const;
+
+  // tally energy and virial contributions of the pair (i,j)
+  template<int NEIGHFLAG, int NEWTON_PAIR>
+  KOKKOS_INLINE_FUNCTION
+  void ev_tally(EV_FLOAT &ev, const int &i, const int &j,
+      const F_FLOAT &epair, const F_FLOAT &fpair, const F_FLOAT &delx,
+                  const F_FLOAT &dely, const F_FLOAT &delz) const;
+
+ private:
+  int nlocal;
+  int neighflag;
+  int eflag,vflag;
+
+  // NOTE(review): presumably cached values for the type pair (1,1) used by
+  // the ONE_TYPE local-density kernel - confirm against the .cpp
+  double cutsq_type11;
+  double rcut_type11;
+  double factor_type11;
+
+  // table styles; only LOOKUP and LINEAR are accepted by settings()
+  enum{LOOKUP,LINEAR,SPLINE,BITMAP};
+
+  // (disabled) layout of the Table struct, kept for reference
+  //struct Table {
+  //  int ninput,rflag,fpflag,match;
+  //  double rlo,rhi,fplo,fphi,cut;
+  //  double *rfile,*efile,*ffile;
+  //  double *e2file,*f2file;
+  //  double innersq,delta,invdelta,deltasq6;
+  //  double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
+  //};
+
+  // (disabled) all-randomread variant of TableDeviceConst
+  /*struct TableDeviceConst {
+    typename AT::t_int_2d_randomread tabindex;
+    typename AT::t_ffloat_1d_randomread innersq,invdelta;
+    typename AT::t_ffloat_2d_randomread rsq,e,de,f,df;
+  };*/
+ //It's faster not to use texture fetch if the number of tables is less than 32!
+  // table views as handed to the kernels: only the 2d data arrays use the
+  // randomread view types
+  struct TableDeviceConst {
+    typename AT::t_int_2d tabindex;
+    typename AT::t_ffloat_1d innersq,invdelta;
+    typename AT::t_ffloat_2d_randomread rsq,e,de,f,df;
+  };
+
+  // device-side table storage
+  struct TableDevice {
+    typename AT::t_int_2d tabindex;
+    typename AT::t_ffloat_1d innersq,invdelta;
+    typename AT::t_ffloat_2d rsq,e,de,f,df;
+  };
+
+  // host-side table storage, source of the deep copies to TableDevice
+  struct TableHost {
+    HAT::t_int_2d tabindex;
+    HAT::t_ffloat_1d innersq,invdelta;
+    HAT::t_ffloat_2d rsq,e,de,f,df;
+  };
+
+  TableDeviceConst d_table_const;
+  TableDevice* d_table;
+  TableHost* h_table;
+
+  // fixed-size copy of the squared cutoffs
+  // NOTE(review): presumably used instead of d_cutsq when the number of
+  // types fits MAX_TYPES_STACKPARAMS - confirm
+  F_FLOAT m_cutsq[MAX_TYPES_STACKPARAMS+1][MAX_TYPES_STACKPARAMS+1];
+
+  void allocate();
+  int update_table;              // cleared at the end of create_kokkos_tables()
+  void create_kokkos_tables();   // (re)build d_table / d_table_const from the host tables
+
+  KOKKOS_INLINE_FUNCTION
+  void getMixingWeights(int, double &, double &, double &, double &) const;
+
+  // per-atom mixing weights (old and current) for the two sites
+  typename AT::t_float_1d d_mixWtSite1old,d_mixWtSite2old,d_mixWtSite1,d_mixWtSite2;
+
+  // views of per-atom data used by the kernels
+  typename AT::t_x_array_randomread x;
+  typename AT::t_f_array f;
+  typename AT::t_int_1d_randomread type;
+  typename AT::t_efloat_1d rho;
+  typename HAT::t_efloat_1d h_rho;
+  typename AT::t_efloat_1d uCG, uCGnew;
+  typename AT::t_float_2d dvector;
+
+  // per-atom energy / virial: dual views and their device views
+  DAT::tdual_efloat_1d k_eatom;
+  DAT::tdual_virial_array k_vatom;
+  typename AT::t_efloat_1d d_eatom;
+  typename AT::t_virial_array d_vatom;
+
+  // neighbor-list views
+  typename AT::t_neighbors_2d d_neighbors;
+  typename AT::t_int_1d_randomread d_ilist;
+  typename AT::t_int_1d_randomread d_numneigh;
+
+  DAT::tdual_int_scalar k_error_flag;
+
+  // squared cutoffs: dual view and its device view (set up in allocate())
+  typename AT::tdual_ffloat_2d k_cutsq;
+  typename AT::t_ffloat_2d d_cutsq;
+
+  // state shared with the pack / unpack communication kernels
+  int iswap;
+  int first;
+  typename AT::t_int_2d d_sendlist;
+  typename AT::t_xfloat_1d_um v_buf;
+
+  friend void pair_virial_fdotr_compute<PairMultiLucyRXKokkos>(PairMultiLucyRXKokkos*);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Pair multi/lucy/rx command requires atom_style with density (e.g. dpd, meso)
+
+Self-explanatory
+
+E: Density < table inner cutoff
+
+The local density is smaller than the table's inner cutoff.
+
+E: Density > table inner cutoff
+
+The local density is greater than the table's inner cutoff.
+
+E: Only LOOKUP and LINEAR table styles have been implemented for pair multi/lucy/rx
+
+Self-explanatory
+
+E: Illegal ... command
+
+Self-explanatory.  Check the input script syntax and compare to the
+documentation for the command.  You can use -echo screen as a
+command-line option when running LAMMPS to see the offending line.
+
+E:  Unknown table style in pair_style command
+
+Self-explanatory
+
+E: Illegal number of pair table entries
+
+There must be at least 2 table entries.
+
+E: Illegal pair_coeff command
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: PairMultiLucyRXKokkos requires a fix rx command
+
+The fix rx command must come before the pair style command in the input file
+
+E:  There are no rx species specified
+
+There must be at least one species specified through the fix rx command
+
+E: Invalid pair table length
+
+Length of read-in pair table is invalid
+
+E: All pair coeffs are not set
+
+All pair coefficients must be set in the data file or by the
+pair_coeff command before running a simulation.
+
+E: Cannot open file %s
+
+The specified file cannot be opened.  Check that the path and name are
+correct.
+
+E: Did not find keyword in table file
+
+Keyword used in pair_coeff command was not found in table file.
+
+E: Invalid keyword in pair table parameters
+
+Keyword used in list of table parameters is not recognized.
+
+E: Pair table parameters did not set N
+
+List of pair table parameters must include N setting.
+
+*/
diff --git a/src/KOKKOS/pair_table_rx_kokkos.cpp b/src/KOKKOS/pair_table_rx_kokkos.cpp
new file mode 100644
index 0000000000..2f5a670537
--- /dev/null
+++ b/src/KOKKOS/pair_table_rx_kokkos.cpp
@@ -0,0 +1,1307 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing author: Dan Ibanez (SNL)
+------------------------------------------------------------------------- */
+
+#include <mpi.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "pair_table_rx_kokkos.h"
+#include "kokkos.h"
+#include "atom.h"
+#include "force.h"
+#include "comm.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+#include "memory.h"
+#include "error.h"
+#include "atom_masks.h"
+#include "fix.h"
+#include "kokkos_few.h"
+#include <cassert>
+
+using namespace LAMMPS_NS;
+
+// NOTE(review): presumably the spacing formats a table file can declare;
+// not referenced in the visible part of this file - confirm
+enum{NONE,RLINEAR,RSQ,BMP};
+
+// tolerance: ten times DBL_EPSILON, with a literal fallback when the
+// DBL_EPSILON macro is not defined
+#ifdef DBL_EPSILON
+  #define MY_EPSILON (10.0*DBL_EPSILON)
+#else
+  #define MY_EPSILON (10.0*2.220446049250313e-16)
+#endif
+
+// sentinel site index denoting the "one fluid" site, and its test
+#define OneFluidValue (-1)
+#define isOneFluid(_site_) ( (_site_) == OneFluidValue )
+
+/* ----------------------------------------------------------------------
+   mixing weights of atom `id` for the two interaction sites
+
+   dvector(s,id)          : current amount of species s on atom id
+   dvector(s+nspecies,id) : old amount of species s on atom id
+   isite1, isite2         : species index in [0,nspecies), or OneFluidValue
+                            for the "one fluid" site, which lumps together
+                            every species that is not isite1 or isite2
+   fractionalWeighting    : true  -> weights are fractions of the totals
+                            false -> weights are the amounts themselves
+   outputs                : mixWtSite{1,2}old from the old amounts,
+                            mixWtSite{1,2} from the current amounts
+
+   the totals are not checked for zero before dividing
+------------------------------------------------------------------------- */
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void getMixingWeights(
+    typename ArrayTypes<DeviceType>::t_float_2d_randomread dvector,
+    int nspecies,
+    int isite1, int isite2,
+    bool fractionalWeighting,
+    int id,
+    double &mixWtSite1old, double &mixWtSite2old,
+    double &mixWtSite1, double &mixWtSite2) {
+  // every branch below assigns what it later reads; the zero
+  // initialisation only keeps the values defined on all paths
+  double fractionOFAold = 0.0, fractionOFA = 0.0;
+  double fractionOld1 = 0.0, fraction1 = 0.0;
+  double fractionOld2 = 0.0, fraction2 = 0.0;
+  double nMoleculesOFAold = 0.0, nMoleculesOFA = 0.0;
+  double nMoleculesOld1 = 0.0, nMolecules1 = 0.0;
+  double nMoleculesOld2 = 0.0, nMolecules2 = 0.0;
+  double nTotal = 0.0, nTotalOld = 0.0;
+
+  // totals over all species, current and old
+  assert(id >= 0);
+  assert(id < dvector.dimension_1());
+  for (int ispecies = 0; ispecies < nspecies; ++ispecies){
+    assert(ispecies < dvector.dimension_0());
+    nTotal += dvector(ispecies,id);
+    assert(ispecies+nspecies < dvector.dimension_0());
+    nTotalOld += dvector(ispecies+nspecies,id);
+  }
+
+  // a site is either the one-fluid sentinel or a valid species index;
+  // the sentinel must pass, since the one-fluid case is handled below
+  assert(isOneFluid(isite1) || (isite1 >= 0 && isite1 < nspecies));
+  assert(isOneFluid(isite2) || (isite2 >= 0 && isite2 < nspecies));
+
+  // explicit species: read amount and fraction directly
+  if (!isOneFluid(isite1)){
+    nMoleculesOld1 = dvector(isite1+nspecies,id);
+    nMolecules1 = dvector(isite1,id);
+    fractionOld1 = nMoleculesOld1/nTotalOld;
+    fraction1 = nMolecules1/nTotal;
+  }
+  if (!isOneFluid(isite2)){
+    nMoleculesOld2 = dvector(isite2+nspecies,id);
+    nMolecules2 = dvector(isite2,id);
+    fractionOld2 = nMoleculesOld2/nTotalOld;
+    fraction2 = nMolecules2/nTotal;
+  }
+
+  // one-fluid site: accumulate over every species that is not one of the
+  // two named sites (the sentinel never equals a species index)
+  if (isOneFluid(isite1) || isOneFluid(isite2)){
+    nMoleculesOFAold  = 0.0;
+    nMoleculesOFA  = 0.0;
+    fractionOFAold  = 0.0;
+    fractionOFA  = 0.0;
+
+    for (int ispecies = 0; ispecies < nspecies; ispecies++){
+      if (isite1 == ispecies || isite2 == ispecies) continue;
+      nMoleculesOFAold += dvector(ispecies+nspecies,id);
+      nMoleculesOFA += dvector(ispecies,id);
+      fractionOFAold += dvector(ispecies+nspecies,id)/nTotalOld;
+      fractionOFA += dvector(ispecies,id)/nTotal;
+    }
+    if(isOneFluid(isite1)){
+      nMoleculesOld1 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules1 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld1 = fractionOFAold;
+      fraction1 = fractionOFA;
+    }
+    if(isOneFluid(isite2)){
+      nMoleculesOld2 = 1.0-(nTotalOld-nMoleculesOFAold);
+      nMolecules2 = 1.0-(nTotal-nMoleculesOFA);
+      fractionOld2 = fractionOFAold;
+      fraction2 = fractionOFA;
+    }
+  }
+
+  // hand back fractions or amounts, as requested
+  if(fractionalWeighting){
+    mixWtSite1old = fractionOld1;
+    mixWtSite1 = fraction1;
+    mixWtSite2old = fractionOld2;
+    mixWtSite2 = fraction2;
+  } else {
+    mixWtSite1old = nMoleculesOld1;
+    mixWtSite1 = nMolecules1;
+    mixWtSite2old = nMoleculesOld2;
+    mixWtSite2 = nMolecules2;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairTableRXKokkos<DeviceType>::PairTableRXKokkos(LAMMPS *lmp) : PairTable(lmp)
+{
+  // no site arrays yet
+  site1 = nullptr;
+  site2 = nullptr;
+
+  // default weighting mode; tables are not flagged for rebuild
+  fractionalWeighting = true;
+  update_table = 0;
+
+  // host and device table containers, released in the destructor
+  h_table = new TableHost();
+  d_table = new TableDevice();
+
+  // Kokkos bookkeeping: atom class, execution space, data masks
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read =
+    X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK |
+    DVECTOR_MASK | UCG_MASK | UCGNEW_MASK;
+  datamask_modify =
+    F_MASK | ENERGY_MASK | VIRIAL_MASK | UCG_MASK | UCGNEW_MASK;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+PairTableRXKokkos<DeviceType>::~PairTableRXKokkos()
+{
+  // nothing is released while copymode is set
+  if (!copymode) {
+    delete [] site1;
+    delete [] site2;
+
+    // per-atom energy / virial storage
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->destroy_kokkos(k_vatom,vatom);
+
+    // per-type arrays exist only after allocate(); they must go before
+    // d_table itself is deleted
+    if (allocated) {
+      memory->destroy_kokkos(d_table->cutsq, cutsq);
+      memory->destroy_kokkos(d_table->tabindex, tabindex);
+    }
+
+    // table containers created in the constructor
+    delete h_table;
+    delete d_table;
+    h_table = nullptr;
+    d_table = nullptr;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::compute(int eflag_in, int vflag_in)
+{
+  // rebuild the Kokkos tables first when update_table is set
+  if (update_table) create_kokkos_tables();
+
+  // forward to the implementation specialised for the active table style
+  switch (tabstyle) {
+    case LOOKUP:
+      compute_style<LOOKUP>(eflag_in,vflag_in);
+      break;
+    case LINEAR:
+      compute_style<LINEAR>(eflag_in,vflag_in);
+      break;
+    case SPLINE:
+      compute_style<SPLINE>(eflag_in,vflag_in);
+      break;
+    case BITMAP:
+      compute_style<BITMAP>(eflag_in,vflag_in);
+      break;
+    default:
+      break;
+  }
+}
+
+// two-bit field of j that starts at bit SBBITS
+KOKKOS_INLINE_FUNCTION static int sbmask(const int& j)
+{
+  const int shifted = j >> SBBITS;
+  return shifted & 3;
+}
+
+// Tabulated force factor for squared distance rsq and the type pair
+// (itype,jtype). The table is picked through tabindex; the evaluation
+// scheme is fixed at compile time by TABSTYLE. The table index derived
+// from rsq is not range-checked here.
+template <class DeviceType, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+static F_FLOAT
+compute_fpair(F_FLOAT rsq,
+              int itype, int jtype,
+              typename PairTableRXKokkos<DeviceType>::TableDeviceConst const& d_table_const
+              ) {
+  const auto& tab = d_table_const;
+  const int t = tab.tabindex(itype,jtype);
+
+  if (TABSTYLE == PairTable::LOOKUP) {
+    // value of the enclosing bin, no interpolation
+    const int bin = static_cast<int> ((rsq - tab.innersq(t)) * tab.invdelta(t));
+    const double force = tab.f(t,bin);
+    return force;
+  }
+
+  if (TABSTYLE == PairTable::LINEAR) {
+    // linear interpolation inside the bin
+    const int bin = static_cast<int> ((rsq - tab.innersq(t)) * tab.invdelta(t));
+    const double frac = (rsq - tab.rsq(t,bin)) * tab.invdelta(t);
+    const double force = tab.f(t,bin) + frac*tab.df(t,bin);
+    return force;
+  }
+
+  if (TABSTYLE == PairTable::SPLINE) {
+    // cubic form built from f and f2 at both ends of the bin
+    const int bin = static_cast<int> ((rsq - tab.innersq(t)) * tab.invdelta(t));
+    const double b = (rsq - tab.rsq(t,bin)) * tab.invdelta(t);
+    const double a = 1.0 - b;
+    const double force = a * tab.f(t,bin) + b * tab.f(t,bin+1) +
+      ((a*a*a-a)*tab.f2(t,bin) + (b*b*b-b)*tab.f2(t,bin+1)) *
+      tab.deltasq6(t);
+    return force;
+  }
+
+  // remaining style: the bin index is taken from the bit pattern of rsq
+  // stored in the float member of the union
+  Pair::union_int_float_t bits;
+  bits.f = rsq;
+  int bin = bits.i & tab.nmask(t);
+  bin >>= tab.nshiftbits(t);
+  const double frac = (bits.f - tab.rsq(t,bin)) * tab.drsq(t,bin);
+  const double force = tab.f(t,bin) + frac*tab.df(t,bin);
+  return force;
+}
+
+// Tabulated pair energy for squared distance rsq and the type pair
+// (itype,jtype); same table selection and evaluation schemes as
+// compute_fpair, applied to the e/de/e2 arrays. The table index derived
+// from rsq is not range-checked here.
+template<class DeviceType, int TABSTYLE>
+KOKKOS_INLINE_FUNCTION
+static F_FLOAT
+compute_evdwl(
+    F_FLOAT rsq,
+    int itype, int jtype,
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst const& d_table_const
+    ) {
+  const auto& tab = d_table_const;
+  const int t = tab.tabindex(itype,jtype);
+
+  if (TABSTYLE == PairTable::LOOKUP) {
+    // value of the enclosing bin, no interpolation
+    const int bin = static_cast<int> ((rsq - tab.innersq(t)) * tab.invdelta(t));
+    const double energy = tab.e(t,bin);
+    return energy;
+  }
+
+  if (TABSTYLE == PairTable::LINEAR) {
+    // linear interpolation inside the bin
+    const int bin = static_cast<int> ((rsq - tab.innersq(t)) * tab.invdelta(t));
+    const double frac = (rsq - tab.rsq(t,bin)) * tab.invdelta(t);
+    const double energy = tab.e(t,bin) + frac*tab.de(t,bin);
+    return energy;
+  }
+
+  if (TABSTYLE == PairTable::SPLINE) {
+    // cubic form built from e and e2 at both ends of the bin
+    const int bin = static_cast<int> ((rsq - tab.innersq(t)) * tab.invdelta(t));
+    const double b = (rsq - tab.rsq(t,bin)) * tab.invdelta(t);
+    const double a = 1.0 - b;
+    const double energy = a * tab.e(t,bin) + b * tab.e(t,bin+1) +
+        ((a*a*a-a)*tab.e2(t,bin) + (b*b*b-b)*tab.e2(t,bin+1)) *
+        tab.deltasq6(t);
+    return energy;
+  }
+
+  // remaining style: the bin index is taken from the bit pattern of rsq
+  // stored in the float member of the union
+  Pair::union_int_float_t bits;
+  bits.f = rsq;
+  int bin = bits.i & tab.nmask(t);
+  bin >>= tab.nshiftbits(t);
+  const double frac = (bits.f - tab.rsq(t,bin)) * tab.drsq(t,bin);
+  const double energy = tab.e(t,bin) + frac*tab.de(t,bin);
+  return energy;
+}
+
+// Tally the energy / virial contribution of the pair (i,j).
+//   per-atom energy : half of epair to i and/or j
+//   global virial   : accumulated into ev.v[0..5]
+//   per-atom virial : half of each component to i and/or j
+// With a FULL neighbor list only atom i receives a (half) contribution.
+// Otherwise an atom receives its half when NEWTON_PAIR is set or its
+// index is below nlocal.
+template<class DeviceType, int NEIGHFLAG, int TABSTYLE, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+void
+ev_tally(
+    int eflag,
+    int eflag_atom,
+    int vflag,
+    int vflag_global,
+    int vflag_atom,
+    int nlocal,
+    int i, int j,
+    EV_FLOAT& ev,
+    F_FLOAT epair, F_FLOAT fpair,
+    F_FLOAT delx, F_FLOAT dely, F_FLOAT delz,
+    Kokkos::View<F_FLOAT*[6],
+                 typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_vatom,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_eatom)
+{
+  // whether atom i / atom j takes part in the half-list tallies
+  const bool tally_i = NEWTON_PAIR || i < nlocal;
+  const bool tally_j = NEWTON_PAIR || j < nlocal;
+
+  if (eflag && eflag_atom) {
+    auto half_e = 0.5 * epair;
+    if (NEIGHFLAG == FULL) {
+      v_eatom[i] += half_e;
+    } else {
+      if (tally_i) v_eatom[i] += half_e;
+      if (tally_j) v_eatom[j] += half_e;
+    }
+  }
+
+  if (!vflag) return;
+
+  // the six components in the order xx, yy, zz, xy, xz, yz
+  const F_FLOAT w[6] = {
+    delx*delx*fpair,
+    dely*dely*fpair,
+    delz*delz*fpair,
+    delx*dely*fpair,
+    delx*delz*fpair,
+    dely*delz*fpair
+  };
+
+  if (vflag_global) {
+    if (NEIGHFLAG == FULL) {
+      for (int k = 0; k < 6; ++k) ev.v[k] += 0.5*w[k];
+    } else if (NEWTON_PAIR) {
+      // both halves are always tallied, so the full value is added at once
+      for (int k = 0; k < 6; ++k) ev.v[k] += w[k];
+    } else {
+      if (i < nlocal) {
+        for (int k = 0; k < 6; ++k) ev.v[k] += 0.5*w[k];
+      }
+      if (j < nlocal) {
+        for (int k = 0; k < 6; ++k) ev.v[k] += 0.5*w[k];
+      }
+    }
+  }
+
+  if (vflag_atom) {
+    if (NEIGHFLAG == FULL) {
+      for (int k = 0; k < 6; ++k) v_vatom(i,k) += 0.5*w[k];
+    } else {
+      if (tally_i) {
+        for (int k = 0; k < 6; ++k) v_vatom(i,k) += 0.5*w[k];
+      }
+      if (tally_j) {
+        for (int k = 0; k < 6; ++k) v_vatom(j,k) += 0.5*w[k];
+      }
+    }
+  }
+}
+
+// Process one entry of the neighbor list: atom i = d_ilist(ii) against all
+// of its neighbors.
+//
+// For every neighbor j inside the cutoff this
+//   - evaluates the tabulated force and energy for the type pair,
+//   - scales both with the mixing weights of i and j (the force and the
+//     tallied energy use the "old" weights, uCGnew uses the current ones)
+//     and with the special_lj entry selected by sbmask(j),
+//   - accumulates force, uCG and uCGnew for i in locals (written back once
+//     after the loop) and, for half lists, updates j directly.
+//
+// Returns the EV_FLOAT holding this item's energy / virial contribution.
+//
+// Template parameters: NEIGHFLAG selects the neighbor-list kind (and the
+// atomic trait of the output views), STACKPARAMS selects m_cutsq over
+// d_cutsq, TABSTYLE the table evaluation, EVFLAG enables the ev_tally
+// call, NEWTON_PAIR enables updates of atoms with index >= nlocal.
+//
+// NOTE(review): special_lj is passed as Few<int,4> while its entries are
+// used as F_FLOAT scale factors; if the caller fills it from non-integer
+// factors they are truncated. Changing the type needs the callers, which
+// are outside this function - confirm and fix there.
+template <class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE,
+          int EVFLAG, int NEWTON_PAIR>
+KOKKOS_INLINE_FUNCTION
+static EV_FLOAT
+compute_item(
+    int ii,
+    int nlocal,
+    typename ArrayTypes<DeviceType>::t_int_1d_const const& d_ilist,
+    typename ArrayTypes<DeviceType>::t_neighbors_2d_const const& d_neighbors,
+    typename ArrayTypes<DeviceType>::t_int_1d_const const& d_numneigh,
+    typename ArrayTypes<DeviceType>::t_x_array_randomread const& x,
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread const& type,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2,
+    Few<int, 4> const& special_lj,
+    Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> const& m_cutsq,
+    typename ArrayTypes<DeviceType>::t_ffloat_2d const& d_cutsq,
+    Kokkos::View<F_FLOAT*[3],
+      typename ArrayTypes<DeviceType>::t_f_array::array_layout,
+      DeviceType,
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& f,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& uCG,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& uCGnew,
+    int isite1, int isite2,
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst const& d_table_const,
+    int eflag,
+    int eflag_atom,
+    int vflag,
+    int vflag_global,
+    int vflag_atom,
+    Kokkos::View<F_FLOAT*[6],
+                 typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_vatom,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > const& v_eatom) {
+  EV_FLOAT ev;
+  // position and type of the central atom
+  auto i = d_ilist(ii);
+  auto xtmp = x(i,0);
+  auto ytmp = x(i,1);
+  auto ztmp = x(i,2);
+  auto itype = type(i);
+
+  auto jlist = NeighListKokkos<DeviceType>::static_neighbors_const(i,
+      d_neighbors, d_numneigh);
+  auto jnum = d_numneigh(i);
+
+  // contributions to atom i are gathered locally and written once below
+  double uCG_i = 0.0;
+  double uCGnew_i = 0.0;
+  double fx_i = 0.0, fy_i = 0.0, fz_i = 0.0;
+
+  auto mixWtSite1old_i = mixWtSite1old(i);
+  auto mixWtSite2old_i = mixWtSite2old(i);
+  auto mixWtSite1_i = mixWtSite1(i);
+  auto mixWtSite2_i = mixWtSite2(i);
+
+  for (int jj = 0; jj < jnum; jj++) {
+    auto j = jlist(jj);
+    // the scale factor is selected by the bits sbmask() extracts from the
+    // raw neighbor entry; those bits are then masked off to get the index
+    const F_FLOAT factor_lj = special_lj[sbmask(j)];
+    j &= NEIGHMASK;
+
+    auto delx = xtmp - x(j,0);
+    auto dely = ytmp - x(j,1);
+    auto delz = ztmp - x(j,2);
+    auto rsq = delx*delx + dely*dely + delz*delz;
+    auto jtype = type(j);
+
+    if(rsq < (STACKPARAMS ? m_cutsq[itype][jtype] : d_cutsq(itype,jtype))) {
+      auto mixWtSite1old_j = mixWtSite1old(j);
+      auto mixWtSite2old_j = mixWtSite2old(j);
+      auto mixWtSite1_j = mixWtSite1(j);
+      auto mixWtSite2_j = mixWtSite2(j);
+
+      auto fpair = factor_lj * compute_fpair<DeviceType,TABSTYLE>(
+          rsq,itype,jtype,d_table_const);
+
+      // force is weighted with the old mixing weights; for two distinct
+      // sites both (site1 of i, site2 of j) and (site2 of i, site1 of j)
+      // contribute
+      if (isite1 == isite2) fpair *= sqrt(mixWtSite1old_i * mixWtSite2old_j);
+      else fpair *= (sqrt(mixWtSite1old_i * mixWtSite2old_j) +
+                     sqrt(mixWtSite2old_i * mixWtSite1old_j));
+
+      fx_i += delx*fpair;
+      fy_i += dely*fpair;
+      fz_i += delz*fpair;
+
+      // with a half list the pair is visited once, so j is updated here
+      // too - unless newton is off and j has an index >= nlocal
+      auto do_half = (NEIGHFLAG==HALF || NEIGHFLAG==HALFTHREAD) &&
+                     (NEWTON_PAIR || j < nlocal);
+      if (do_half) {
+        f(j,0) -= delx*fpair;
+        f(j,1) -= dely*fpair;
+        f(j,2) -= delz*fpair;
+      }
+
+      auto evdwl = compute_evdwl<DeviceType,TABSTYLE>(
+          rsq,itype,jtype,d_table_const);
+
+      // same table energy, weighted once with the old and once with the
+      // current mixing weights
+      double evdwlOld;
+      if (isite1 == isite2) {
+        evdwlOld = sqrt(mixWtSite1old_i*mixWtSite2old_j)*evdwl;
+        evdwl = sqrt(mixWtSite1_i*mixWtSite2_j)*evdwl;
+      } else {
+        evdwlOld = (sqrt(mixWtSite1old_i*mixWtSite2old_j) +
+                    sqrt(mixWtSite2old_i*mixWtSite1old_j))*evdwl;
+        evdwl = (sqrt(mixWtSite1_i*mixWtSite2_j) +
+                 sqrt(mixWtSite2_i*mixWtSite1_j))*evdwl;
+      }
+      evdwlOld *= factor_lj;
+      evdwl *= factor_lj;
+
+      // half of each energy goes to i, the other half to j when it is updated
+      uCG_i += 0.5*evdwlOld;
+      if (do_half) uCG(j) += 0.5*evdwlOld;
+
+      uCGnew_i += 0.5*evdwl;
+      if (do_half) uCGnew(j) += 0.5*evdwl;
+      // from here on the old-weight energy is the one that gets tallied
+      evdwl = evdwlOld;
+
+      // accumulated regardless of EVFLAG
+      ev.evdwl += (do_half ? 1.0 : 0.5)*evdwl;
+
+      if (EVFLAG) {
+        ev_tally<DeviceType,NEIGHFLAG,TABSTYLE,NEWTON_PAIR>(
+            eflag,eflag_atom,
+            vflag,vflag_global,vflag_atom,
+            nlocal,i,j,ev,evdwl,fpair,delx,dely,delz,
+            v_vatom, v_eatom);
+      }
+    }
+  }
+
+  // single write-back of everything gathered for atom i
+  uCG(i) += uCG_i;
+  uCGnew(i) += uCGnew_i;
+
+  f(i,0) += fx_i;
+  f(i,1) += fy_i;
+  f(i,2) += fz_i;
+
+  return ev;
+}
+
+template<class DeviceType, int NEIGHFLAG, bool STACKPARAMS, int TABSTYLE, int NEWTON_PAIR>
+static void compute_all_items( // calls compute_item once per index in [0,inum) through a Kokkos parallel dispatch
+    EV_FLOAT& ev, // reduction result; only passed to Kokkos on the eflag || vflag path
+    int nlocal,
+    int inum, // number of indices handed to parallel_reduce / parallel_for
+    typename ArrayTypes<DeviceType>::t_int_1d_const d_ilist, // this, nlocal above, and every argument below are forwarded unchanged to compute_item
+    typename ArrayTypes<DeviceType>::t_neighbors_2d_const d_neighbors,
+    typename ArrayTypes<DeviceType>::t_int_1d_const d_numneigh,
+    typename ArrayTypes<DeviceType>::t_x_array_randomread x,
+    typename ArrayTypes<DeviceType>::t_int_1d_randomread type,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2,
+    Few<int, 4> special_lj,
+    Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq,
+    typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq,
+    Kokkos::View<F_FLOAT*[3],
+      typename ArrayTypes<DeviceType>::t_f_array::array_layout,
+      DeviceType,
+      Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > f,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCG,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > uCGnew,
+    int isite1, int isite2,
+    typename PairTableRXKokkos<DeviceType>::TableDeviceConst d_table_const,
+    int eflag,
+    int eflag_atom,
+    int vflag,
+    int vflag_global,
+    int vflag_atom,
+    Kokkos::View<F_FLOAT*[6],
+                 typename ArrayTypes<DeviceType>::t_virial_array::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_vatom,
+    Kokkos::View<E_FLOAT*,
+                 typename ArrayTypes<DeviceType>::t_efloat_1d::array_layout,
+                 DeviceType,
+                 Kokkos::MemoryTraits<AtomicF<NEIGHFLAG>::value> > v_eatom) {
+  if (eflag || vflag) { // tallies requested: add the EV_FLOAT returned for each index into the reduction
+    Kokkos::parallel_reduce(inum,
+    LAMMPS_LAMBDA(int i, EV_FLOAT& energy_virial) {
+        energy_virial +=
+          compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,1,NEWTON_PAIR>( // fifth template argument is 1 here and 0 in the other branch (presumably EVFLAG -- confirm)
+            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
+            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+            d_table_const, eflag, eflag_atom,
+            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
+    }, ev);
+  } else { // no tallies requested: compute_item's return value is discarded and ev is left untouched
+    Kokkos::parallel_for(inum,
+    LAMMPS_LAMBDA(int i) {
+        compute_item<DeviceType,NEIGHFLAG,STACKPARAMS,TABSTYLE,0,NEWTON_PAIR>(
+            i, nlocal, d_ilist, d_neighbors, d_numneigh, x, type,
+            mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+            special_lj, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+            d_table_const, eflag, eflag_atom,
+            vflag, vflag_global, vflag_atom, v_vatom, v_eatom);
+    });
+  }
+}
+
+// Fill the four mixing-weight views for atoms [0, ntotal) by calling
+// getMixingWeights once per atom index inside a Kokkos parallel_for.
+template<class DeviceType>
+static void getAllMixingWeights(
+    int ntotal,
+    typename ArrayTypes<DeviceType>::t_float_2d_randomread dvector,
+    int nspecies, int isite1, int isite2, bool fractionalWeighting,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2old,
+    Kokkos::View<double*, DeviceType> const& mixWtSite1,
+    Kokkos::View<double*, DeviceType> const& mixWtSite2) {
+  Kokkos::parallel_for(ntotal, LAMMPS_LAMBDA(int iatom) {
+    getMixingWeights<DeviceType>(
+        dvector, nspecies, isite1, isite2, fractionalWeighting, iatom,
+        mixWtSite1old(iatom), mixWtSite2old(iatom), mixWtSite1(iatom), mixWtSite2(iatom));
+  });
+}
+
+template<class DeviceType>
+template<int TABSTYLE>
+void PairTableRXKokkos<DeviceType>::compute_style(int eflag_in, int vflag_in)
+{
+  auto eflag = eflag_in;
+  auto vflag = vflag_in;
+  // Overview: set up tallies and views, compute per-atom mixing weights, launch compute_all_items, then accumulate energy/virial.
+  if (neighflag == FULL) no_virial_fdotr_compute = 1;
+
+  if (eflag || vflag) ev_setup(eflag,vflag,0);
+  else evflag = vflag_fdotr = 0;
+  // per-atom tally arrays are destroyed and recreated (maxeatom / maxvatom rows) on every call that requests them
+  if (eflag_atom) {
+    memory->destroy_kokkos(k_eatom,eatom);
+    memory->create_kokkos(k_eatom,eatom,maxeatom,"pair:eatom");
+    d_eatom = k_eatom.template view<DeviceType>();
+  }
+  if (vflag_atom) {
+    memory->destroy_kokkos(k_vatom,vatom);
+    memory->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom");
+    d_vatom = k_vatom.template view<DeviceType>();
+  }
+  // sync the atom data named by datamask_read to execution_space; mark datamask_modify (or only F_MASK when no tallies) as modified there
+  atomKK->sync(execution_space,datamask_read);
+  if (eflag || vflag) atomKK->modified(execution_space,datamask_modify);
+  else atomKK->modified(execution_space,F_MASK);
+
+  x = atomKK->k_x.view<DeviceType>();
+  f = atomKK->k_f.view<DeviceType>();
+  auto type = atomKK->k_type.view<DeviceType>();
+  auto uCG = atomKK->k_uCG.view<DeviceType>();
+  auto uCGnew = atomKK->k_uCGnew.view<DeviceType>();
+  auto nlocal = atom->nlocal;
+  Few<int, 4> special_lj_local;
+  special_lj_local[0] = force->special_lj[0];
+  special_lj_local[1] = force->special_lj[1];
+  special_lj_local[2] = force->special_lj[2];
+  special_lj_local[3] = force->special_lj[3];
+  auto newton_pair = force->newton_pair;
+  d_cutsq = d_table->cutsq;
+  // the four mixing-weight work views are reallocated only when nlocal+nghost exceeds mixWtSite1's current extent
+
+  const int ntotal = atom->nlocal + atom->nghost;
+  if (ntotal > mixWtSite1.dimension_0()) {
+    mixWtSite1old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1old", ntotal);
+    mixWtSite2old = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2old", ntotal);
+    mixWtSite1 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite1", ntotal);
+    mixWtSite2 = Kokkos::View<double*, DeviceType>("PairTableRXKokkos::mixWtSite2", ntotal);
+  }
+  // weights are computed for all ntotal atoms (nlocal + nghost), not just the nlocal ones
+  getAllMixingWeights(ntotal, atomKK->k_dvector.template view<DeviceType>(),
+      nspecies, isite1, isite2, fractionalWeighting,
+      mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2);
+
+  if (neighflag == N2) error->all(FLERR,"pair table/rx/kk can't handle N2 yet\n");
+  // NOTE(review): the dynamic_cast result below is dereferenced without a null check
+  NeighListKokkos<DeviceType>* l =
+    dynamic_cast<NeighListKokkos<DeviceType>*>(list);
+  // dispatch on (ntypes > MAX_TYPES_STACKPARAMS, neighflag, newton_pair); the twelve calls differ only in template arguments
+  EV_FLOAT ev;
+  if(atom->ntypes > MAX_TYPES_STACKPARAMS) {
+    if (neighflag == HALFTHREAD) {
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALFTHREAD,false,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
+    } else if (neighflag == HALF) {
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALF,false,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALF,false,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
+    } else if (neighflag == FULL) {
+      if (newton_pair) {
+        compute_all_items<DeviceType,FULL,false,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,FULL,false,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
+    }
+  } else {
+    if (neighflag == HALFTHREAD) {
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALFTHREAD,true,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
+    } else if (neighflag == HALF) {
+      if (newton_pair) {
+        compute_all_items<DeviceType,HALF,true,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,HALF,true,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
+    } else if (neighflag == FULL) {
+      if (newton_pair) {
+        compute_all_items<DeviceType,FULL,true,TABSTYLE,1>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      } else {
+        compute_all_items<DeviceType,FULL,true,TABSTYLE,0>(
+          ev, nlocal, l->inum, l->d_ilist, l->d_neighbors, l->d_numneigh,
+          x, type, mixWtSite1old, mixWtSite2old, mixWtSite1, mixWtSite2,
+          special_lj_local, m_cutsq, d_cutsq, f, uCG, uCGnew, isite1, isite2,
+          d_table_const, eflag, eflag_atom,
+          vflag, vflag_global, vflag_atom, d_vatom, d_eatom);
+      }
+    }
+  }
+  // add the reduced energy / virial from ev to the pair-level accumulators
+  if (eflag) eng_vdwl += ev.evdwl;
+  if (vflag_global) {
+    virial[0] += ev.v[0];
+    virial[1] += ev.v[1];
+    virial[2] += ev.v[2];
+    virial[3] += ev.v[3];
+    virial[4] += ev.v[4];
+    virial[5] += ev.v[5];
+  }
+
+  if (vflag_fdotr) pair_virial_fdotr_compute(this);
+  // per-atom tallies were passed to the kernel as DeviceType views: mark that side modified, then sync LMPHostType
+  if (eflag_atom) {
+    k_eatom.template modify<DeviceType>();
+    k_eatom.template sync<LMPHostType>();
+  }
+
+  if (vflag_atom) {
+    k_vatom.template modify<DeviceType>();
+    k_vatom.template sync<LMPHostType>();
+  }
+}
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::create_kokkos_tables()
+{
+  const int tlm1 = tablength-1;
+  // Builds the Kokkos copies of tables[]: create the arrays, fill the h_table side, then deep_copy into d_table.
+  memory->create_kokkos(d_table->nshiftbits,h_table->nshiftbits,ntables,"Table::nshiftbits");
+  memory->create_kokkos(d_table->nmask,h_table->nmask,ntables,"Table::nmask");
+  memory->create_kokkos(d_table->innersq,h_table->innersq,ntables,"Table::innersq");
+  memory->create_kokkos(d_table->invdelta,h_table->invdelta,ntables,"Table::invdelta");
+  memory->create_kokkos(d_table->deltasq6,h_table->deltasq6,ntables,"Table::deltasq6");
+  // 2-D arrays (ntables rows); which ones exist and their column count depend on tabstyle
+  if(tabstyle == LOOKUP) {
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tlm1,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tlm1,"Table::f");
+  }
+
+  if(tabstyle == LINEAR) {
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,tlm1,"Table::de");
+    memory->create_kokkos(d_table->df,h_table->df,ntables,tlm1,"Table::df");
+  }
+
+  if(tabstyle == SPLINE) {
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,tablength,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,tablength,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,tablength,"Table::f");
+    memory->create_kokkos(d_table->e2,h_table->e2,ntables,tablength,"Table::e2");
+    memory->create_kokkos(d_table->f2,h_table->f2,ntables,tablength,"Table::f2");
+  }
+
+  if(tabstyle == BITMAP) {
+    int ntable = 1 << tablength;
+    memory->create_kokkos(d_table->rsq,h_table->rsq,ntables,ntable,"Table::rsq");
+    memory->create_kokkos(d_table->e,h_table->e,ntables,ntable,"Table::e");
+    memory->create_kokkos(d_table->f,h_table->f,ntables,ntable,"Table::f");
+    memory->create_kokkos(d_table->de,h_table->de,ntables,ntable,"Table::de");
+    memory->create_kokkos(d_table->df,h_table->df,ntables,ntable,"Table::df");
+    memory->create_kokkos(d_table->drsq,h_table->drsq,ntables,ntable,"Table::drsq");
+  }
+  // copy table i into row i of each h_table array; every loop is bounded by that array's own
+  // dimension_1(), so arrays not created above are presumably skipped (extent 0) -- TODO confirm
+
+  for(int i=0; i < ntables; i++) {
+    Table* tb = &tables[i];
+
+    h_table->nshiftbits[i] = tb->nshiftbits;
+    h_table->nmask[i] = tb->nmask;
+    h_table->innersq[i] = tb->innersq;
+    h_table->invdelta[i] = tb->invdelta;
+    h_table->deltasq6[i] = tb->deltasq6;
+
+    for(int j = 0; j<h_table->rsq.dimension_1(); j++)
+      h_table->rsq(i,j) = tb->rsq[j];
+    for(int j = 0; j<h_table->drsq.dimension_1(); j++)
+      h_table->drsq(i,j) = tb->drsq[j];
+    for(int j = 0; j<h_table->e.dimension_1(); j++)
+      h_table->e(i,j) = tb->e[j];
+    for(int j = 0; j<h_table->de.dimension_1(); j++)
+      h_table->de(i,j) = tb->de[j];
+    for(int j = 0; j<h_table->f.dimension_1(); j++)
+      h_table->f(i,j) = tb->f[j];
+    for(int j = 0; j<h_table->df.dimension_1(); j++)
+      h_table->df(i,j) = tb->df[j];
+    for(int j = 0; j<h_table->e2.dimension_1(); j++)
+      h_table->e2(i,j) = tb->e2[j];
+    for(int j = 0; j<h_table->f2.dimension_1(); j++)
+      h_table->f2(i,j) = tb->f2[j];
+  }
+  // deep_copy each h_table array into its d_table counterpart, then point the matching d_table_const member at it
+
+  Kokkos::deep_copy(d_table->nshiftbits,h_table->nshiftbits);
+  d_table_const.nshiftbits = d_table->nshiftbits;
+  Kokkos::deep_copy(d_table->nmask,h_table->nmask);
+  d_table_const.nmask = d_table->nmask;
+  Kokkos::deep_copy(d_table->innersq,h_table->innersq);
+  d_table_const.innersq = d_table->innersq;
+  Kokkos::deep_copy(d_table->invdelta,h_table->invdelta);
+  d_table_const.invdelta = d_table->invdelta;
+  Kokkos::deep_copy(d_table->deltasq6,h_table->deltasq6);
+  d_table_const.deltasq6 = d_table->deltasq6;
+
+  if(tabstyle == LOOKUP) {
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+  }
+
+  if(tabstyle == LINEAR) {
+    Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+    d_table_const.rsq = d_table->rsq;
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+    Kokkos::deep_copy(d_table->de,h_table->de);
+    d_table_const.de = d_table->de;
+    Kokkos::deep_copy(d_table->df,h_table->df);
+    d_table_const.df = d_table->df;
+  }
+
+  if(tabstyle == SPLINE) {
+    Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+    d_table_const.rsq = d_table->rsq;
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+    Kokkos::deep_copy(d_table->e2,h_table->e2);
+    d_table_const.e2 = d_table->e2;
+    Kokkos::deep_copy(d_table->f2,h_table->f2);
+    d_table_const.f2 = d_table->f2;
+  }
+
+  if(tabstyle == BITMAP) {
+    Kokkos::deep_copy(d_table->rsq,h_table->rsq);
+    d_table_const.rsq = d_table->rsq;
+    Kokkos::deep_copy(d_table->e,h_table->e);
+    d_table_const.e = d_table->e;
+    Kokkos::deep_copy(d_table->f,h_table->f);
+    d_table_const.f = d_table->f;
+    Kokkos::deep_copy(d_table->de,h_table->de);
+    d_table_const.de = d_table->de;
+    Kokkos::deep_copy(d_table->df,h_table->df);
+    d_table_const.df = d_table->df;
+    Kokkos::deep_copy(d_table->drsq,h_table->drsq);
+    d_table_const.drsq = d_table->drsq;
+  }
+  // cutsq and tabindex are created in allocate(), not in this function; here they are only copied
+  Kokkos::deep_copy(d_table->cutsq,h_table->cutsq);
+  d_table_const.cutsq = d_table->cutsq;
+  Kokkos::deep_copy(d_table->tabindex,h_table->tabindex);
+  d_table_const.tabindex = d_table->tabindex;
+
+  update_table = 0;  // compute_table() sets this flag back to 1
+}
+
+/* ----------------------------------------------------------------------
+   allocate the (ntypes+1)^2 setflag, cutsq and tabindex arrays, zero them
+   via memset, and point d_table_const at the new cutsq/tabindex views
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::allocate()
+{
+  const int ntp1 = atom->ntypes + 1;
+  allocated = 1;
+
+  memory->create(setflag,ntp1,ntp1,"pair:setflag");
+  memset(&setflag[0][0],0,ntp1*ntp1*sizeof(int));
+  memory->create_kokkos(d_table->cutsq,h_table->cutsq,cutsq,ntp1,ntp1,"pair:cutsq");
+  d_table_const.cutsq = d_table->cutsq;
+  memset(&cutsq[0][0],0,ntp1*ntp1*sizeof(double));
+  memory->create_kokkos(d_table->tabindex,h_table->tabindex,tabindex,ntp1,ntp1,"pair:tabindex");
+  d_table_const.tabindex = d_table->tabindex;
+  memset(&tabindex[0][0],0,ntp1*ntp1*sizeof(int));
+}
+
+/* ----------------------------------------------------------------------
+   global settings: args are <table style> <table length> [keyword ...]
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::settings(int narg, char **arg)
+{
+  if (narg < 2) error->all(FLERR,"Illegal pair_style command");
+
+  // arg[0] selects the table style, arg[1] the table length (at least 2)
+
+  const char *style = arg[0];
+  if (!strcmp(style,"lookup")) tabstyle = LOOKUP;
+  else if (!strcmp(style,"linear")) tabstyle = LINEAR;
+  else if (!strcmp(style,"spline")) tabstyle = SPLINE;
+  else if (!strcmp(style,"bitmap")) tabstyle = BITMAP;
+  else error->all(FLERR,"Unknown table style in pair_style command");
+
+  tablength = force->inumeric(FLERR,arg[1]);
+  if (tablength < 2) error->all(FLERR,"Illegal number of pair table entries");
+
+  // optional keywords: each one sets a flag, anything unrecognized is an error
+
+  for (int iarg = 2; iarg < narg; iarg++) {
+    const char *keyword = arg[iarg];
+    if (!strcmp(keyword,"ewald")) ewaldflag = 1;
+    else if (!strcmp(keyword,"pppm")) pppmflag = 1;
+    else if (!strcmp(keyword,"msm")) msmflag = 1;
+    else if (!strcmp(keyword,"dispersion")) dispersionflag = 1;
+    else if (!strcmp(keyword,"tip4p")) tip4pflag = 1;
+    else if (!strcmp(keyword,"fractional")) fractionalWeighting = true;
+    else if (!strcmp(keyword,"molecular")) fractionalWeighting = false;
+    else error->all(FLERR,"Illegal pair_style command");
+  }
+
+  // delete old tables, since cannot just change settings
+
+  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
+  memory->sfree(tables);
+  tables = NULL;
+  ntables = 0;
+
+  // drop the per-type arrays too, so that the next coeff() call reallocates them
+  if (!allocated) return;
+
+  memory->destroy(setflag);
+  d_table->tabindex = typename ArrayTypes<DeviceType>::t_int_2d();
+  d_table_const.tabindex = d_table->tabindex;
+  h_table->tabindex = typename ArrayTypes<LMPHostType>::t_int_2d();
+  d_table->cutsq = typename ArrayTypes<DeviceType>::t_ffloat_2d();
+  d_table_const.cutsq = d_table->cutsq;
+  h_table->cutsq = typename ArrayTypes<LMPHostType>::t_ffloat_2d();
+  allocated = 0;
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs: args = I J <2 read_table args> site1 site2 [cutoff]
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::coeff(int narg, char **arg)
+{
+  if (narg != 6 && narg != 7) error->all(FLERR,"Illegal pair_coeff command");
+  if (!allocated) allocate();
+  // a fix whose style name starts with "rx" must already be defined
+  bool rx_flag = false;
+  for (int i = 0; i < modify->nfix; i++)
+    if (strncmp(modify->fix[i]->style,"rx",2) == 0) rx_flag = true;
+  if (!rx_flag) error->all(FLERR,"PairTableRX requires a fix rx command.");
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+  // append one Table to tables[]: rank 0 reads it, then bcast_table is called on every rank
+  int me;
+  MPI_Comm_rank(world,&me);
+  tables = (Table *)
+    memory->srealloc(tables,(ntables+1)*sizeof(Table),"pair:tables");
+  Table *tb = &tables[ntables];
+  null_table(tb);
+  if (me == 0) read_table(tb,arg[2],arg[3]);
+  bcast_table(tb);
+  // both site names must be an entry of atom->dname or the literal "1fluid"
+  nspecies = atom->nspecies_dpd;
+  if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
+  int n;
+  n = strlen(arg[4]) + 1;
+  site1 = new char[n];  // NOTE(review): a buffer left in site1/site2 by an earlier call is not freed in this function
+  strcpy(site1,arg[4]);
+
+  int ispecies;
+  for (ispecies = 0; ispecies < nspecies; ispecies++){
+    if (strcmp(site1,&atom->dname[ispecies][0]) == 0) break;
+  }
+  if (ispecies == nspecies && strcmp(site1,"1fluid") != 0)
+    error->all(FLERR,"Site1 name not recognized in pair coefficients");
+
+  n = strlen(arg[5]) + 1;
+  site2 = new char[n];
+  strcpy(site2,arg[5]);
+
+  for (ispecies = 0; ispecies < nspecies; ispecies++){
+    if (strcmp(site2,&atom->dname[ispecies][0]) == 0) break;
+  }
+  if (ispecies == nspecies && strcmp(site2,"1fluid") != 0)
+    error->all(FLERR,"Site2 name not recognized in pair coefficients");
+
+  // reject degenerate tables before rfile[ninput-1] is read below
+
+  if (tb->ninput <= 1) error->one(FLERR,"Invalid pair table length");
+
+  // set table cutoff
+  if (narg == 7) tb->cut = force->numeric(FLERR,arg[6]);
+  else if (tb->rflag) tb->cut = tb->rhi;
+  else tb->cut = tb->rfile[tb->ninput-1];
+
+  // ensure cutoff is within table
+  // for BITMAP tables, file values can be in non-ascending order
+  double rlo,rhi;
+  if (tb->rflag == 0) {
+    rlo = tb->rfile[0];
+    rhi = tb->rfile[tb->ninput-1];
+  } else {
+    rlo = tb->rlo;
+    rhi = tb->rhi;
+  }
+  if (tb->cut <= rlo || tb->cut > rhi)
+    error->all(FLERR,"Invalid pair table cutoff");
+  if (rlo <= 0.0) error->all(FLERR,"Invalid pair table cutoff");
+
+  // match = 1 if don't need to spline read-in tables
+  // this is only the case if r values needed by final tables
+  //   exactly match r values read from file
+  // for tabstyle SPLINE, always need to build spline tables
+
+  tb->match = 0;
+  if (tabstyle == LINEAR && tb->ninput == tablength &&
+      tb->rflag == RSQ && tb->rhi == tb->cut) tb->match = 1;
+  if (tabstyle == BITMAP && tb->ninput == 1 << tablength &&
+      tb->rflag == BMP && tb->rhi == tb->cut) tb->match = 1;
+  if (tb->rflag == BMP && tb->match == 0)
+    error->all(FLERR,"Bitmapped table in file does not match requested table");
+
+  // spline read-in values and compute r,e,f vectors within table
+
+  if (tb->match == 0) spline_table(tb);
+  compute_table(tb);
+
+  // store index of the new table in tabindex for every i <= j pair in range
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      tabindex[i][j] = ntables;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Illegal pair_coeff command");
+  ntables++;
+  // translate the site names into isite1/isite2: OneFluidValue for "1fluid", else the index into atom->dname
+  {
+     if ( strcmp(site1,"1fluid") == 0 )
+       isite1 = OneFluidValue;
+     else {
+       isite1 = nspecies;
+
+       for (int k = 0; k < nspecies; k++){
+         if (strcmp(site1, atom->dname[k]) == 0){
+           isite1 = k;
+           break;
+         }
+       }
+
+       if (isite1 == nspecies) error->all(FLERR,"isite1 == nspecies");
+     }
+
+     if ( strcmp(site2,"1fluid") == 0 )
+       isite2 = OneFluidValue;
+     else {
+       isite2 = nspecies;
+
+       for (int k = 0; k < nspecies; k++){
+         if (strcmp(site2, atom->dname[k]) == 0){
+           isite2 = k;  // use this loop's own index, exactly as the site1 branch does
+           break;
+         }
+       }
+
+       if (isite2 == nspecies)
+         error->all(FLERR,"isite2 == nspecies");
+     }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+   mirrors tabindex (and m_cutsq when i,j fit in it); returns the cutoff
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairTableRXKokkos<DeviceType>::init_one(int i, int j)
+{
+  if (!setflag[i][j]) error->all(FLERR,"All pair coeffs are not set");
+  const int itable = tabindex[i][j];
+  const double cut = tables[itable].cut;
+  tabindex[j][i] = itable;
+  if (i <= MAX_TYPES_STACKPARAMS && j <= MAX_TYPES_STACKPARAMS)
+    m_cutsq[i][j] = m_cutsq[j][i] = cut*cut;
+
+  return cut;
+}
+
+/* ----------------------------------------------------------------------
+   host-side evaluation of one i,j pair: sets fforce and returns the energy,
+   both scaled by factor_lj and by the mixing weights of atoms i and j
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+double PairTableRXKokkos<DeviceType>::single(int i, int j, int itype, int jtype, double rsq,
+                         double factor_coul, double factor_lj,
+                         double &fforce)
+{
+  Table *tb = &tables[tabindex[itype][jtype]];
+  const int tlm1 = tablength - 1;
+  int itable;
+  double value,phi;
+  double fraction = 0.0, a = 0.0, b = 0.0;
+
+  // current and "old" weights of both atoms; only the current ones are used below
+  double mixWtSite1_i, mixWtSite2_i, mixWtSite1old_i, mixWtSite2old_i;
+  double mixWtSite1_j, mixWtSite2_j, mixWtSite1old_j, mixWtSite2old_j;
+
+  atomKK->k_dvector.template sync<LMPHostType>();
+  typename ArrayTypes<LMPHostType>::t_float_2d_randomread h_dvector =
+    atomKK->k_dvector.view<LMPHostType>();
+  getMixingWeights<LMPHostType>(
+      h_dvector, nspecies, isite1, isite2, fractionalWeighting, i,
+      mixWtSite1old_i, mixWtSite2old_i, mixWtSite1_i, mixWtSite2_i);
+  getMixingWeights<LMPHostType>(
+      h_dvector, nspecies, isite1, isite2, fractionalWeighting, j,
+      mixWtSite1old_j, mixWtSite2old_j, mixWtSite1_j, mixWtSite2_j);
+
+  if (rsq < tb->innersq) error->one(FLERR,"Pair distance < table inner cutoff");
+
+  // tabulated force value at rsq
+
+  if (tabstyle == LOOKUP || tabstyle == LINEAR || tabstyle == SPLINE) {
+    // these three styles share an index that is linear in rsq
+    itable = static_cast<int> ((rsq-tb->innersq) * tb->invdelta);
+    if (itable >= tlm1) error->one(FLERR,"Pair distance > table outer cutoff");
+    if (tabstyle == LOOKUP) {
+      value = tb->f[itable];
+    } else if (tabstyle == LINEAR) {
+      fraction = (rsq - tb->rsq[itable]) * tb->invdelta;
+      value = tb->f[itable] + fraction*tb->df[itable];
+    } else {
+      b = (rsq - tb->rsq[itable]) * tb->invdelta;
+      a = 1.0 - b;
+      value = a * tb->f[itable] + b * tb->f[itable+1] +
+        ((a*a*a-a)*tb->f2[itable] + (b*b*b-b)*tb->f2[itable+1]) *
+        tb->deltasq6;
+    }
+  } else {
+    // remaining style: index taken from the masked and shifted bits of rsq
+    Pair::union_int_float_t rsq_lookup;
+    rsq_lookup.f = rsq;
+    itable = rsq_lookup.i & tb->nmask;
+    itable >>= tb->nshiftbits;
+    fraction = (rsq_lookup.f - tb->rsq[itable]) * tb->drsq[itable];
+    value = tb->f[itable] + fraction*tb->df[itable];
+  }
+  fforce = factor_lj * value;
+
+  // mixing factor applied to both force and energy; cross term added when sites differ
+
+  double mixFactor = sqrt(mixWtSite1_i*mixWtSite2_j);
+  if (isite1 != isite2) mixFactor += sqrt(mixWtSite2_i*mixWtSite1_j);
+  fforce = mixFactor*fforce;
+
+  // tabulated energy, reusing itable, fraction, a and b from the force lookup
+
+  if (tabstyle == LOOKUP)
+    phi = tb->e[itable];
+  else if (tabstyle == LINEAR || tabstyle == BITMAP)
+    phi = tb->e[itable] + fraction*tb->de[itable];
+  else
+    phi = a * tb->e[itable] + b * tb->e[itable+1] +
+      ((a*a*a-a)*tb->e2[itable] + (b*b*b-b)*tb->e2[itable+1]) * tb->deltasq6;
+
+  phi = mixFactor*phi;
+  return factor_lj*phi;
+}
+
+/* ----------------------------------------------------------------------
+   compute r,e,f vectors from splined values
+------------------------------------------------------------------------- */
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::compute_table(Table *tb)
+{
+  update_table = 1;  // flag is cleared again at the end of create_kokkos_tables()
+  PairTable::compute_table(tb);  // the table construction itself is the base-class implementation
+}
+
+// Request a neighbor list and tag the request with the Kokkos host/device side and the half/full layout implied by neighflag.
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::init_style()
+{
+  neighbor->request(this,instance_me);
+  neighflag = lmp->kokkos->neighflag;
+  const int irequest = neighbor->nrequest - 1;
+
+  const bool on_device = Kokkos::Impl::is_same<DeviceType,LMPDeviceType>::value;
+  const bool on_host = Kokkos::Impl::is_same<DeviceType,LMPHostType>::value;
+  neighbor->requests[irequest]->kokkos_host = on_host && !on_device;
+  neighbor->requests[irequest]->kokkos_device = on_device;
+
+  if (neighflag == FULL) {
+    neighbor->requests[irequest]->full = 1;
+    neighbor->requests[irequest]->half = 0;
+  } else if (neighflag == HALF || neighflag == HALFTHREAD) {
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 1;
+  } else if (neighflag == N2) {  // accepted here, although compute_style() raises an error for N2
+    neighbor->requests[irequest]->full = 0;
+    neighbor->requests[irequest]->half = 0;
+  } else {
+    error->all(FLERR,"Cannot use chosen neighbor list style with table/rx/kk");
+  }
+}
+
+template<class DeviceType>
+void PairTableRXKokkos<DeviceType>::cleanup_copy() {
+  // Null out (without freeing) the array and table pointers held by this copy and
+  // clear 'allocated'; this prevents the parent copy from deallocating any arrays.
+  h_table = NULL; d_table = NULL;
+  vatom = NULL; eatom = NULL;
+  cutsq = NULL;
+  allocated = 0;
+}
+
+namespace LAMMPS_NS {
+template class PairTableRXKokkos<LMPDeviceType>;  // explicit instantiation, always emitted
+#ifdef KOKKOS_HAVE_CUDA
+template class PairTableRXKokkos<LMPHostType>;  // emitted only when KOKKOS_HAVE_CUDA is defined
+#endif
+
+}
+
diff --git a/src/KOKKOS/pair_table_rx_kokkos.h b/src/KOKKOS/pair_table_rx_kokkos.h
new file mode 100644
index 0000000000..4230263dc9
--- /dev/null
+++ b/src/KOKKOS/pair_table_rx_kokkos.h
@@ -0,0 +1,122 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(table/rx/kk,PairTableRXKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/device,PairTableRXKokkos<LMPDeviceType>)
+PairStyle(table/rx/kk/host,PairTableRXKokkos<LMPHostType>)
+
+#else
+
+#ifndef LMP_PAIR_TABLE_RX_KOKKOS_H
+#define LMP_PAIR_TABLE_RX_KOKKOS_H
+
+#include "pair_table_kokkos.h"
+#include "kokkos_few.h"
+
+namespace LAMMPS_NS {
+
+template<class DeviceType>  // DeviceType: Kokkos device type this style is instantiated for
+class PairTableRXKokkos : public PairTable {  // Kokkos-enabled pair style registered as table/rx/kk (see PairStyle lines above)
+ public:
+  enum {EnabledNeighFlags=FULL|HALFTHREAD|HALF|N2};  // the same four neighbor flags init_style() accepts
+  typedef DeviceType device_type;
+
+  PairTableRXKokkos(class LAMMPS *);
+  virtual ~PairTableRXKokkos();
+
+  virtual void compute(int, int);
+
+  template<int TABSTYLE>
+  void compute_style(int, int);
+
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  virtual double single(int, int, int, int, double, double, double, double &);
+
+  void init_style();  // requests the neighbor list according to DeviceType and lmp->kokkos->neighflag
+
+  struct TableDeviceConst {  // same fields as TableDevice, but the 2d table arrays use the _randomread view type
+    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
+    typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d_randomread rsq,drsq,e,de,f,df,e2,f2;
+  };
+
+  struct TableDevice {  // table arrays typed for DeviceType
+    typename ArrayTypes<DeviceType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<DeviceType>::t_int_2d tabindex;
+    typename ArrayTypes<DeviceType>::t_int_1d nshiftbits,nmask;
+    typename ArrayTypes<DeviceType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<DeviceType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+  };
+
+  struct TableHost {  // same field list as TableDevice, typed for LMPHostType
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d cutsq;
+    typename ArrayTypes<LMPHostType>::t_int_2d tabindex;
+    typename ArrayTypes<LMPHostType>::t_int_1d nshiftbits,nmask;
+    typename ArrayTypes<LMPHostType>::t_ffloat_1d innersq,invdelta,deltasq6;
+    typename ArrayTypes<LMPHostType>::t_ffloat_2d rsq,drsq,e,de,f,df,e2,f2;
+  };
+
+  TableDeviceConst d_table_const;
+  TableDevice* d_table;  // reset to NULL by cleanup_copy()
+  TableHost* h_table;  // reset to NULL by cleanup_copy()
+
+  Few<Few<F_FLOAT, MAX_TYPES_STACKPARAMS+1>, MAX_TYPES_STACKPARAMS+1> m_cutsq;  // fixed-size (MAX_TYPES_STACKPARAMS+1)^2 array
+
+  typename ArrayTypes<DeviceType>::t_ffloat_2d d_cutsq;
+
+  virtual void allocate();
+  void compute_table(Table *);  // sets update_table, then calls PairTable::compute_table()
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
+  typename ArrayTypes<DeviceType>::t_f_array f;
+
+  int neighflag;  // copied from lmp->kokkos->neighflag in init_style()
+
+  int update_table;  // set to 1 by compute_table()
+  void create_kokkos_tables();
+  void cleanup_copy();  // clears allocated, cutsq, eatom, vatom, h_table and d_table on a copy
+
+  friend void pair_virial_fdotr_compute<PairTableRXKokkos>(PairTableRXKokkos*);
+
+  /* PairTableRX members */
+
+  Kokkos::View<double*, DeviceType> mixWtSite1old;  // NOTE(review): the four mixWt views look like per-atom site mixing weights -- confirm
+  Kokkos::View<double*, DeviceType> mixWtSite2old;
+  Kokkos::View<double*, DeviceType> mixWtSite1;
+  Kokkos::View<double*, DeviceType> mixWtSite2;
+
+  int nspecies;
+  char *site1, *site2;
+  int isite1, isite2;
+  bool fractionalWeighting;
+
+  typename ArrayTypes<DeviceType>::tdual_efloat_1d k_eatom;  // NOTE(review): presumably dual views backing eatom/vatom -- confirm
+  typename ArrayTypes<DeviceType>::tdual_virial_array k_vatom;
+  typename ArrayTypes<DeviceType>::t_efloat_1d d_eatom;
+  typename ArrayTypes<DeviceType>::t_virial_array d_vatom;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+ */
diff --git a/src/KOKKOS/rand_pool_wrap_kokkos.cpp b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
new file mode 100644
index 0000000000..c11764640b
--- /dev/null
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.cpp
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#include "comm.h"
+#include "rand_pool_wrap_kokkos.h"
+#include "lammps.h"
+#include "kokkos.h"
+#include "random_mars.h"
+#include "update.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+RandPoolWrap::RandPoolWrap(int, LAMMPS *lmp)
+  : Pointers(lmp),
+    random_thr(NULL),                     // no pool exists until init() builds one
+    nthreads(lmp->kokkos->num_threads)    // read again by init()
+{}
+
+/* ---------------------------------------------------------------------- */
+
+RandPoolWrap::~RandPoolWrap()
+{
+  // nothing is released here; the RNG pool is freed only by an explicit destroy() call
+}
+
+void RandPoolWrap::destroy()
+{
+  if (random_thr == NULL) return;   // no pool allocated, nothing to release
+
+  // entry 0 is skipped: only entries 1..nthreads-1 are deleted here
+  for (int tid = 1; tid < nthreads; ++tid)
+    delete random_thr[tid];
+  delete[] random_thr;
+  random_thr = NULL;
+}
+
+void RandPoolWrap::init(RanMars* random, int seed)
+{
+  // drop any pool left from an earlier call; destroy() walks the old
+  // nthreads value, frees entries 1..nthreads-1 and the pointer array
+  destroy();
+
+  // size the new pool from the current Kokkos thread count
+  nthreads = lmp->kokkos->num_threads;
+  random_thr = new RanMars*[nthreads];
+
+  // to ensure full compatibility with the serial style, thread 0
+  // shares the serial random number generator passed in by the caller
+  random_thr[0] = random;
+
+  // all threads != 0 get a generator of their own; offsetting the seed
+  // by comm->me and by comm->nprocs*tid keeps the seeds unique
+  const int me = comm->me;
+  const int nprocs = comm->nprocs;
+
+  for (int tid = 1; tid < nthreads; ++tid) {
+    const int thread_seed = seed + me + nprocs*tid;
+    random_thr[tid] = new RanMars(lmp, thread_seed);
+  }
+}
diff --git a/src/KOKKOS/rand_pool_wrap_kokkos.h b/src/KOKKOS/rand_pool_wrap_kokkos.h
new file mode 100644
index 0000000000..975ce0c89a
--- /dev/null
+++ b/src/KOKKOS/rand_pool_wrap_kokkos.h
@@ -0,0 +1,83 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifndef RAND_POOL_WRAP_H
+#define RAND_POOL_WRAP_H
+
+#include "pointers.h"
+#include "kokkos_type.h"
+#include "random_mars.h"
+#include "error.h"
+
+namespace LAMMPS_NS {
+
+// Thin handle that forwards random-number requests to a RanMars generator.
+// The pointer is assigned by RandPoolWrap::get_state(); it is never deleted here.
+struct RandWrap {
+  class RanMars* rng;   // generator to forward to; NULL until assigned
+                        // (drand/normal dereference it without a check)
+
+  // start without a generator
+  KOKKOS_INLINE_FUNCTION
+  RandWrap() : rng(NULL) {}
+
+  // forwards to rng->uniform()
+  KOKKOS_INLINE_FUNCTION
+  double drand() { return rng->uniform(); }
+
+  // forwards to rng->gaussian()
+  KOKKOS_INLINE_FUNCTION
+  double normal() { return rng->gaussian(); }
+};
+
+// Pool of RanMars generators with one slot per Kokkos thread (built by init()).
+class RandPoolWrap : protected Pointers {
+ public:
+  RandPoolWrap(int, class LAMMPS *);
+  ~RandPoolWrap();
+  void destroy();             // frees the pool built by init()
+  void init(RanMars*, int);   // (re)builds the pool from a serial RNG and a base seed
+
+  // hand out a RandWrap bound to the calling thread's generator;
+  // when KOKKOS_HAVE_CUDA is defined this calls error->all() first
+  KOKKOS_INLINE_FUNCTION
+  RandWrap get_state() const
+  {
+#ifdef KOKKOS_HAVE_CUDA
+    error->all(FLERR,"Cannot use Marsaglia RNG with GPUs");
+    const int tid = 0;
+#else
+    const int tid = LMPDeviceType::hardware_thread_id();
+#endif
+
+    RandWrap rand_wrap;
+    rand_wrap.rng = random_thr[tid];
+    return rand_wrap;
+  }
+
+  // counterpart of get_state(); there is nothing to give back
+  KOKKOS_INLINE_FUNCTION
+  void free_state(RandWrap) const {}
+
+ private:
+  class RanMars **random_thr;   // slot 0 = serial RNG, slots 1.. = per-thread RNGs
+  int nthreads;                 // number of slots in random_thr
+};
+
+}
+
+#endif
+
+/* ERROR/WARNING messages:
+
+*/
diff --git a/src/KOKKOS/region_block_kokkos.cpp b/src/KOKKOS/region_block_kokkos.cpp
index 90fd47ab06..eed4272f23 100644
--- a/src/KOKKOS/region_block_kokkos.cpp
+++ b/src/KOKKOS/region_block_kokkos.cpp
@@ -67,7 +67,6 @@ void RegBlockKokkos<DeviceType>::match_all_kokkos(int groupbit_in, DAT::tdual_in
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagRegBlockMatchAll>(0,nlocal),*this);
-  DeviceType::fence();
   copymode = 0;
 
   k_match_in.template modify<DeviceType>();
diff --git a/src/MAKE/MACHINES/Makefile.icex b/src/MAKE/MACHINES/Makefile.icex
new file mode 100644
index 0000000000..ceeec48870
--- /dev/null
+++ b/src/MAKE/MACHINES/Makefile.icex
@@ -0,0 +1,116 @@
+# icex = MPI with its default compiler
+
+SHELL = /bin/sh
+
+# ---------------------------------------------------------------------
+# compiler/linker settings
+# specify flags and libraries needed for your compiler
+
+CC =		mpicxx
+CCFLAGS =	-g -O3 -Wall -Wextra -frounding-math -fsignaling-nans -march=native
+SHFLAGS =	-shared -MD -mcmodel=medium -fpic -fPIC
+DEPFLAGS =	-M
+
+LINK =		mpicxx
+LINKFLAGS =	-g -O
+LIB = 
+SIZE =		size
+
+ARCHIVE =	ar
+ARFLAGS =	-rc
+SHLIBFLAGS =	-shared
+
+# ---------------------------------------------------------------------
+# LAMMPS-specific settings, all OPTIONAL
+# specify settings for LAMMPS features you will use
+# if you change any -D setting, do full re-compile after "make clean"
+
+# LAMMPS ifdef settings
+# see possible settings in Section 2.2 (step 4) of manual
+
+LMP_INC =	-DLAMMPS_GZIP
+#LMP_INC +=	-DLAMMPS_JPEG
+LMP_INC +=	-DLAMMPS_MEMALIGN=64
+
+# MPI library
+# see discussion in Section 2.2 (step 5) of manual
+# MPI wrapper compiler/linker can provide this info
+# can point to dummy MPI library in src/STUBS as in Makefile.serial
+# use -D MPICH and OMPI settings in INC to avoid C++ lib conflicts
+# INC = path for mpi.h, MPI compiler settings
+# PATH = path for MPI library
+# LIB = name of MPI library
+
+MPI_INC =       -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
+MPI_PATH = 
+MPI_LIB =	
+
+# FFT library
+# see discussion in Section 2.2 (step 6) of manual
+# can be left blank to use provided KISS FFT library
+# INC = -DFFT setting, e.g. -DFFT_FFTW, FFT compiler settings
+# PATH = path for FFT library
+# LIB = name of FFT library
+
+FFT_INC =    	
+FFT_PATH = 
+FFT_LIB =	
+
+# JPEG and/or PNG library
+# see discussion in Section 2.2 (step 7) of manual
+# only needed if -DLAMMPS_JPEG or -DLAMMPS_PNG listed with LMP_INC
+# INC = path(s) for jpeglib.h and/or png.h
+# PATH = path(s) for JPEG library and/or PNG library
+# LIB = name(s) of JPEG library and/or PNG library
+
+JPG_INC =       
+JPG_PATH = 	
+JPG_LIB =	
+
+# ---------------------------------------------------------------------
+# build rules and dependencies
+# do not edit this section
+
+include	Makefile.package.settings
+include	Makefile.package
+
+EXTRA_INC = $(LMP_INC) $(PKG_INC) $(MPI_INC) $(FFT_INC) $(JPG_INC) $(PKG_SYSINC)
+EXTRA_PATH = $(PKG_PATH) $(MPI_PATH) $(FFT_PATH) $(JPG_PATH) $(PKG_SYSPATH)
+EXTRA_LIB = $(PKG_LIB) $(MPI_LIB) $(FFT_LIB) $(JPG_LIB) $(PKG_SYSLIB)
+EXTRA_CPP_DEPENDS = $(PKG_CPP_DEPENDS)
+EXTRA_LINK_DEPENDS = $(PKG_LINK_DEPENDS)
+
+# Path to src files
+
+vpath %.cpp ..
+vpath %.h ..
+
+# Link target
+
+$(EXE):	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(LINK) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(EXTRA_LIB) $(LIB) -o $(EXE)
+	$(SIZE) $(EXE)
+
+# Library targets
+
+lib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(ARCHIVE) $(ARFLAGS) $(EXE) $(OBJ)
+
+shlib:	$(OBJ) $(EXTRA_LINK_DEPENDS)
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(SHLIBFLAGS) $(EXTRA_PATH) -o $(EXE) \
+        $(OBJ) $(EXTRA_LIB) $(LIB)
+
+# Compilation rules
+
+%.o:%.cpp
+	$(CC) $(CCFLAGS) $(SHFLAGS) $(EXTRA_INC) -c $<
+
+# Individual dependencies
+
+depend : fastdep.exe $(SRC)
+	@./fastdep.exe $(EXTRA_INC) -- $^ > .depend || exit 1
+
+fastdep.exe: ../DEPEND/fastdep.c
+	cc -O -o $@ $<
+
+sinclude .depend
diff --git a/src/MPIIO/dump_custom_mpiio.cpp b/src/MPIIO/dump_custom_mpiio.cpp
index 6e48bfa146..0b282b77ef 100644
--- a/src/MPIIO/dump_custom_mpiio.cpp
+++ b/src/MPIIO/dump_custom_mpiio.cpp
@@ -542,8 +542,8 @@ void DumpCustomMPIIO::write_string(int n, double *mybuf)
 
 #if defined(_OPENMP)
     int nthreads = omp_get_max_threads();
-    if (nthreads > 1)
-      nsme = convert_string_omp(n,mybuf);
+    if ((nthreads > 1) && !(lmp->kokkos))
+      nsme = convert_string_omp(n,mybuf); // not (yet) compatible with Kokkos
     else
       nsme = convert_string(n,mybuf);
 #else
diff --git a/src/Purge.list b/src/Purge.list
index 340b210b20..315e5e3424 100644
--- a/src/Purge.list
+++ b/src/Purge.list
@@ -16,6 +16,9 @@ style_region.h
 style_neigh_bin.h
 style_neigh_pair.h
 style_neigh_stencil.h
+# deleted on 5 September 2017
+npair_halffull_newton_ssa.cpp
+npair_halffull_newton_ssa.h
 # deleted on 6 June 2017
 pair_lj_sf.cpp
 pair_lj_sf.h
diff --git a/src/USER-DPD/fix_dpd_energy.cpp b/src/USER-DPD/fix_dpd_energy.cpp
index 05907a5fcf..475e12f02f 100644
--- a/src/USER-DPD/fix_dpd_energy.cpp
+++ b/src/USER-DPD/fix_dpd_energy.cpp
@@ -34,6 +34,8 @@ FixDPDenergy::FixDPDenergy(LAMMPS *lmp, int narg, char **arg) :
 
   pairDPDE = NULL;
   pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
 
   if (pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix dpd/energy");
diff --git a/src/USER-DPD/fix_eos_table_rx.cpp b/src/USER-DPD/fix_eos_table_rx.cpp
index b90c39ed4e..c9a705446a 100644
--- a/src/USER-DPD/fix_eos_table_rx.cpp
+++ b/src/USER-DPD/fix_eos_table_rx.cpp
@@ -150,6 +150,8 @@ FixEOStableRX::FixEOStableRX(LAMMPS *lmp, int narg, char **arg) :
 
 FixEOStableRX::~FixEOStableRX()
 {
+  if (copymode) return;
+
   for (int m = 0; m < ntables; m++) {
     free_table(&tables[m]);
     free_table(&tables2[m]);
diff --git a/src/USER-DPD/fix_rx.cpp b/src/USER-DPD/fix_rx.cpp
index 82bb44a24d..0d0ec2a75b 100644
--- a/src/USER-DPD/fix_rx.cpp
+++ b/src/USER-DPD/fix_rx.cpp
@@ -220,6 +220,9 @@ FixRX::FixRX(LAMMPS *lmp, int narg, char **arg) :
 
 FixRX::~FixRX()
 {
+  //printf("Inside FixRX::~FixRX copymode= %d\n", copymode);
+  if (copymode) return;
+
   // De-Allocate memory to prevent memory leak
   for (int ii = 0; ii < nreactions; ii++){
     delete [] stoich[ii];
@@ -370,11 +373,11 @@ void FixRX::post_constructor()
   newarg2[nspecies+3] = (char *) "ghost";
   newarg2[nspecies+4] = (char *) "yes";
 
-  modify->add_fix(nspecies+5,newarg);
+  modify->add_fix(nspecies+5,newarg,1);
   fix_species = (FixPropertyAtom *) modify->fix[modify->nfix-1];
   restartFlag = modify->fix[modify->nfix-1]->restart_reset;
 
-  modify->add_fix(nspecies+5,newarg2);
+  modify->add_fix(nspecies+5,newarg2,1);
   fix_species_old = (FixPropertyAtom *) modify->fix[modify->nfix-1];
 
   if(nspecies==0) error->all(FLERR,"There are no rx species specified.");
@@ -634,6 +637,9 @@ int FixRX::setmask()
 void FixRX::init()
 {
   pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
+
   if (pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt/energy with fix rx");
 
@@ -669,7 +675,17 @@ void FixRX::setup_pre_force(int vflag)
 
   if(restartFlag){
     restartFlag = 0;
-  } else {
+  }
+  else
+  {
+    int ode_counter[4] = {0};
+
+    UserRHSData userData;
+    userData.kFor = new double[nreactions];
+    userData.rxnRateLaw = new double[nreactions];
+
+    double *rwork = new double[8*nspecies];
+
     if(localTempFlag){
       int count = nlocal + (newton_pair ? nghost : 0);
       dpdThetaLocal = new double[count];
@@ -682,22 +698,27 @@ void FixRX::setup_pre_force(int vflag)
         tmp = atom->dvector[ispecies][id];
         atom->dvector[ispecies+nspecies][id] = tmp;
       }
+
     for (int i = 0; i < nlocal; i++)
       if (mask[i] & groupbit){
 
         // Set the reaction rate constants to zero:  no reactions occur at step 0
         for(int irxn=0;irxn<nreactions;irxn++)
-          kR[irxn] = 0.0;
+          userData.kFor[irxn] = 0.0;
 
         if (odeIntegrationFlag == ODE_LAMMPS_RK4)
-          rk4(i,NULL);
+          rk4(i, rwork, &userData);
         else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
-          rkf45(i,NULL);
+          rkf45(i, rwork, &userData, ode_counter);
       }
 
     // Communicate the updated momenta and velocities to all nodes
     comm->forward_comm_fix(this);
     if(localTempFlag) delete [] dpdThetaLocal;
+
+    delete [] userData.kFor;
+    delete [] userData.rxnRateLaw;
+    delete [] rwork;
   }
 }
 
@@ -705,12 +726,13 @@ void FixRX::setup_pre_force(int vflag)
 
 void FixRX::pre_force(int vflag)
 {
+  TimerType timer_start = getTimeStamp();
+
   int nlocal = atom->nlocal;
   int nghost = atom->nghost;
   int *mask = atom->mask;
   double *dpdTheta = atom->dpdTheta;
   int newton_pair = force->newton_pair;
-  double theta;
 
   if(localTempFlag){
     int count = nlocal + (newton_pair ? nghost : 0);
@@ -722,7 +744,10 @@ void FixRX::pre_force(int vflag)
   TimerType timer_localTemperature = getTimeStamp();
 
   // Zero the counters for the ODE solvers.
-  this->nSteps = this->nIters = this->nFuncs = this->nFails = 0;
+  int nSteps = 0;
+  int nIters = 0;
+  int nFuncs = 0;
+  int nFails = 0;
 
   if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
   {
@@ -730,35 +755,66 @@ void FixRX::pre_force(int vflag)
     memory->create( diagnosticCounterPerODE[FuncSum], nlocal, "FixRX::diagnosticCounterPerODE");
   }
 
-  double *rwork = new double[8*nspecies + nreactions];
+  //#pragma omp parallel \
+  //   reduction(+: nSteps, nIters, nFuncs, nFails )
+  {
+    double *rwork = new double[8*nspecies];
 
-  for (int i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit){
-      if (localTempFlag)
-        theta = dpdThetaLocal[i];
-      else
-        theta = dpdTheta[i];
+    UserRHSData userData;
+    userData.kFor = new double[nreactions];
+    userData.rxnRateLaw = new double[nreactions];
 
-      //Compute the reaction rate constants
-      for (int irxn = 0; irxn < nreactions; irxn++)
-        kR[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/force->boltz/theta);
+    int ode_counter[4] = { 0 };
 
-      if (odeIntegrationFlag == ODE_LAMMPS_RK4)
-        rk4(i,rwork);
-      else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
-        rkf45(i,rwork);
+    //#pragma omp for schedule(runtime)
+    for (int i = 0; i < nlocal; i++)
+    {
+      if (mask[i] & groupbit)
+      {
+        double theta;
+        if (localTempFlag)
+          theta = dpdThetaLocal[i];
+        else
+          theta = dpdTheta[i];
+
+        //Compute the reaction rate constants
+        for (int irxn = 0; irxn < nreactions; irxn++)
+          userData.kFor[irxn] = Arr[irxn]*pow(theta,nArr[irxn])*exp(-Ea[irxn]/force->boltz/theta);
+
+        if (odeIntegrationFlag == ODE_LAMMPS_RK4)
+          rk4(i, rwork, &userData);
+        else if (odeIntegrationFlag == ODE_LAMMPS_RKF45)
+          rkf45(i, rwork, &userData, ode_counter);
+      }
     }
 
-  TimerType timer_ODE = getTimeStamp();
+    nSteps += ode_counter[0];
+    nIters += ode_counter[1];
+    nFuncs += ode_counter[2];
+    nFails += ode_counter[3];
 
-  delete [] rwork;
+    delete [] rwork;
+    delete [] userData.kFor;
+    delete [] userData.rxnRateLaw;
+
+  } // end parallel region
+
+  TimerType timer_ODE = getTimeStamp();
 
   // Communicate the updated momenta and velocities to all nodes
   comm->forward_comm_fix(this);
   if(localTempFlag) delete [] dpdThetaLocal;
 
+  TimerType timer_stop = getTimeStamp();
+
   double time_ODE = getElapsedTime(timer_localTemperature, timer_ODE);
 
+  //printf("me= %d total= %g temp= %g ode= %g comm= %g nlocal= %d nfc= %d %d\n", comm->me,
+  //                       getElapsedTime(timer_start, timer_stop),
+  //                       getElapsedTime(timer_start, timer_localTemperature),
+  //                       getElapsedTime(timer_localTemperature, timer_ODE),
+  //                       getElapsedTime(timer_ODE, timer_stop), nlocal, nFuncs, nSteps);
+
   // Warn the user if a failure was detected in the ODE solver.
   if (nFails > 0){
     char sbuf[128];
@@ -954,21 +1010,15 @@ void FixRX::setupParams()
 
 /* ---------------------------------------------------------------------- */
 
-void FixRX::rk4(int id, double *rwork)
+void FixRX::rk4(int id, double *rwork, void* v_params)
 {
-  double *k1 = NULL;
-  if (rwork == NULL)
-    k1 = new double[6*nspecies + nreactions];
-  else
-    k1 = rwork;
+  double *k1 = rwork;
   double *k2 = k1 + nspecies;
   double *k3 = k2 + nspecies;
   double *k4 = k3 + nspecies;
   double *y  = k4 + nspecies;
   double *yp = y  + nspecies;
 
-  double *dummyArray = yp + nspecies; // Passed to the rhs function.
-
   const int numSteps = minSteps;
 
   const double h = update->dt / double(numSteps);
@@ -985,25 +1035,25 @@ void FixRX::rk4(int id, double *rwork)
   for (int step = 0; step < numSteps; step++)
   {
     // k1
-    rhs(0.0,y,k1,dummyArray);
+    rhs(0.0,y,k1,v_params);
 
     // k2
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       yp[ispecies] = y[ispecies] + 0.5*h*k1[ispecies];
 
-    rhs(0.0,yp,k2,dummyArray);
+    rhs(0.0,yp,k2,v_params);
 
     // k3
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       yp[ispecies] = y[ispecies] + 0.5*h*k2[ispecies];
 
-    rhs(0.0,yp,k3,dummyArray);
+    rhs(0.0,yp,k3,v_params);
 
     // k4
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       yp[ispecies] = y[ispecies] + h*k3[ispecies];
 
-    rhs(0.0,yp,k4,dummyArray);
+    rhs(0.0,yp,k4,v_params);
 
     for (int ispecies = 0; ispecies < nspecies; ispecies++)
       y[ispecies] += h*(k1[ispecies]/6.0 + k2[ispecies]/3.0 + k3[ispecies]/3.0 + k4[ispecies]/6.0);
@@ -1018,9 +1068,6 @@ void FixRX::rk4(int id, double *rwork)
       y[ispecies] = 0.0;
     atom->dvector[ispecies][id] = y[ispecies];
   }
-
-  if (rwork == NULL)
-    delete [] k1;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1270,6 +1317,78 @@ void FixRX::odeDiagnostics(void)
   double max_per_proc[numCounters];
   double min_per_proc[numCounters];
 
+  if(1)
+  {
+     static bool firstStep = true;
+
+     static TimerType oldTimeStamp (-1);
+
+     TimerType now = getTimeStamp();
+
+     // Query the fix database and look for rx_weight for the balance fix.
+     int type_flag = -1;
+     int rx_weight_index = atom->find_custom( "rx_weight", /*0:int, 1:float*/ type_flag );
+
+     // Compute the average # of neighbors.
+     double averageNumNeighbors = 0;
+     {
+        const int inum = pairDPDE->list->inum;
+        const int* ilist = pairDPDE->list->ilist;
+        const int* numneigh = pairDPDE->list->numneigh;
+
+        for (int ii = 0; ii < inum; ++ii)
+        {
+           const int i = ilist[ii];
+           averageNumNeighbors += numneigh[i];
+        }
+
+        averageNumNeighbors /= inum;
+     }
+
+     printf("me= %d nst= %g nfc= %g time= %g nlocal= %g lmpnst= %g weight_idx= %d 1st= %d aveNeigh= %g\n", comm->me, this->diagnosticCounter[0], this->diagnosticCounter[1], this->diagnosticCounter[2], this->diagnosticCounter[3], this->diagnosticCounter[4], rx_weight_index, firstStep, averageNumNeighbors);
+
+     if (rx_weight_index != -1 && !firstStep && 0)
+     {
+        double *rx_weight = atom->dvector[rx_weight_index];
+
+        const int nlocal = atom->nlocal;
+        const int *mask = atom->mask;
+
+        if (odeIntegrationFlag == ODE_LAMMPS_RKF45 && diagnosticFrequency == 1)
+        {
+          const double total_time = getElapsedTime( oldTimeStamp, now );
+          const double fixrx_time = this->diagnosticCounter[TimeSum];
+          const double time_ratio = fixrx_time / total_time;
+
+          double tsum = 0.0;
+          double tmin = 100000, tmax = 0;
+          for (int i = 0; i < nlocal; ++i)
+            if (mask[i] & groupbit)
+            {
+              double nfunc_ratio = double( diagnosticCounterPerODE[FuncSum][i] ) / diagnosticCounter[FuncSum];
+              rx_weight[i] = nfunc_ratio * fixrx_time + (total_time - fixrx_time) / nlocal;
+              tmin = fmin( tmin, rx_weight[i] );
+              tmax = fmax( tmax, rx_weight[i] );
+              tsum += rx_weight[i];
+              //rx_weight[i] = (double) diagnosticCounterPerODE[FuncSum][i];
+            }
+
+          printf("me= %d total= %g fixrx= %g ratio= %g tsum= %g %g %g %g\n", comm->me, total_time, fixrx_time, time_ratio, tsum, (total_time - fixrx_time) / nlocal, tmin, tmax);
+        }
+        else
+        {
+          error->warning(FLERR, "Dynamic load balancing enabled but per-atom weights not available.");
+
+          for (int i = 0; i < nlocal; ++i)
+            if (mask[i] & groupbit)
+              rx_weight[i] = 1.0;
+        }
+     }
+
+     firstStep = false;
+     oldTimeStamp = now;
+  }
+
   // Compute counters per dpd time-step.
   for (int i = 0; i < numCounters; ++i){
     my_vals[i] = this->diagnosticCounter[i] / nTimes;
@@ -1343,7 +1462,7 @@ void FixRX::odeDiagnostics(void)
     if (screen)  fprintf(screen,"%s\n", smesg); \
     if (logfile) fprintf(logfile,"%s\n", smesg); }
 
-    sprintf(smesg, "FixRX::ODE Diagnostics:  # of steps  |# of rhs evals| run-time (sec)");
+    sprintf(smesg, "FixRX::ODE Diagnostics:  # of iters  |# of rhs evals| run-time (sec) | # atoms");
     print_mesg(smesg);
 
     sprintf(smesg, "         AVG per ODE  : %-12.5g | %-12.5g | %-12.5g", avg_per_atom[0], avg_per_atom[1], avg_per_atom[2]);
@@ -1365,7 +1484,7 @@ void FixRX::odeDiagnostics(void)
       print_mesg(smesg);
     }
 
-    sprintf(smesg, "         AVG per Proc : %-12.5g | %-12.5g | %-12.5g", avg_per_proc[0], avg_per_proc[1], avg_per_proc[2]);
+    sprintf(smesg, "         AVG per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", avg_per_proc[StepSum], avg_per_proc[FuncSum], avg_per_proc[TimeSum], avg_per_proc[AtomSum]);
     print_mesg(smesg);
 
     if (comm->nprocs > 1){
@@ -1373,13 +1492,13 @@ void FixRX::odeDiagnostics(void)
       for (int i = 0; i < numCounters; ++i)
         rms_per_proc[i] = sqrt( sum_sq[i] / comm->nprocs );
 
-      sprintf(smesg, "         RMS per Proc : %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2]);
+      sprintf(smesg, "         RMS per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", rms_per_proc[0], rms_per_proc[1], rms_per_proc[2], rms_per_proc[AtomSum]);
       print_mesg(smesg);
 
-      sprintf(smesg, "         MAX per Proc : %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2]);
+      sprintf(smesg, "         MAX per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", max_per_proc[0], max_per_proc[1], max_per_proc[2], max_per_proc[AtomSum]);
       print_mesg(smesg);
 
-      sprintf(smesg, "         MIN per Proc : %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2]);
+      sprintf(smesg, "         MIN per Proc : %-12.5g | %-12.5g | %-12.5g | %-12.5g", min_per_proc[0], min_per_proc[1], min_per_proc[2], min_per_proc[AtomSum]);
       print_mesg(smesg);
     }
 
@@ -1399,7 +1518,7 @@ void FixRX::odeDiagnostics(void)
   return;
 }
 
-void FixRX::rkf45(int id, double *rwork)
+void FixRX::rkf45(int id, double *rwork, void *v_param, int ode_counter[])
 {
   // Rounding coefficient.
   const double uround = DBL_EPSILON;
@@ -1408,12 +1527,7 @@ void FixRX::rkf45(int id, double *rwork)
   const double adaption_limit = 4.0;
 
   //double *y = new double[8*nspecies + nreactions];
-  double *y = NULL;
-  if (rwork == NULL)
-    y = new double[8*nspecies + nreactions];
-  else
-    y = rwork;
-  double *rhstmp = y + 8*nspecies;
+  double *y = rwork;
 
   const int neq = nspecies;
 
@@ -1450,7 +1564,7 @@ void FixRX::rkf45(int id, double *rwork)
   if (h < h_min){
     //fprintf(stderr,"hin not implemented yet\n");
     //exit(-1);
-    nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, y + neq, rhstmp);
+    nfe = rkf45_h0 (neq, t, t_stop, h_min, h_max, h, y, y + neq, v_param);
   }
 
   //printf("t= %e t_stop= %e h= %e\n", t, t_stop, h);
@@ -1461,7 +1575,7 @@ void FixRX::rkf45(int id, double *rwork)
     double *eout = yout + neq;
 
     // Take a trial step.
-    rkf45_step (neq, h, y, yout, eout, rhstmp);
+    rkf45_step (neq, h, y, yout, eout, v_param);
 
     // Estimate the solution error.
       // ... weighted 2-norm of the error.
@@ -1509,16 +1623,17 @@ void FixRX::rkf45(int id, double *rwork)
 
     if (maxIters && nit > maxIters){
       //fprintf(stderr,"atom[%d] took too many iterations in rkf45 %d %e %e\n", id, nit, t, t_stop);
-      nFails ++;
+      //nFails ++;
+      ode_counter[3] ++;
       break;
       // We should set an error here so that the solution is not used!
     }
 
   } // end while
 
-  nSteps += nst;
-  nIters += nit;
-  nFuncs += nfe;
+  ode_counter[0] += nst;
+  ode_counter[1] += nit;
+  ode_counter[2] += nfe;
 
   //if (diagnosticFrequency == 1 && diagnosticCounterPerODE[StepSum] != NULL)
   if (diagnosticCounterPerODE[StepSum] != NULL){
@@ -1535,9 +1650,6 @@ void FixRX::rkf45(int id, double *rwork)
       y[ispecies] = 0.0;
     atom->dvector[ispecies][id] = y[ispecies];
   }
-
-  if (rwork == NULL)
-    delete [] y;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -1555,21 +1667,23 @@ int FixRX::rhs(double t, const double *y, double *dydt, void *params)
 
 int FixRX::rhs_dense(double t, const double *y, double *dydt, void *params)
 {
-  double rxnRateLawForward;
-  double *rxnRateLaw = (double *) params;
-  double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
-  double concentration;
-  int nspecies = atom->nspecies_dpd;
+  UserRHSData *userData = (UserRHSData *) params;
+
+  double *rxnRateLaw = userData->rxnRateLaw;
+  double *kFor       = userData->kFor;
+
+  const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
+  const int nspecies = atom->nspecies_dpd;
 
   for(int ispecies=0; ispecies<nspecies; ispecies++)
     dydt[ispecies] = 0.0;
 
   // Construct the reaction rate laws
   for(int jrxn=0; jrxn<nreactions; jrxn++){
-    rxnRateLawForward = kR[jrxn];
+    double rxnRateLawForward = kFor[jrxn];
 
     for(int ispecies=0; ispecies<nspecies; ispecies++){
-      concentration = y[ispecies]/VDPD;
+      const double concentration = y[ispecies]/VDPD;
       rxnRateLawForward *= pow(concentration,stoichReactants[jrxn][ispecies]);
     }
     rxnRateLaw[jrxn] = rxnRateLawForward;
@@ -1587,13 +1701,13 @@ int FixRX::rhs_dense(double t, const double *y, double *dydt, void *params)
 
 int FixRX::rhs_sparse(double t, const double *y, double *dydt, void *v_params) const
 {
-   double *_rxnRateLaw = (double *) v_params;
+   UserRHSData *userData = (UserRHSData *) v_params;
 
    const double VDPD = domain->xprd * domain->yprd * domain->zprd / atom->natoms;
 
-   #define kFor         (this->kR)
+   #define kFor         (userData->kFor)
    #define kRev         (NULL)
-   #define rxnRateLaw   (_rxnRateLaw)
+   #define rxnRateLaw   (userData->rxnRateLaw)
    #define conc         (dydt)
    #define maxReactants (this->sparseKinetics_maxReactants)
    #define maxSpecies   (this->sparseKinetics_maxSpecies)
diff --git a/src/USER-DPD/fix_rx.h b/src/USER-DPD/fix_rx.h
index 35998963e2..ca87fc51fd 100644
--- a/src/USER-DPD/fix_rx.h
+++ b/src/USER-DPD/fix_rx.h
@@ -66,19 +66,19 @@ class FixRX : public Fix {
   double *kR;
 
   //!< Classic Runge-Kutta 4th-order stepper.
-  void rk4(int,double*);
+  void rk4(int, double*, void*);
 
   //!< Runge-Kutta-Fehlberg ODE Solver.
-  void rkf45(int,double*);
+  void rkf45(int, double*, void*, int ode_counter[]);
 
   //!< Runge-Kutta-Fehlberg ODE stepper function.
   void rkf45_step (const int neq, const double h, double y[], double y_out[],
-                   double rwk[], void* v_param);
+                   double rwk[], void *);
 
   //!< Initial step size estimation for the Runge-Kutta-Fehlberg ODE solver.
   int rkf45_h0 (const int neq, const double t, const double t_stop,
                      const double hmin, const double hmax,
-                     double& h0, double y[], double rwk[], void* v_params);
+                     double& h0, double y[], double rwk[], void *v_params);
 
   class PairDPDfdtEnergy *pairDPDE;
   double *dpdThetaLocal;
@@ -90,6 +90,13 @@ class FixRX : public Fix {
   int rhs(double, const double *, double *, void *);
   int rhs_dense (double, const double *, double *, void *);
 
+  // User-defined data container handed to rhs_dense()/rhs_sparse() through their void* argument.
+  struct UserRHSData
+  {
+    double *kFor;       // forward rate constant of each reaction (read as kFor[jrxn])
+    double *rxnRateLaw; // per-reaction work array; rhs_dense stores each computed rate law here
+  };
+
   // Sparse stoichiometric matrix storage format and methods.
   bool useSparseKinetics;
   //SparseKinetics sparseKinetics;
@@ -116,10 +123,10 @@ class FixRX : public Fix {
   double relTol, absTol; //!< Relative and absolute tolerances for the ODE solver(s).
 
   // ODE Diagnostics
-  int nSteps; //!< # of accepted steps taken over all atoms.
-  int nIters; //!< # of attemped steps for all atoms.
-  int nFuncs; //!< # of RHS evaluations for all atoms.
-  int nFails; //!< # of ODE systems that failed (for some reason).
+  //int nSteps; //!< # of accepted steps taken over all atoms.
+  //int nIters; //!< # of attempted steps for all atoms.
+  //int nFuncs; //!< # of RHS evaluations for all atoms.
+  //int nFails; //!< # of ODE systems that failed (for some reason).
 
   int diagnosticFrequency; //!< Frequency (LMP steps) that run-time diagnostics will be printed to the log.
   enum { numDiagnosticCounters = 5 };
diff --git a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp
index d14c1e0ddb..cec53ab15f 100644
--- a/src/USER-DPD/fix_shardlow.cpp
+++ b/src/USER-DPD/fix_shardlow.cpp
@@ -55,6 +55,7 @@
 #include "pair_dpd_fdt.h"
 #include "pair_dpd_fdt_energy.h"
 #include "pair.h"
+#include "npair_half_bin_newton_ssa.h"
 #include "citeme.h"
 
 using namespace LAMMPS_NS;
@@ -95,6 +96,8 @@ FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) :
   pairDPDE = NULL;
   pairDPD = (PairDPDfdt *) force->pair_match("dpd/fdt",1);
   pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy",1);
+  if (pairDPDE == NULL)
+    pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1);
 
   if(pairDPDE){
     comm_forward = 3;
@@ -107,26 +110,12 @@ FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) :
   if(pairDPD == NULL && pairDPDE == NULL)
     error->all(FLERR,"Must use pair_style dpd/fdt or dpd/fdt/energy with fix shardlow");
 
-  // Setup the ssaAIR array
-  atom->ssaAIR = NULL;
-  grow_arrays(atom->nmax);
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-
-  // Setup callbacks for maintaining atom->ssaAIR[]
-  atom->add_callback(0); // grow (aka exchange)
-  atom->add_callback(1); // restart
-  atom->add_callback(2); // border
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixShardlow::~FixShardlow()
 {
-  atom->delete_callback(id, 0);
-  atom->delete_callback(id, 1);
-  atom->delete_callback(id, 2);
-
-  memory->destroy(atom->ssaAIR);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -135,7 +124,6 @@ int FixShardlow::setmask()
 {
   int mask = 0;
   mask |= INITIAL_INTEGRATE;
-  mask |= PRE_EXCHANGE | MIN_PRE_EXCHANGE;
   return mask;
 }
 
@@ -144,9 +132,11 @@ int FixShardlow::setmask()
 void FixShardlow::init()
 {
   int irequest = neighbor->request(this,instance_me);
-  neighbor->requests[irequest]->pair = 0;
-  neighbor->requests[irequest]->fix  = 1;
-  neighbor->requests[irequest]->ssa  = 1;
+  neighbor->requests[irequest]->pair   = 0;
+  neighbor->requests[irequest]->fix    = 1;
+  neighbor->requests[irequest]->ghost  = 1;
+  neighbor->requests[irequest]->ssa    = 1;
+  neighbor->requests[irequest]->newton = 1; // SSA requires newton on
 }
 
 /* ---------------------------------------------------------------------- */
@@ -158,27 +148,6 @@ void FixShardlow::init_list(int id, NeighList *ptr)
 
 /* ---------------------------------------------------------------------- */
 
-void FixShardlow::pre_exchange()
-{
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShardlow::setup_pre_exchange()
-{
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void FixShardlow::min_pre_exchange()
-{
-  memset(atom->ssaAIR, 0, sizeof(int)*atom->nlocal);
-}
-
-/* ---------------------------------------------------------------------- */
-
 void FixShardlow::setup(int vflag)
 {
   bool fixShardlow = false;
@@ -243,6 +212,10 @@ void FixShardlow::ssa_update_dpd(
   const double mass_i = (rmass) ? rmass[i] : mass[itype];
   const double massinv_i = 1.0 / mass_i;
 
+#ifdef DEBUG_SSA_PAIR_CT
+  const int nlocal = atom->nlocal;
+#endif
+
   // Loop over Directional Neighbors only
   for (int jj = 0; jj < jlen; jj++) {
     int j = jlist[jj] & NEIGHMASK;
@@ -252,9 +225,23 @@ void FixShardlow::ssa_update_dpd(
     double dely = ytmp - x[j][1];
     double delz = ztmp - x[j][2];
     double rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_SSA_PAIR_CT
+    if ((i < nlocal) && (j < nlocal)) ++(counters[0][0]);
+    else ++(counters[0][1]);
+    ++(counters[0][2]);
+    int rsqi = rsq / 8;
+    if (rsqi < 0) rsqi = 0;
+    else if (rsqi > 31) rsqi = 31;
+    ++(hist[rsqi]);
+#endif
 
     // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
     if ((rsq < cut2_i[jtype]) && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_SSA_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) ++(counters[1][0]);
+      else ++(counters[1][1]);
+      ++(counters[1][2]);
+#endif
       double r = sqrt(rsq);
       double rinv = 1.0/r;
       double delx_rinv = delx*rinv;
@@ -382,6 +369,10 @@ void FixShardlow::ssa_update_dpde(
   const double massinv_i = 1.0 / mass_i;
   const double mass_i_div_neg4_ftm2v = mass_i*(-0.25)/ftm2v;
 
+#ifdef DEBUG_SSA_PAIR_CT
+  const int nlocal = atom->nlocal;
+#endif
+
   // Loop over Directional Neighbors only
   for (int jj = 0; jj < jlen; jj++) {
     int j = jlist[jj] & NEIGHMASK;
@@ -391,9 +382,23 @@ void FixShardlow::ssa_update_dpde(
     double dely = ytmp - x[j][1];
     double delz = ztmp - x[j][2];
     double rsq = delx*delx + dely*dely + delz*delz;
+#ifdef DEBUG_SSA_PAIR_CT
+    if ((i < nlocal) && (j < nlocal)) ++(counters[0][0]);
+    else ++(counters[0][1]);
+    ++(counters[0][2]);
+    int rsqi = rsq / 8;
+    if (rsqi < 0) rsqi = 0;
+    else if (rsqi > 31) rsqi = 31;
+    ++(hist[rsqi]);
+#endif
 
     // NOTE: r can be 0.0 in DPD systems, so do EPSILON_SQUARED test
     if ((rsq < cut2_i[jtype]) && (rsq >= EPSILON_SQUARED)) {
+#ifdef DEBUG_SSA_PAIR_CT
+      if ((i < nlocal) && (j < nlocal)) ++(counters[1][0]);
+      else ++(counters[1][1]);
+      ++(counters[1][2]);
+#endif
       double r = sqrt(rsq);
       double rinv = 1.0/r;
       double delx_rinv = delx*rinv;
@@ -518,7 +523,19 @@ void FixShardlow::initial_integrate(int vflag)
     error->all(FLERR,"Fix shardlow does not yet support triclinic geometries");
 
   if(rcut >= bbx || rcut >= bby || rcut>= bbz )
-    error->all(FLERR,"Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin\n");
+  { // format into a fixed stack buffer: bounded write, no unchecked malloc, nothing to free
+    char fmt[] = {"Shardlow algorithm requires sub-domain length > 2*(rcut+skin). Either reduce the number of processors requested, or change the cutoff/skin: rcut= %e bbx= %e bby= %e bbz= %e\n"};
+    char msg[sizeof(fmt) + 4*32]; // size known at compile time; a double printed with %e needs at most 14 chars
+    snprintf(msg, sizeof(msg), fmt, rcut, bbx, bby, bbz);
+    error->one(FLERR, msg); // values differ per processor, so report from the rank that sees the problem
+  }
+
+#ifdef DEBUG_SSA_PAIR_CT
+  for (int i = 0; i < 2; ++i)
+    for (int j = 0; j < 3; ++j)
+      counters[i][j] = 0;
+  for (int i = 0; i < 32; ++i) hist[i] = 0;
+#endif
 
   // Allocate memory for v_t0 to hold the initial velocities for the ghosts
   v_t0 = (double (*)[3]) memory->smalloc(sizeof(double)*3*nghost, "FixShardlow:v_t0");
@@ -528,36 +545,69 @@ void FixShardlow::initial_integrate(int vflag)
 
   dtsqrt = sqrt(update->dt);
 
-  //Loop over all 14 directions (8 stages)
-  for (airnum = 1; airnum <=8; airnum++){
+  NPairHalfBinNewtonSSA *np_ssa = dynamic_cast<NPairHalfBinNewtonSSA*>(list->np);
+  if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairHalfBinNewtonSSA object");
+  int ssa_phaseCt = np_ssa->ssa_phaseCt;
+  int *ssa_phaseLen = np_ssa->ssa_phaseLen;
+  int **ssa_itemLoc = np_ssa->ssa_itemLoc;
+  int **ssa_itemLen = np_ssa->ssa_itemLen;
 
-    if (airnum > 1) {
-      // Communicate the updated velocities to all nodes
-      comm->forward_comm_fix(this);
+  // process neighbors in the local AIR
+  for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) {
+    int workItemCt = ssa_phaseLen[workPhase];
 
-      if(useDPDE){
-        // Zero out the ghosts' uCond & uMech to be used as delta accumulators
-        memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
-        memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
+    for (int workItem = 0; workItem < workItemCt; ++workItem) {
+      int ct = ssa_itemLen[workPhase][workItem];
+      ii = ssa_itemLoc[workPhase][workItem];
+
+      while (ct-- > 0) {
+        int len = list->numneigh[ii];
+        if (len > 0) {
+          if (useDPDE) ssa_update_dpde(ilist[ii], list->firstneigh[ii], len);
+          else ssa_update_dpd(ilist[ii], list->firstneigh[ii], len);
+        }
+        ii++;
       }
     }
+  }
 
-    // Loop over neighbors of my atoms
-    for (ii = 0; ii < inum; ii++) {
-      i = ilist[ii];
-      int start = (airnum < 2) ? 0 : list->ndxAIR_ssa[i][airnum - 2];
-      int len = list->ndxAIR_ssa[i][airnum - 1] - start;
-      if (len > 0) {
-        if (useDPDE) ssa_update_dpde(i, &(list->firstneigh[i][start]), len);
-        else ssa_update_dpd(i, &(list->firstneigh[i][start]), len);
-      }
+  ii = inum;
+  //Loop over all 13 outward directions (7 stages)
+  for (airnum = 1; airnum <=7; airnum++){
+    int ct = list->AIRct_ssa[airnum];
+
+    // Communicate the updated velocities to all nodes
+    comm->forward_comm_fix(this);
+
+    if(useDPDE){
+      // Zero out the ghosts' uCond & uMech to be used as delta accumulators
+      memset(&(atom->uCond[nlocal]), 0, sizeof(double)*nghost);
+      memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost);
+    }
+
+    // process neighbors in this AIR
+    while (ct-- > 0) {
+      int len = list->numneigh[ii];
+      if (useDPDE) ssa_update_dpde(ilist[ii], list->firstneigh[ii], len);
+      else ssa_update_dpd(ilist[ii], list->firstneigh[ii], len);
+      ii++;
     }
 
     // Communicate the ghost deltas to the atom owners
-    if (airnum > 1) comm->reverse_comm_fix(this);
+    comm->reverse_comm_fix(this);
 
   }  //End Loop over all directions For airnum = Top, Top-Right, Right, Bottom-Right, Back
 
+#ifdef DEBUG_SSA_PAIR_CT
+for (int i = 0; i < 32; ++i) fprintf(stdout, "%8d", hist[i]);
+fprintf(stdout, "\n%6d %6d,%6d %6d: "
+  ,counters[0][2]
+  ,counters[1][2]
+  ,counters[0][1]
+  ,counters[1][1]
+);
+#endif
+
   memory->sfree(v_t0);
   v_t0 = NULL;
 }
@@ -643,91 +693,11 @@ void FixShardlow::unpack_reverse_comm(int n, int *list, double *buf)
   }
 }
 
-/* ----------------------------------------------------------------------
-   convert atom coords into the ssa active interaction region number
-------------------------------------------------------------------------- */
-
-int FixShardlow::coord2ssaAIR(double *x)
-{
-  int ix, iy, iz;
-
-  ix = iy = iz = 0;
-  if (x[2] < domain->sublo[2]) iz = -1;
-  if (x[2] >= domain->subhi[2]) iz = 1;
-  if (x[1] < domain->sublo[1]) iy = -1;
-  if (x[1] >= domain->subhi[1]) iy = 1;
-  if (x[0] < domain->sublo[0]) ix = -1;
-  if (x[0] >= domain->subhi[0]) ix = 1;
-
-  if(iz < 0){
-    return -1;
-  } else if(iz == 0){
-    if( iy<0 ) return -1; // bottom left/middle/right
-    if( (iy==0) && (ix<0)  ) return -1; // left atoms
-    if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
-    if( (iy==0) && (ix>0)  ) return 3; // Right atoms
-    if( (iy>0)  && (ix==0) ) return 2; // Top-middle atoms
-    if( (iy>0)  && (ix!=0) ) return 4; // Top-right and top-left atoms
-  } else { // iz > 0
-    if((ix==0) && (iy==0)) return 5; // Back atoms
-    if((ix==0) && (iy!=0)) return 6; // Top-back and bottom-back atoms
-    if((ix!=0) && (iy==0)) return 7; // Left-back and right-back atoms
-    if((ix!=0) && (iy!=0)) return 8; // Back corner atoms
-  }
-
-  return -2;
-}
-
 /* ---------------------------------------------------------------------- */
 
-void FixShardlow::grow_arrays(int nmax)
-{
-  memory->grow(atom->ssaAIR,nmax,"fix_shardlow:ssaAIR");
-}
-
-void FixShardlow::copy_arrays(int i, int j, int delflag)
-{
-  atom->ssaAIR[j] = atom->ssaAIR[i];
-}
-
-void FixShardlow::set_arrays(int i)
-{
-  atom->ssaAIR[i] = 0; /* coord2ssaAIR(x[i]) */
-}
-
-int FixShardlow::pack_border(int n, int *list, double *buf)
-{
-  for (int i = 0; i < n; i++) {
-    int j = list[i];
-    if (atom->ssaAIR[j] == 0) atom->ssaAIR[j] = 1; // not purely local anymore
-  }
-  return 0;
-}
-
-int FixShardlow::unpack_border(int n, int first, double *buf)
-{
-  int i,last = first + n;
-  for (i = first; i < last; i++) {
-    atom->ssaAIR[i] = coord2ssaAIR(atom->x[i]);
-  }
-  return 0;
-}
-
-int FixShardlow::unpack_exchange(int i, double *buf)
-{
-  atom->ssaAIR[i] = 0; /* coord2ssaAIR(x[i]) */
-  return 0;
-}
-
-void FixShardlow::unpack_restart(int i, int nth)
-{
-  atom->ssaAIR[i] = 0; /* coord2ssaAIR(x[i]) */
-}
-
 double FixShardlow::memory_usage()
 {
   double bytes = 0.0;
-  bytes += memory->usage(atom->ssaAIR,atom->nmax);
   bytes += sizeof(double)*3*atom->nghost; // v_t0[]
   return bytes;
 }
diff --git a/src/USER-DPD/fix_shardlow.h b/src/USER-DPD/fix_shardlow.h
index 2ffb96ae7c..e8e5f484a0 100644
--- a/src/USER-DPD/fix_shardlow.h
+++ b/src/USER-DPD/fix_shardlow.h
@@ -35,21 +35,14 @@ class FixShardlow : public Fix {
   virtual void init_list(int, class NeighList *);
   virtual void setup(int);
   virtual void initial_integrate(int);
-  void setup_pre_exchange();
-  void pre_exchange();
-  void min_pre_exchange();
-
-  void grow_arrays(int);
-  void copy_arrays(int, int, int);
-  void set_arrays(int);
-
-  int pack_border(int, int *, double *);
-  int unpack_border(int, int, double *);
-  int unpack_exchange(int, double *);
-  void unpack_restart(int, int);
 
   double memory_usage();
 
+#ifdef DEBUG_SSA_PAIR_CT
+  int counters[2][3];
+  int hist[32];
+#endif
+
  protected:
   int pack_reverse_comm(int, int, double *);
   void unpack_reverse_comm(int, int *, double *);
@@ -63,7 +56,6 @@ class FixShardlow : public Fix {
  private:
   double dtsqrt; // = sqrt(update->dt);
 
-  int coord2ssaAIR(double *);  // map atom coord to an AIR number
   void ssa_update_dpd(int, int *, int);  // Constant Temperature
   void ssa_update_dpde(int, int *, int); // Constant Energy
 
diff --git a/src/USER-DPD/nbin_ssa.cpp b/src/USER-DPD/nbin_ssa.cpp
index f65a397e88..4c57a8e70f 100644
--- a/src/USER-DPD/nbin_ssa.cpp
+++ b/src/USER-DPD/nbin_ssa.cpp
@@ -20,6 +20,7 @@
 #include "atom.h"
 #include "update.h"
 #include "group.h"
+#include "domain.h"
 #include "memory.h"
 #include "error.h"
 
@@ -29,24 +30,19 @@ using namespace LAMMPS_NS;
 
 NBinSSA::NBinSSA(LAMMPS *lmp) : NBinStandard(lmp)
 {
-  maxbin_ssa = 0;
-  bins_ssa = NULL;
-  maxhead_ssa = 0;
-  binhead_ssa = NULL;
-  gbinhead_ssa = NULL;
+  for (int i = 0; i < 8; i++) {
+    gairhead_ssa[i] = -1;
+  }
 }
 
 NBinSSA::~NBinSSA()
 {
-  memory->destroy(bins_ssa);
-  memory->destroy(binhead_ssa);
-  memory->destroy(gbinhead_ssa);
 }
 
 /* ----------------------------------------------------------------------
    bin owned and ghost atoms for the Shardlow Splitting Algorithm (SSA)
-   local atoms are in distinct bins (binhead_ssa) from the ghosts
-   ghost atoms are in distinct bins (gbinhead_ssa) from the locals
+   local atoms are in distinct bins (binhead[]) from the ghosts
+   ghost atoms are "binned" in gairhead_ssa[] instead
      ghosts which are not in an Active Interaction Region (AIR) are skipped
 ------------------------------------------------------------------------- */
 
@@ -58,13 +54,19 @@ void NBinSSA::bin_atoms()
   if (includegroup) nlocal = atom->nfirst;
   double **x = atom->x;
   int *mask = atom->mask;
-  int *ssaAIR = atom->ssaAIR;
+  int xbin,ybin,zbin;
 
   last_bin = update->ntimestep;
 
+  bboxlo_[0] = bboxlo[0]; bboxlo_[1] = bboxlo[1]; bboxlo_[2] = bboxlo[2];
+  bboxhi_[0] = bboxhi[0]; bboxhi_[1] = bboxhi[1]; bboxhi_[2] = bboxhi[2];
+
+  for (i = 0; i < 8; i++) {
+    gairhead_ssa[i] = -1;
+  }
+
   for (i = 0; i < mbins; i++) {
-    gbinhead_ssa[i] = -1;
-    binhead_ssa[i] = -1;
+    binhead[i] = -1;
   }
 
   // bin in reverse order so linked list will be in forward order
@@ -73,29 +75,34 @@ void NBinSSA::bin_atoms()
     int bitmask = group->bitmask[includegroup];
     int nowned = atom->nlocal; // NOTE: nlocal was set to atom->nfirst above
     for (i = nall-1; i >= nowned; i--) {
-      if (ssaAIR[i] < 2) continue; // skip ghost atoms not in AIR
+      ibin = coord2ssaAIR(x[i]);
+      if (ibin < 1) continue; // skip ghost atoms not in AIR
       if (mask[i] & bitmask) {
-        ibin = coord2bin(x[i]);
-        atom2bin[i] = ibin;
-        bins_ssa[i] = gbinhead_ssa[ibin];
-        gbinhead_ssa[ibin] = i;
+        bins[i] = gairhead_ssa[ibin];
+        gairhead_ssa[ibin] = i;
       }
     }
   } else {
     for (i = nall-1; i >= nlocal; i--) {
-      if (ssaAIR[i] < 2) continue; // skip ghost atoms not in AIR
-      ibin = coord2bin(x[i]);
-      atom2bin[i] = ibin;
-      bins_ssa[i] = gbinhead_ssa[ibin];
-      gbinhead_ssa[ibin] = i;
+      ibin = coord2ssaAIR(x[i]);
+      if (ibin < 1) continue; // skip ghost atoms not in AIR
+      bins[i] = gairhead_ssa[ibin];
+      gairhead_ssa[ibin] = i;
     }
   }
   for (i = nlocal-1; i >= 0; i--) {
-    ibin = coord2bin(x[i]);
-    atom2bin[i] = ibin;
-    bins_ssa[i] = binhead_ssa[ibin];
-    binhead_ssa[ibin] = i;
+    ibin = coord2bin(x[i][0], x[i][1], x[i][2], xbin, ybin, zbin);
+    // Find the bounding box of the local atoms in the bins
+    if (xbin < lbinxlo) lbinxlo = xbin;
+    if (xbin >= lbinxhi) lbinxhi = xbin + 1;
+    if (ybin < lbinylo) lbinylo = ybin;
+    if (ybin >= lbinyhi) lbinyhi = ybin + 1;
+    if (zbin < lbinzlo) lbinzlo = zbin;
+    if (zbin >= lbinzhi) lbinzhi = zbin + 1;
+    bins[i] = binhead[ibin];
+    binhead[ibin] = i;
   }
+
 }
 
 /* ---------------------------------------------------------------------- */
@@ -104,19 +111,13 @@ void NBinSSA::bin_atoms_setup(int nall)
 {
   NBinStandard::bin_atoms_setup(nall); // Setup the parent class's data too
 
-  if (mbins > maxhead_ssa) {
-    maxhead_ssa = mbins;
-    memory->destroy(gbinhead_ssa);
-    memory->destroy(binhead_ssa);
-    memory->create(binhead_ssa,maxhead_ssa,"binhead_ssa");
-    memory->create(gbinhead_ssa,maxhead_ssa,"gbinhead_ssa");
-  }
-
-  if (nall > maxbin_ssa) {
-    maxbin_ssa = nall;
-    memory->destroy(bins_ssa);
-    memory->create(bins_ssa,maxbin_ssa,"bins_ssa");
-  }
+  // Clear the local bin extent bounding box.
+  lbinxlo = mbinx - 1; // Safe to = stencil->sx + 1
+  lbinylo = mbiny - 1; // Safe to = stencil->sy + 1
+  lbinzlo = mbinz - 1; // Safe to = stencil->sz + 1
+  lbinxhi = 0; // Safe to = mbinx - stencil->sx - 1
+  lbinyhi = 0; // Safe to = mbiny - stencil->sy - 1
+  lbinzhi = 0; // Safe to = mbinz - stencil->sz - 1
 }
 
 /* ---------------------------------------------------------------------- */
@@ -125,10 +126,39 @@ bigint NBinSSA::memory_usage()
 {
   bigint bytes = NBinStandard::memory_usage(); // Count the parent's usage too
 
-  if (maxbin_ssa) bytes += memory->usage(bins_ssa,maxbin_ssa);
-  if (maxhead_ssa) {
-    bytes += memory->usage(binhead_ssa,maxhead_ssa);
-    bytes += memory->usage(gbinhead_ssa,maxhead_ssa);
-  }
   return bytes;
 }
+
+/* ----------------------------------------------------------------------
+   convert atom coords into the ssa active interaction region (AIR) number: 0 = inside the subdomain, 1-7 = AIR, negative = no AIR
+------------------------------------------------------------------------- */
+int NBinSSA::coord2ssaAIR(const double *x)
+{
+  int ix, iy, iz;
+  // per axis: -1 = below sublo, 0 = inside [sublo,subhi), +1 = at or above subhi
+  ix = iy = iz = 0;
+  if (x[2] < domain->sublo[2]) iz = -1;
+  if (x[2] >= domain->subhi[2]) iz = 1;
+  if (x[1] < domain->sublo[1]) iy = -1;
+  if (x[1] >= domain->subhi[1]) iy = 1;
+  if (x[0] < domain->sublo[0]) ix = -1;
+  if (x[0] >= domain->subhi[0]) ix = 1;
+  // 13 of the 26 surrounding regions map to a positive AIR number (1-7); the other 13 return -1
+  if(iz < 0){
+    return -1; // everything below the subdomain in z
+  } else if(iz == 0){
+    if( iy<0 ) return -1; // bottom left/middle/right
+    if( (iy==0) && (ix<0)  ) return -1; // left atoms
+    if( (iy==0) && (ix==0) ) return 0; // Locally owned atoms
+    if( (iy==0) && (ix>0)  ) return 2; // Right atoms
+    if( (iy>0)  && (ix==0) ) return 1; // Top-middle atoms
+    if( (iy>0)  && (ix!=0) ) return 3; // Top-right and top-left atoms
+  } else { // iz > 0
+    if((ix==0) && (iy==0)) return 4; // Back atoms
+    if((ix==0) && (iy!=0)) return 5; // Top-back and bottom-back atoms
+    if((ix!=0) && (iy==0)) return 6; // Left-back and right-back atoms
+    if((ix!=0) && (iy!=0)) return 7; // Back corner atoms
+  }
+  // not reached: every (ix,iy,iz) combination has already returned above
+  return -2;
+}
diff --git a/src/USER-DPD/nbin_ssa.h b/src/USER-DPD/nbin_ssa.h
index f0699b3a7a..2a0175081e 100644
--- a/src/USER-DPD/nbin_ssa.h
+++ b/src/USER-DPD/nbin_ssa.h
@@ -29,11 +29,15 @@ namespace LAMMPS_NS {
 class NBinSSA : public NBinStandard {
  public:
 
-  int *bins_ssa;             // index of next atom in each bin
-  int maxbin_ssa;            // size of bins_ssa array
-  int *binhead_ssa;          // index of 1st local atom in each bin
-  int *gbinhead_ssa;         // index of 1st ghost atom in each bin
-  int maxhead_ssa;           // size of binhead_ssa and gbinhead_ssa arrays
+  int gairhead_ssa[8];       // index of 1st ghost atom in each AIR (-1 = none)
+
+  // Bounds of the local atoms in the binhead array (lo is inclusive, hi is exclusive)
+  int lbinxlo;               // lowest local bin x-dim coordinate
+  int lbinylo;               // lowest local bin y-dim coordinate
+  int lbinzlo;               // lowest local bin z-dim coordinate
+  int lbinxhi;               // one past the highest local bin x-dim coordinate
+  int lbinyhi;               // one past the highest local bin y-dim coordinate
+  int lbinzhi;               // one past the highest local bin z-dim coordinate
 
   NBinSSA(class LAMMPS *);
   ~NBinSSA();
@@ -42,6 +46,115 @@ class NBinSSA : public NBinStandard {
   void bin_atoms();
 
   bigint memory_usage();
+
+  inline // flat index of the bin that holds position (x,y,z)
+  int coord2bin(const double & x,const double & y,const double & z) const
+  {
+    int ix,iy,iz;
+    // each axis has three cases: at/above bboxhi_, inside [bboxlo_,bboxhi_), or below bboxlo_
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx; // past the upper bound: index starts at nbinx
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1); // cap at nbinx-1
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1; // cast truncates toward zero, so shift down by one
+    // y axis: same three cases
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+    // z axis: same three cases
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+    // shift each index by its mbin*lo offset and flatten with x varying fastest
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+  inline // flat bin index of (x,y,z); also writes the three per-axis bin coordinates into i[0..2]
+  int coord2bin(const double & x,const double & y,const double & z, int* i) const
+  {
+    int ix,iy,iz;
+    // each axis has three cases: at/above bboxhi_, inside [bboxlo_,bboxhi_), or below bboxlo_
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx; // past the upper bound: index starts at nbinx
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1); // cap at nbinx-1
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1; // cast truncates toward zero, so shift down by one
+    // y axis: same three cases
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+    // z axis: same three cases
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+    // hand back the per-axis bin coordinates, already shifted by the mbin*lo offsets
+    i[0] = ix - mbinxlo;
+    i[1] = iy - mbinylo;
+    i[2] = iz - mbinzlo;
+    // flatten the same shifted coordinates with x varying fastest
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+  inline // flat bin index of (x,y,z); also writes the three per-axis bin coordinates into ixo, iyo, izo
+  int coord2bin(const double & x,const double & y,const double & z, int &ixo, int &iyo, int &izo) const
+  {
+    int ix,iy,iz;
+    // each axis has three cases: at/above bboxhi_, inside [bboxlo_,bboxhi_), or below bboxlo_
+    if (x >= bboxhi_[0])
+      ix = static_cast<int> ((x-bboxhi_[0])*bininvx) + nbinx; // past the upper bound: index starts at nbinx
+    else if (x >= bboxlo_[0]) {
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx);
+      ix = MIN(ix,nbinx-1); // cap at nbinx-1
+    } else
+      ix = static_cast<int> ((x-bboxlo_[0])*bininvx) - 1; // cast truncates toward zero, so shift down by one
+    // y axis: same three cases
+    if (y >= bboxhi_[1])
+      iy = static_cast<int> ((y-bboxhi_[1])*bininvy) + nbiny;
+    else if (y >= bboxlo_[1]) {
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy);
+      iy = MIN(iy,nbiny-1);
+    } else
+      iy = static_cast<int> ((y-bboxlo_[1])*bininvy) - 1;
+    // z axis: same three cases
+    if (z >= bboxhi_[2])
+      iz = static_cast<int> ((z-bboxhi_[2])*bininvz) + nbinz;
+    else if (z >= bboxlo_[2]) {
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz);
+      iz = MIN(iz,nbinz-1);
+    } else
+      iz = static_cast<int> ((z-bboxlo_[2])*bininvz) - 1;
+    // hand back the per-axis bin coordinates, already shifted by the mbin*lo offsets
+    ixo = ix - mbinxlo;
+    iyo = iy - mbinylo;
+    izo = iz - mbinzlo;
+    // flatten the same shifted coordinates with x varying fastest
+    return (iz-mbinzlo)*mbiny*mbinx + (iy-mbinylo)*mbinx + (ix-mbinxlo);
+  }
+
+ private:
+  int coord2ssaAIR(const double *);  // map atom coord to an AIR number
+  double bboxlo_[3],bboxhi_[3];
+
 };
 
 }
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
index e8e4b20a0a..a6479d4c4f 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp
@@ -32,15 +32,29 @@
 
 using namespace LAMMPS_NS;
 
-// allocate space for static class variable
-// prototype for non-class function
+/* ---------------------------------------------------------------------- */
 
-static int *ssaAIRptr;
-static int cmp_ssaAIR(const void *, const void *);
+NPairHalfBinNewtonSSA::NPairHalfBinNewtonSSA(LAMMPS *lmp) : NPair(lmp)
+{
+  ssa_maxPhaseCt = 0;  // allocated number of phases; build() grows it when ssa_phaseCt exceeds it
+  ssa_maxPhaseLen = 0; // allocated per-phase length of ssa_itemLoc/ssa_itemLen; grown in build()
+  ssa_phaseCt = 0;     // phases in use; build() sets this to sz1*sy1*sx1
+  ssa_phaseLen = NULL; // the three tables start empty and are created in build()
+  ssa_itemLoc = NULL;
+  ssa_itemLen = NULL;
+}
 
 /* ---------------------------------------------------------------------- */
 
-NPairHalfBinNewtonSSA::NPairHalfBinNewtonSSA(LAMMPS *lmp) : NPair(lmp) {}
+NPairHalfBinNewtonSSA::~NPairHalfBinNewtonSSA()
+{
+  ssa_maxPhaseCt = 0;  // reset the bookkeeping counters before the tables go away
+  ssa_maxPhaseLen = 0;
+  ssa_phaseCt = 0;
+  memory->destroy(ssa_phaseLen); // release the phase / work-item tables created in build()
+  memory->destroy(ssa_itemLoc);
+  memory->destroy(ssa_itemLen);
+}
 
 /* ----------------------------------------------------------------------
    binned neighbor list construction with full Newton's 3rd law
@@ -65,7 +79,6 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
   int **nspecial = atom->nspecial;
   int nlocal = atom->nlocal;
   if (includegroup) nlocal = atom->nfirst;
-  int *ssaAIR = atom->ssaAIR;
 
   int *molindex = atom->molindex;
   int *molatom = atom->molatom;
@@ -81,179 +94,218 @@ void NPairHalfBinNewtonSSA::build(NeighList *list)
 
   NStencilSSA *ns_ssa = dynamic_cast<NStencilSSA*>(ns);
   if (!ns_ssa) error->one(FLERR, "NStencil wasn't a NStencilSSA object");
-  int nstencil_half = ns_ssa->nstencil_half;
+  int *nstencil_ssa = &(ns_ssa->nstencil_ssa[0]);
   int nstencil_full = ns_ssa->nstencil;
 
   NBinSSA *nb_ssa = dynamic_cast<NBinSSA*>(nb);
   if (!nb_ssa) error->one(FLERR, "NBin wasn't a NBinSSA object");
-  int *bins_ssa = nb_ssa->bins_ssa;
-  int *binhead_ssa = nb_ssa->binhead_ssa;
-  int *gbinhead_ssa = nb_ssa->gbinhead_ssa;
+  int *bins = nb_ssa->bins;
+  int *binhead = nb_ssa->binhead;
+  int *gairhead_ssa = &(nb_ssa->gairhead_ssa[0]);
 
   int inum = 0;
+  int gnum = 0;  // ghost atoms stored so far; they go into ilist after the locals
+  int xbin,ybin,zbin,xbin2,ybin2,zbin2;
+  int **stencilxyz = ns_ssa->stencilxyz;  // (x,y,z) bin offsets of each stencil entry
+  int lbinxlo = nb_ssa->lbinxlo;  // extent of the local bins per dimension;
+  int lbinxhi = nb_ssa->lbinxhi;  // used in this function as lo <= bin < hi
+  int lbinylo = nb_ssa->lbinylo;
+  int lbinyhi = nb_ssa->lbinyhi;
+  int lbinzlo = nb_ssa->lbinzlo;
+  int lbinzhi = nb_ssa->lbinzhi;
+
+  int sx1 = ns_ssa->sx + 1;  // stride between the bins visited within one phase
+  int sy1 = ns_ssa->sy + 1;
+  int sz1 = ns_ssa->sz + 1;
+
+  ssa_phaseCt = sz1*sy1*sx1;  // one phase per (zoff,yoff,xoff) offset combination
+  // xbin,ybin,zbin are scratch here: estimated strided bin positions per dimension
+  xbin = (lbinxhi - lbinxlo + sx1 - 1) / sx1 + 1;
+  ybin = (lbinyhi - lbinylo + sy1 - 1) / sy1 + 1;
+  zbin = (lbinzhi - lbinzlo + sz1 - 1) / sz1 + 1;
+  // estimated max work items in a phase; the phase loop below errors out if exceeded
+  int phaseLenEstimate = xbin*ybin*zbin;
+  // more phases than allocated: drop all arrays and regrow the per-phase one
+  if (ssa_phaseCt > ssa_maxPhaseCt) {
+    ssa_maxPhaseCt = ssa_phaseCt;
+    ssa_maxPhaseLen = 0;  // makes the next test true so the 2-d arrays are recreated
+    memory->destroy(ssa_phaseLen);
+    memory->destroy(ssa_itemLoc);
+    memory->destroy(ssa_itemLen);
+    memory->create(ssa_phaseLen,ssa_maxPhaseCt,"NPairHalfBinNewtonSSA:ssa_phaseLen");
+  }
+  // more work items per phase than allocated: recreate both 2-d arrays
+  if (phaseLenEstimate > ssa_maxPhaseLen) {
+    ssa_maxPhaseLen = phaseLenEstimate;
+    memory->destroy(ssa_itemLoc);
+    memory->destroy(ssa_itemLen);
+    memory->create(ssa_itemLoc,ssa_maxPhaseCt,ssa_maxPhaseLen,"NPairHalfBinNewtonSSA:ssa_itemLoc");
+    memory->create(ssa_itemLen,ssa_maxPhaseCt,ssa_maxPhaseLen,"NPairHalfBinNewtonSSA:ssa_itemLen");
+  }
 
   ipage->reset();
 
-  // loop over owned atoms, storing half of the neighbors
+  int workPhase = 0;  // phase being filled; checked against ssa_phaseCt after the loops
+  // loop over bins with local atoms, storing half of the neighbors
+  for (int zoff = ns_ssa->sz; zoff >= 0; --zoff) {  // each (zoff,yoff,xoff) is one phase
+  for (int yoff = ns_ssa->sy; yoff >= 0; --yoff) {
+  for (int xoff = ns_ssa->sx; xoff >= 0; --xoff) {
+    int workItem = 0;  // work items recorded so far in this phase
+  for (zbin = lbinzlo + zoff; zbin < lbinzhi; zbin += sz1) {  // bins of one phase are sz1/sy1/sx1 apart
+  for (ybin = lbinylo + yoff - ns_ssa->sy; ybin < lbinyhi; ybin += sy1) {  // may start below lbinylo; see subphase shift
+  for (xbin = lbinxlo + xoff - ns_ssa->sx; xbin < lbinxhi; xbin += sx1) {  // may start below lbinxlo; see subphase shift
+    if (workItem >= phaseLenEstimate) error->one(FLERR,"phaseLenEstimate was too small");
+    ssa_itemLoc[workPhase][workItem] = inum; // record where workItem starts in ilist
 
-  for (i = 0; i < nlocal; i++) {
-    int AIRct[8] = { 0 };
-    n = 0;
-    neighptr = ipage->vget();
+    for (int subphase = 0; subphase < 4; subphase++) {
+      int s_ybin = ybin + ((subphase & 0x2) ? ns_ssa->sy : 0);
+      int s_xbin = xbin + ((subphase & 0x1) ? ns_ssa->sx : 0);
+      int ibin, ct;
 
-    itype = type[i];
-    xtmp = x[i][0];
-    ytmp = x[i][1];
-    ztmp = x[i][2];
-    if (moltemplate) {
-      imol = molindex[i];
-      iatom = molatom[i];
-      tagprev = tag[i] - iatom - 1;
-    }
+      if ((s_ybin < lbinylo) || (s_ybin >= lbinyhi)) continue;
+      if ((s_xbin < lbinxlo) || (s_xbin >= lbinxhi)) continue;
+      ibin = zbin*nb_ssa->mbiny*nb_ssa->mbinx
+           + s_ybin*nb_ssa->mbinx
+           + s_xbin;
 
-    // loop over rest of local atoms in i's bin
-    // just store them, since j is beyond i in linked list
-
-    for (j = bins_ssa[i]; j >= 0; j = bins_ssa[j]) {
-
-      jtype = type[j];
-      if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-
-      delx = xtmp - x[j][0];
-      dely = ytmp - x[j][1];
-      delz = ztmp - x[j][2];
-      rsq = delx*delx + dely*dely + delz*delz;
-
-      if (rsq <= cutneighsq[itype][jtype]) {
-        if (molecular) {
-          if (!moltemplate)
-            which = find_special(special[i],nspecial[i],tag[j]);
-          else if (imol >= 0)
-            which = find_special(onemols[imol]->special[iatom],
-                                 onemols[imol]->nspecial[iatom],
-                                 tag[j]-tagprev);
-          else which = 0;
-          if (which == 0) neighptr[n++] = j;
-          else if (domain->minimum_image_check(delx,dely,delz))
-            neighptr[n++] = j;
-          else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-        } else neighptr[n++] = j;
-      }
-    }
-
-    ibin = atom2bin[i];
-
-    // loop over all local atoms in other bins in "half" stencil
-
-    for (k = 0; k < nstencil_half; k++) {
-      for (j = binhead_ssa[ibin+stencil[k]]; j >= 0;
-           j = bins_ssa[j]) {
-
-        jtype = type[j];
-        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
-
-        delx = xtmp - x[j][0];
-        dely = ytmp - x[j][1];
-        delz = ztmp - x[j][2];
-        rsq = delx*delx + dely*dely + delz*delz;
-
-        if (rsq <= cutneighsq[itype][jtype]) {
-          if (molecular) {
-            if (!moltemplate)
-              which = find_special(special[i],nspecial[i],tag[j]);
-            else if (imol >= 0)
-              which = find_special(onemols[imol]->special[iatom],
-                                   onemols[imol]->nspecial[iatom],
-                                   tag[j]-tagprev);
-            else which = 0;
-            if (which == 0) neighptr[n++] = j;
-            else if (domain->minimum_image_check(delx,dely,delz))
-              neighptr[n++] = j;
-            else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
-          } else neighptr[n++] = j;
+      for (i = binhead[ibin]; i >= 0; i = bins[i]) {  // walk the linked list of atoms in bin ibin
+        n = 0;
+        neighptr = ipage->vget();
+        itype = type[i];
+        xtmp = x[i][0];
+        ytmp = x[i][1];
+        ztmp = x[i][2];
+        if (moltemplate) {
+          imol = molindex[i];
+          iatom = molatom[i];
+          tagprev = tag[i] - iatom - 1;
         }
+
+        // loop over all local atoms in the current stencil "subphase"
+        for (k = nstencil_ssa[subphase]; k < nstencil_ssa[subphase+1]; k++) {
+          const int jbin = ibin+stencil[k];
+          if (jbin != ibin) j = binhead[jbin];
+          else j = bins[i]; // same bin as i, so start just past i in the bin
+          for (; j >= 0; j = bins[j]) {
+            jtype = type[j];
+            if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+            delx = xtmp - x[j][0];
+            dely = ytmp - x[j][1];
+            delz = ztmp - x[j][2];
+            rsq = delx*delx + dely*dely + delz*delz;
+            if (rsq <= cutneighsq[itype][jtype]) {
+              if (molecular) {
+                if (!moltemplate)
+                  which = find_special(special[i],nspecial[i],tag[j]);
+                else if (imol >= 0)
+                  which = find_special(onemols[imol]->special[iatom],
+                                       onemols[imol]->nspecial[iatom],
+                                       tag[j]-tagprev);
+                else which = 0;
+                if (which == 0) neighptr[n++] = j;
+                else if (domain->minimum_image_check(delx,dely,delz))
+                  neighptr[n++] = j;
+                else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);  // keep j, tagged with the special flag bits
+              } else neighptr[n++] = j;
+            }
+          }
+        }
+        // an atom that found no neighbors in this subphase is not added to ilist
+        if (n > 0) {
+          firstneigh[inum] = neighptr;  // indexed by position in ilist, not by atom index
+          numneigh[inum] = n;           // indexed by position in ilist, not by atom index
+          ilist[inum++] = i;
+        }
+        ipage->vgot(n);
+        if (ipage->status())
+          error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
       }
     }
-    AIRct[0] = n;
+    // record where workItem ends in ilist
+    ssa_itemLen[workPhase][workItem] = inum - ssa_itemLoc[workPhase][workItem];
+    if (ssa_itemLen[workPhase][workItem] > 0) workItem++;
+  }
+  }
+  }
 
-    // loop over AIR ghost atoms in all bins in "full" stencil
-    // Note: the non-AIR ghost atoms have already been filtered out
-    // That is a significant time savings because of the "full" stencil
-    // Note2: only non-pure locals can have ghosts as neighbors
+    // record where workPhase ends
+    ssa_phaseLen[workPhase++] = workItem;
+  }
+  }
+  }
 
-    if (ssaAIR[i] == 1) for (k = 0; k < nstencil_full; k++) {
-      for (j = gbinhead_ssa[ibin+stencil[k]]; j >= 0;
-           j = bins_ssa[j]) {
+  if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong");
 
-        jtype = type[j];
-        if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+  list->AIRct_ssa[0] = list->inum = inum;
 
-        delx = xtmp - x[j][0];
-        dely = ytmp - x[j][1];
-        delz = ztmp - x[j][2];
-        rsq = delx*delx + dely*dely + delz*delz;
+  // loop over AIR ghost atoms, storing their local neighbors
+  // since these are ghosts, must check if stencil bin is out of bounds
+  for (int airnum = 1; airnum <= 7; airnum++) {
+    int locAIRct = 0;
+    for (i = gairhead_ssa[airnum]; i >= 0; i = bins[i]) {
+      n = 0;
+      neighptr = ipage->vget();
 
-        if (rsq <= cutneighsq[itype][jtype]) {
-          if (molecular) {
-            if (!moltemplate)
-              which = find_special(special[i],nspecial[i],tag[j]);
-            else if (imol >= 0)
-              which = find_special(onemols[imol]->special[iatom],
-                                   onemols[imol]->nspecial[iatom],
-                                   tag[j]-tagprev);
-            else which = 0;
-            if (which == 0) {
-              neighptr[n++] = j;
-              ++(AIRct[ssaAIR[j] - 1]);
-            } else if (domain->minimum_image_check(delx,dely,delz)) {
-              neighptr[n++] = j;
-              ++(AIRct[ssaAIR[j] - 1]);
-            } else if (which > 0) {
-              neighptr[n++] = j ^ (which << SBBITS);
-              ++(AIRct[ssaAIR[j] - 1]);
-            }
-          } else {
-            neighptr[n++] = j;
-            ++(AIRct[ssaAIR[j] - 1]);
+      itype = type[i];
+      xtmp = x[i][0];
+      ytmp = x[i][1];
+      ztmp = x[i][2];
+
+      ibin = coord2bin(x[i],xbin,ybin,zbin);  // also yields the x,y,z bin indices of ghost i
+
+      // loop over the bins of the "full" stencil around ghost i, collecting its local neighbors
+      // Note: the non-AIR ghost atoms have already been filtered out
+      for (k = 0; k < nstencil_full; k++) {
+        xbin2 = xbin + stencilxyz[k][0];
+        ybin2 = ybin + stencilxyz[k][1];
+        zbin2 = zbin + stencilxyz[k][2];
+        // Skip it if this bin is outside the extent of local bins
+        if (xbin2 < lbinxlo || xbin2 >= lbinxhi ||
+            ybin2 < lbinylo || ybin2 >= lbinyhi ||
+            zbin2 < lbinzlo || zbin2 >= lbinzhi) continue;
+        for (j = binhead[ibin+stencil[k]]; j >= 0; j = bins[j]) {
+
+          jtype = type[j];
+          if (exclude && exclusion(i,j,itype,jtype,mask,molecule)) continue;
+
+          delx = xtmp - x[j][0];
+          dely = ytmp - x[j][1];
+          delz = ztmp - x[j][2];
+          rsq = delx*delx + dely*dely + delz*delz;
+
+          if (rsq <= cutneighsq[itype][jtype]) {
+            if (molecular) {
+              if (!moltemplate)  // the special lists of j are searched for the tag of ghost i
+                which = find_special(special[j],nspecial[j],tag[i]);
+              else {
+                int jmol = molindex[j];
+                if (jmol >= 0) {
+                  int jatom = molatom[j];
+                  which = find_special(onemols[jmol]->special[jatom],
+                                     onemols[jmol]->nspecial[jatom],
+                                     tag[i] - (tag[j] - jatom - 1));
+                } else which = 0;
+              }
+              if (which == 0) neighptr[n++] = j;
+              else if (domain->minimum_image_check(delx,dely,delz))
+                neighptr[n++] = j;
+              else if (which > 0) neighptr[n++] = j ^ (which << SBBITS);
+            } else neighptr[n++] = j;
           }
         }
       }
+      // ghosts with at least one neighbor are appended to ilist after the inum local entries
+      if (n > 0) {
+        firstneigh[inum + gnum] = neighptr;  // indexed by position in ilist
+        numneigh[inum + gnum] = n;
+        ilist[inum + (gnum++)] = i;
+        ++locAIRct;  // ghosts stored for the current airnum
+      }
+      ipage->vgot(n);
+      if (ipage->status())
+        error->one(FLERR,"Neighbor (ghost) list overflow, boost neigh_modify one");
     }
-
-    ilist[inum++] = i;
-    firstneigh[i] = neighptr;
-    numneigh[i] = n;
-    ipage->vgot(n);
-    if (ipage->status())
-      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-
-    // sort the ghosts in the neighbor list by their ssaAIR number
-
-    ssaAIRptr = atom->ssaAIR;
-    qsort(&(neighptr[AIRct[0]]), n - AIRct[0], sizeof(int), cmp_ssaAIR);
-
-    // do a prefix sum on the counts to turn them into indexes
-
-    list->ndxAIR_ssa[i][0] = AIRct[0];
-    for (int ndx = 1; ndx < 8; ++ndx) {
-      list->ndxAIR_ssa[i][ndx] = AIRct[ndx] + list->ndxAIR_ssa[i][ndx - 1];
-    }
+    list->AIRct_ssa[airnum] = locAIRct;
   }
-
-  list->inum = inum;
+  list->gnum = gnum;
 }
-
-/* ----------------------------------------------------------------------
-   comparison function invoked by qsort()
-   accesses static class member ssaAIRptr, set before call to qsort()
-------------------------------------------------------------------------- */
-
-static int cmp_ssaAIR(const void *iptr, const void *jptr)
-{
-  int i = NEIGHMASK & *((int *) iptr);
-  int j = NEIGHMASK & *((int *) jptr);
-  if (ssaAIRptr[i] < ssaAIRptr[j]) return -1;
-  if (ssaAIRptr[i] > ssaAIRptr[j]) return 1;
-  return 0;
-}
-
diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.h b/src/USER-DPD/npair_half_bin_newton_ssa.h
index 13347b33b0..ea292316ca 100644
--- a/src/USER-DPD/npair_half_bin_newton_ssa.h
+++ b/src/USER-DPD/npair_half_bin_newton_ssa.h
@@ -15,7 +15,7 @@
 
 NPairStyle(half/bin/newton/ssa,
            NPairHalfBinNewtonSSA,
-           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA)
+           NP_HALF | NP_BIN | NP_NEWTON | NP_ORTHO | NP_SSA | NP_GHOST)
 
 #else
 
@@ -28,9 +28,18 @@ namespace LAMMPS_NS {
 
 class NPairHalfBinNewtonSSA : public NPair {
  public:
+  // SSA Work plan data structures
+  int ssa_phaseCt;      // number of phases in the current plan
+  int *ssa_phaseLen;    // [phase] number of work items in that phase
+  int **ssa_itemLoc;    // [phase][item] index in ilist where the work item starts
+  int **ssa_itemLen;    // [phase][item] number of ilist entries in the work item
+  // NOTE(review): public, presumably so the code consuming the plan can read it - confirm
   NPairHalfBinNewtonSSA(class LAMMPS *);
-  ~NPairHalfBinNewtonSSA() {}
+  ~NPairHalfBinNewtonSSA();
   void build(class NeighList *);
+ private:
+  int ssa_maxPhaseCt;   // allocated first dimension of the work plan arrays
+  int ssa_maxPhaseLen;  // allocated second dimension of ssa_itemLoc and ssa_itemLen
 };
 
 }
diff --git a/src/USER-DPD/npair_halffull_newton_ssa.cpp b/src/USER-DPD/npair_halffull_newton_ssa.cpp
deleted file mode 100644
index 2c9de3e50f..0000000000
--- a/src/USER-DPD/npair_halffull_newton_ssa.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/* ----------------------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-/* ----------------------------------------------------------------------
-   Contributing authors:
-   James Larentzos and Timothy I. Mattox (Engility Corporation)
-------------------------------------------------------------------------- */
-
-#include "npair_halffull_newton_ssa.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "atom.h"
-#include "atom_vec.h"
-#include "molecule.h"
-#include "domain.h"
-#include "my_page.h"
-#include "error.h"
-
-using namespace LAMMPS_NS;
-
-// allocate space for static class variable
-// prototype for non-class function
-
-static int *ssaAIRptr;
-static int cmp_ssaAIR(const void *, const void *);
-
-/* ---------------------------------------------------------------------- */
-
-NPairHalffullNewtonSSA::NPairHalffullNewtonSSA(LAMMPS *lmp) : NPair(lmp) {}
-
-/* ----------------------------------------------------------------------
-   build half list from full list for use by Shardlow Spliting Algorithm
-   pair stored once if i,j are both owned and i < j
-   if j is ghost, only store if j coords are "above and to the right" of i
-   works if full list is a skip list
-------------------------------------------------------------------------- */
-
-void NPairHalffullNewtonSSA::build(NeighList *list)
-{
-  int i,j,ii,jj,n,jnum,joriginal;
-  int *neighptr,*jlist;
-
-  int nlocal = atom->nlocal;
-  int *ssaAIR = atom->ssaAIR;
-
-  int *ilist = list->ilist;
-  int *numneigh = list->numneigh;
-  int **firstneigh = list->firstneigh;
-  MyPage<int> *ipage = list->ipage;
-
-  int *ilist_full = list->listfull->ilist;
-  int *numneigh_full = list->listfull->numneigh;
-  int **firstneigh_full = list->listfull->firstneigh;
-  int inum_full = list->listfull->inum;
-
-  int inum = 0;
-  ipage->reset();
-
-  // loop over parent full list
-
-  for (ii = 0; ii < inum_full; ii++) {
-    int AIRct[8] = { 0 };
-    n = 0;
-    neighptr = ipage->vget();
-
-    i = ilist_full[ii];
-
-    // loop over full neighbor list
-
-    jlist = firstneigh_full[i];
-    jnum = numneigh_full[i];
-
-    for (jj = 0; jj < jnum; jj++) {
-      joriginal = jlist[jj];
-      j = joriginal & NEIGHMASK;
-      if (j < nlocal) {
-        if (i > j) continue;
-        ++(AIRct[0]);
-      } else {
-        if (ssaAIR[j] < 2) continue; // skip ghost atoms not in AIR
-        ++(AIRct[ssaAIR[j] - 1]);
-      }
-      neighptr[n++] = joriginal;
-    }
-
-    ilist[inum++] = i;
-    firstneigh[i] = neighptr;
-    numneigh[i] = n;
-    ipage->vgot(n);
-    if (ipage->status())
-      error->one(FLERR,"Neighbor list overflow, boost neigh_modify one");
-
-    // sort the locals+ghosts in the neighbor list by their ssaAIR number
-
-    ssaAIRptr = atom->ssaAIR;
-    qsort(&(neighptr[0]), n, sizeof(int), cmp_ssaAIR);
-
-    // do a prefix sum on the counts to turn them into indexes
-
-    list->ndxAIR_ssa[i][0] = AIRct[0];
-    for (int ndx = 1; ndx < 8; ++ndx) {
-      list->ndxAIR_ssa[i][ndx] = AIRct[ndx] + list->ndxAIR_ssa[i][ndx - 1];
-    }
-  }
-
-  list->inum = inum;
-}
-
-/* ----------------------------------------------------------------------
-   comparison function invoked by qsort()
-   accesses static class member ssaAIRptr, set before call to qsort()
-------------------------------------------------------------------------- */
-
-static int cmp_ssaAIR(const void *iptr, const void *jptr)
-{
-  int i = NEIGHMASK & *((int *) iptr);
-  int j = NEIGHMASK & *((int *) jptr);
-  if (ssaAIRptr[i] < ssaAIRptr[j]) return -1;
-  if (ssaAIRptr[i] > ssaAIRptr[j]) return 1;
-  return 0;
-}
-
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
index df379a109a..451381c104 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.cpp
@@ -42,23 +42,72 @@ NStencilHalfBin2dNewtonSSA::NStencilHalfBin2dNewtonSSA(LAMMPS *lmp) :
 void NStencilHalfBin2dNewtonSSA::create()
 {
   int i,j,pos = 0;
+  nstencil_ssa[0] = 0; // redundant info, but saves a conditional
 
+  // Include the centroid at the start.
+  // It will be handled as part of Subphase 0.
+  stencilxyz[pos][0] = 0;
+  stencilxyz[pos][1] = 0;
+  stencilxyz[pos][2] = 0;
+  stencil[pos++] = 0;
+
+  // Subphase 0: upper right front bins (red)
   for (j = 0; j <= sy; j++)
-    for (i = -sx; i <= sx; i++)
-      if (j > 0 || (j == 0 && i > 0))
-        if (bin_distance(i,j,0) < cutneighmaxsq)
+    for (i = 0; i <= sx; i++)
+      if (j > 0 || i > 0) // skip the centroid
+        if (bin_distance(i,j,0) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = 0;
           stencil[pos++] = j*mbinx + i;
+        }
 
-  nstencil_half = pos; // record where normal half stencil ends
-
-  // include additional bins for AIR ghosts only
-
-  for (j = -sy; j <= 0; j++)
-    for (i = -sx; i <= sx; i++) {
-      if (j == 0 && i > 0) continue;
-      if (bin_distance(i,j,0) < cutneighmaxsq)
+  nstencil_ssa[1] = pos;
+  // Subphase 1: upper left front bins (light blue)
+  for (j = 1; j <= sy; j++)
+    for (i = -sx; i < 0; i++)
+      if (bin_distance(i,j,0) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = 0;
         stencil[pos++] = j*mbinx + i;
-    }
+      }
+
+  nstencil_ssa[2] = pos;
+  // Subphase 2: lower right front bins (yellow)
+  // (no bins are added for this subphase in 2d)
+  nstencil_ssa[3] = pos;
+  // Subphase 3: lower left front bins (blue)
+  // (no bins are added for this subphase in 2d)
+  nstencil_ssa[4] = pos; // record end of half stencil
+  // Now include additional bins for AIR ghosts, and impure-to-pure locals
+  // Subphase 4: upper right back bins (pink)
+  // (no bins are added for subphases 4 and 5 in 2d)
+  // nstencil_ssa[5] = pos;  (not stored: nstencil_ssa[] has only 5 entries)
+  // Subphase 5: upper left back bins (light green)
+
+  // nstencil_ssa[6] = pos;  (not stored, as for index 5)
+  // Subphase 6: lower right back bins (white)
+  for (j = -sy; j < 0; j++)
+    for (i = 0; i <= sx; i++)
+      if (bin_distance(i,j,0) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = 0;
+        stencil[pos++] = j*mbinx + i;
+      }
+
+  // nstencil_ssa[7] = pos;  (not stored, as for index 5)
+  // Subphase 7: lower left back bins (purple)
+  for (j = -sy; j <= 0; j++)
+    for (i = -sx; i < 0; i++)
+      if (bin_distance(i,j,0) < cutneighmaxsq) {
+        stencilxyz[pos][0] = i;
+        stencilxyz[pos][1] = j;
+        stencilxyz[pos][2] = 0;
+        stencil[pos++] = j*mbinx + i;
+      }
+  // nstencil_ssa[8] = pos;  (not stored; the end of the full stencil is kept in nstencil)
 
   nstencil = pos; // record where full stencil ends
 }
diff --git a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h
index 30901bb3e2..1d5cc3f6b2 100644
--- a/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h
+++ b/src/USER-DPD/nstencil_half_bin_2d_newton_ssa.h
@@ -15,7 +15,7 @@
 
 NStencilStyle(half/bin/2d/newton/ssa,
               NStencilHalfBin2dNewtonSSA,
-              NS_HALF | NS_BIN | NS_2D | NS_NEWTON | NS_SSA | NS_ORTHO)
+              NS_HALF | NS_BIN | NS_2D | NS_NEWTON | NS_SSA | NS_ORTHO | NS_GHOST)
 
 #else
 
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
index 76c9931ab2..cdd3b8856f 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.cpp
@@ -42,33 +42,112 @@ NStencilHalfBin3dNewtonSSA::NStencilHalfBin3dNewtonSSA(LAMMPS *lmp) :
 void NStencilHalfBin3dNewtonSSA::create()
 {
   int i,j,k,pos = 0;
+  nstencil_ssa[0] = 0; // redundant info, but saves a conditional
 
+  // Include the centroid at the start.
+  // It will be handled as part of Subphase 0.
+  stencilxyz[pos][0] = 0;
+  stencilxyz[pos][1] = 0;
+  stencilxyz[pos][2] = 0;
+  stencil[pos++] = 0;
+
+  // Subphase 0: upper right front bins (red)
   for (k = 0; k <= sz; k++)
-    for (j = -sy; j <= sy; j++)
-      for (i = -sx; i <= sx; i++)
-        if (k > 0 || j > 0 || (j == 0 && i > 0))
-          if (bin_distance(i,j,k) < cutneighmaxsq)
+    for (j = 0; j <= sy; j++)
+      for (i = 0; i <= sx; i++)
+        if (k > 0 || j > 0 || i > 0) // skip the centroid
+          if (bin_distance(i,j,k) < cutneighmaxsq) {
+            stencilxyz[pos][0] = i;
+            stencilxyz[pos][1] = j;
+            stencilxyz[pos][2] = k;
             stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+          }
 
-  nstencil_half = pos; // record where normal half stencil ends
-
-  // include additional bins for AIR ghosts only
-
-  for (k = -sz; k < 0; k++)
-    for (j = -sy; j <= sy; j++)
-      for (i = -sx; i <= sx; i++)
-        if (bin_distance(i,j,k) < cutneighmaxsq)
+  nstencil_ssa[1] = pos;
+  // Subphase 1: upper left front bins (light blue)
+  for (k = 0; k <= sz; k++)
+    for (j = 1; j <= sy; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
           stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
 
-  // For k==0, make sure to skip already included bins
+  nstencil_ssa[2] = pos;
+  // Subphase 2: lower right front bins (yellow)
+  for (k = 1; k <= sz; k++)
+    for (j = -sy; j < 0; j++)
+      for (i = 0; i <= sx; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
 
-  k = 0;
-  for (j = -sy; j <= 0; j++)
-    for (i = -sx; i <= sx; i++) {
-      if (j == 0 && i > 0) continue;
-      if (bin_distance(i,j,k) < cutneighmaxsq)
-        stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
-    }
+  nstencil_ssa[3] = pos;
+  // Subphase 3: lower left front bins (blue)
+  for (k = 1; k <= sz; k++)
+    for (j = -sy; j <= 0; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+
+  nstencil_ssa[4] = pos; // record end of half stencil
+  // Now include additional bins for AIR ghosts, and impure-to-pure locals
+  // Subphase 4: upper right back bins (pink)
+  for (k = -sz; k < 0; k++)
+    for (j = 0; j <= sy; j++)
+      for (i = 0; i <= sx; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+
+  // nstencil_ssa[5] = pos;  (not stored: nstencil_ssa[] has only 5 entries)
+  // Subphase 5: upper left back bins (light green)
+  for (k = -sz; k < 0; k++)
+    for (j = 1; j <= sy; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+
+  // nstencil_ssa[6] = pos;  (not stored, as for index 5)
+  // Subphase 6: lower right back bins (white)
+  for (k = -sz; k <= 0; k++)
+    for (j = -sy; j < 0; j++)
+      for (i = 0; i <= sx; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+
+  // nstencil_ssa[7] = pos;  (not stored, as for index 5)
+  // Subphase 7: lower left back bins (purple)
+  for (k = -sz; k <= 0; k++)
+    for (j = -sy; j <= 0; j++)
+      for (i = -sx; i < 0; i++)
+        if (bin_distance(i,j,k) < cutneighmaxsq) {
+          stencilxyz[pos][0] = i;
+          stencilxyz[pos][1] = j;
+          stencilxyz[pos][2] = k;
+          stencil[pos++] = k*mbiny*mbinx + j*mbinx + i;
+        }
+  // nstencil_ssa[8] = pos;  (not stored; the end of the full stencil is kept in nstencil)
 
   nstencil = pos; // record where full stencil ends
 }
diff --git a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h
index 7765b256d3..450a696e46 100644
--- a/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h
+++ b/src/USER-DPD/nstencil_half_bin_3d_newton_ssa.h
@@ -15,7 +15,7 @@
 
 NStencilStyle(half/bin/3d/newton/ssa,
               NStencilHalfBin3dNewtonSSA,
-              NS_HALF | NS_BIN | NS_3D | NS_NEWTON | NS_SSA | NS_ORTHO)
+              NS_HALF | NS_BIN | NS_3D | NS_NEWTON | NS_SSA | NS_ORTHO | NS_GHOST)
 
 #else
 
diff --git a/src/USER-DPD/nstencil_ssa.h b/src/USER-DPD/nstencil_ssa.h
index 9fcd19ee26..f6f91fefde 100644
--- a/src/USER-DPD/nstencil_ssa.h
+++ b/src/USER-DPD/nstencil_ssa.h
@@ -20,11 +20,12 @@ namespace LAMMPS_NS {
 
 class NStencilSSA : public NStencil {
  public:
-  NStencilSSA(class LAMMPS *lmp) : NStencil(lmp) { }
+  NStencilSSA(class LAMMPS *lmp) : NStencil(lmp) { xyzflag = 1; }  // NOTE(review): presumably requests storage for stencilxyz[], which create() fills - confirm
   ~NStencilSSA() {}
   virtual void create() = 0;
 
-  int nstencil_half;   // where the half stencil ends
+  // first stencil index for each subphase 0-3, with the end of the half stencil at index 4
+  int nstencil_ssa[5];
 };
 
 }
diff --git a/src/USER-DPD/pair_dpd_fdt.cpp b/src/USER-DPD/pair_dpd_fdt.cpp
index 26f5806cf1..95908c556d 100644
--- a/src/USER-DPD/pair_dpd_fdt.cpp
+++ b/src/USER-DPD/pair_dpd_fdt.cpp
@@ -316,18 +316,17 @@ void PairDPDfdt::init_style()
   if (comm->ghost_velocity == 0)
     error->all(FLERR,"Pair dpd/fdt requires ghost atoms store velocity");
 
-  // if newton off, forces between atoms ij will be double computed
-  // using different random numbers
-
-  if (force->newton_pair == 0 && comm->me == 0) error->warning(FLERR,
-      "Pair dpd/fdt requires newton pair on");
-
   splitFDT_flag = false;
   int irequest = neighbor->request(this,instance_me);
   for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"shardlow") == 0){
+    if (strncmp(modify->fix[i]->style,"shardlow", 8) == 0){  // compares only the first 8 chars, so any style starting with shardlow matches
       splitFDT_flag = true;
     }
+  // this check needs splitFDT_flag, so it runs after the fix scan above
+  // if newton off, forces between atoms ij will be double computed
+  // using different random numbers if splitFDT_flag is false
+  if (!splitFDT_flag && (force->newton_pair == 0) && (comm->me == 0)) error->warning(FLERR,
+      "Pair dpd/fdt requires newton pair on if not also using fix shardlow");
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-DPD/pair_dpd_fdt_energy.cpp b/src/USER-DPD/pair_dpd_fdt_energy.cpp
index c3fc7fb3f5..32ac456b0f 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.cpp
+++ b/src/USER-DPD/pair_dpd_fdt_energy.cpp
@@ -55,6 +55,8 @@ PairDPDfdtEnergy::PairDPDfdtEnergy(LAMMPS *lmp) : Pair(lmp)
 
 PairDPDfdtEnergy::~PairDPDfdtEnergy()
 {
+  if (copymode) return;
+
   if (allocated) {
     memory->destroy(setflag);
     memory->destroy(cutsq);
@@ -403,19 +405,18 @@ void PairDPDfdtEnergy::init_style()
   if (comm->ghost_velocity == 0)
     error->all(FLERR,"Pair dpd/fdt/energy requires ghost atoms store velocity");
 
-  // if newton off, forces between atoms ij will be double computed
-  // using different random numbers
-
-  if (force->newton_pair == 0 && comm->me == 0) error->warning(FLERR,
-      "Pair dpd/fdt/energy requires newton pair on");
-
   splitFDT_flag = false;
   int irequest = neighbor->request(this,instance_me);
   for (int i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"shardlow") == 0){
+    if (strncmp(modify->fix[i]->style,"shardlow", 8) == 0){  // compares only the first 8 chars, so any style starting with shardlow matches
       splitFDT_flag = true;
     }
 
+  // if newton off, forces between atoms ij will be double computed
+  // using different random numbers if splitFDT_flag is false (hence checked after the fix scan)
+  if (!splitFDT_flag && (force->newton_pair == 0) && (comm->me == 0)) error->warning(FLERR,
+      "Pair dpd/fdt/energy requires newton pair on if not also using fix shardlow");
+
   bool eos_flag = false;
   for (int i = 0; i < modify->nfix; i++)
     if (strncmp(modify->fix[i]->style,"eos",3) == 0) eos_flag = true;
diff --git a/src/USER-DPD/pair_dpd_fdt_energy.h b/src/USER-DPD/pair_dpd_fdt_energy.h
index 84ab28aca4..dce39f83f0 100644
--- a/src/USER-DPD/pair_dpd_fdt_energy.h
+++ b/src/USER-DPD/pair_dpd_fdt_energy.h
@@ -31,8 +31,8 @@ class PairDPDfdtEnergy : public Pair {
   virtual void compute(int, int);
   virtual void settings(int, char **);
   virtual void coeff(int, char **);
-  void init_style();
-  double init_one(int, int);
+  virtual void init_style();
+  virtual double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   virtual void write_restart_settings(FILE *);
@@ -46,15 +46,15 @@ class PairDPDfdtEnergy : public Pair {
   double **sigma,**kappa;
   double *duCond,*duMech;
 
+  int seed;
   class RanMars *random;
 
  protected:
   double cut_global;
-  int seed;
   bool splitFDT_flag;
   bool a0_is_zero;
 
-  void allocate();
+  virtual void allocate();
 
 };
 
diff --git a/src/USER-DPD/pair_exp6_rx.cpp b/src/USER-DPD/pair_exp6_rx.cpp
index 61b62efc53..8b6fc9beed 100644
--- a/src/USER-DPD/pair_exp6_rx.cpp
+++ b/src/USER-DPD/pair_exp6_rx.cpp
@@ -84,11 +84,15 @@ PairExp6rx::PairExp6rx(LAMMPS *lmp) : Pair(lmp)
 
 PairExp6rx::~PairExp6rx()
 {
-  for (int i=0; i < nparams; ++i) {
-    delete[] params[i].name;
-    delete[] params[i].potential;
+  if (copymode) return;
+
+  if (params != NULL) {
+    for (int i=0; i < nparams; ++i) {
+      delete[] params[i].name;
+      delete[] params[i].potential;
+    }
+    memory->destroy(params);
   }
-  memory->destroy(params);
   memory->destroy(mol2param);
 
   if (allocated) {
diff --git a/src/USER-DPD/pair_exp6_rx.h b/src/USER-DPD/pair_exp6_rx.h
index 33bd6e6623..45c046cc07 100644
--- a/src/USER-DPD/pair_exp6_rx.h
+++ b/src/USER-DPD/pair_exp6_rx.h
@@ -30,13 +30,21 @@ class PairExp6rx : public Pair {
   virtual ~PairExp6rx();
   virtual void compute(int, int);
   void settings(int, char **);
-  void coeff(int, char **);
+  virtual void coeff(int, char **);
   double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
 
+  struct Param {                 // one stored parameter set; params[] holds nparams of them
+    double epsilon,rm,alpha;     // potential parameters (presumably the exp-6 epsilon, Rm, alpha -- TODO confirm)
+    int ispecies;                // species index (presumably into the rx species list -- TODO confirm)
+    char *name, *potential;      // names of unique molecules and interaction type; freed with delete[] in the destructor
+    char *tablename;             // name of interaction table; NOTE(review): not freed in the visible destructor hunk -- confirm ownership
+   int potentialType;              // enumerated interaction potential type
+  };
+
  protected:
   enum{LINEAR};
   enum{NONE,EXPONENT,POLYNOMIAL};
@@ -45,21 +53,14 @@ class PairExp6rx : public Pair {
   double **epsilon,**rm,**alpha;
   double **rminv,**buck1,**buck2,**offset;
 
-  void allocate();
+  virtual void allocate();
   int *mol2param;               // mapping from molecule to parameters
   int nparams;                  // # of stored parameter sets
   int maxparam;                 // max # of parameter sets
-  struct Param {
-    double epsilon,rm,alpha;
-    int ispecies;
-    char *name, *potential;      // names of unique molecules and interaction type
-    char *tablename;             // name of interaction table
-   int potentialType;              // enumerated interaction potential type.
-  };
   Param *params;                // parameter set for an I-J-K interaction
 
   int nspecies;
-  void read_file(char *);
+  virtual void read_file(char *);
   void read_file2(char *);
   void setup();
 
diff --git a/src/USER-DPD/pair_multi_lucy_rx.cpp b/src/USER-DPD/pair_multi_lucy_rx.cpp
index 43d4114741..4628edbc12 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.cpp
+++ b/src/USER-DPD/pair_multi_lucy_rx.cpp
@@ -85,6 +85,8 @@ PairMultiLucyRX::PairMultiLucyRX(LAMMPS *lmp) : Pair(lmp),
 
 PairMultiLucyRX::~PairMultiLucyRX()
 {
+  if (copymode) return;
+
   for (int m = 0; m < ntables; m++) free_table(&tables[m]);
   memory->sfree(tables);
 
diff --git a/src/USER-DPD/pair_multi_lucy_rx.h b/src/USER-DPD/pair_multi_lucy_rx.h
index 092f40f1d1..2bfa5d20e3 100644
--- a/src/USER-DPD/pair_multi_lucy_rx.h
+++ b/src/USER-DPD/pair_multi_lucy_rx.h
@@ -30,17 +30,17 @@ class PairMultiLucyRX : public Pair {
   virtual ~PairMultiLucyRX();
 
   virtual void compute(int, int);
-  void settings(int, char **);
+  virtual void settings(int, char **);
   void coeff(int, char **);
   double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
-  int pack_forward_comm(int, int *, double *, int, int *);
-  void unpack_forward_comm(int, int, double *);
-  int pack_reverse_comm(int, int, double *);
-  void unpack_reverse_comm(int, int *, double *);
+  virtual int pack_forward_comm(int, int *, double *, int, int *);
+  virtual void unpack_forward_comm(int, int, double *);
+  virtual int pack_reverse_comm(int, int, double *);
+  virtual void unpack_reverse_comm(int, int *, double *);
   void computeLocalDensity();
   double rho_0;
 
@@ -64,7 +64,7 @@ class PairMultiLucyRX : public Pair {
 
   int **tabindex;
 
-  void allocate();
+  virtual void allocate();
   void read_table(Table *, char *, char *);
   void param_extract(Table *, char *);
   void bcast_table(Table *);
diff --git a/src/USER-DPD/pair_table_rx.cpp b/src/USER-DPD/pair_table_rx.cpp
index 2529d33f91..89d09e7322 100644
--- a/src/USER-DPD/pair_table_rx.cpp
+++ b/src/USER-DPD/pair_table_rx.cpp
@@ -33,8 +33,6 @@ using namespace LAMMPS_NS;
 
 enum{NONE,RLINEAR,RSQ,BMP};
 
-#define MAXLINE 1024
-
 #ifdef DBL_EPSILON
   #define MY_EPSILON (10.0*DBL_EPSILON)
 #else
@@ -46,25 +44,19 @@ enum{NONE,RLINEAR,RSQ,BMP};
 
 /* ---------------------------------------------------------------------- */
 
-PairTableRX::PairTableRX(LAMMPS *lmp) : Pair(lmp)
+PairTableRX::PairTableRX(LAMMPS *lmp) : PairTable(lmp)
 {
-  ntables = 0;
-  tables = NULL;
   fractionalWeighting = true;
+  site1 = NULL;
+  site2 = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
 
 PairTableRX::~PairTableRX()
 {
-  for (int m = 0; m < ntables; m++) free_table(&tables[m]);
-  memory->sfree(tables);
-
-  if (allocated) {
-    memory->destroy(setflag);
-    memory->destroy(cutsq);
-    memory->destroy(tabindex);
-  }
+  delete [] site1;
+  delete [] site2;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -252,24 +244,6 @@ void PairTableRX::compute(int eflag, int vflag)
   memory->destroy(mixWtSite2);
 }
 
-/* ----------------------------------------------------------------------
-   allocate all arrays
-------------------------------------------------------------------------- */
-
-void PairTableRX::allocate()
-{
-  allocated = 1;
-  const int nt = atom->ntypes + 1;
-
-  memory->create(setflag,nt,nt,"pair:setflag");
-  memory->create(cutsq,nt,nt,"pair:cutsq");
-  memory->create(tabindex,nt,nt,"pair:tabindex");
-
-  memset(&setflag[0][0],0,nt*nt*sizeof(int));
-  memset(&cutsq[0][0],0,nt*nt*sizeof(double));
-  memset(&tabindex[0][0],0,nt*nt*sizeof(int));
-}
-
 /* ----------------------------------------------------------------------
    global settings
 ------------------------------------------------------------------------- */
@@ -299,8 +273,8 @@ void PairTableRX::settings(int narg, char **arg)
     else if (strcmp(arg[iarg],"msm") == 0) msmflag = 1;
     else if (strcmp(arg[iarg],"dispersion") == 0) dispersionflag = 1;
     else if (strcmp(arg[iarg],"tip4p") == 0) tip4pflag = 1;
-    else if (strcmp(arg[iarg],"fractional") == 0)   fractionalWeighting = true;
-    else if (strcmp(arg[iarg],"molecular") == 0)   fractionalWeighting = false;
+    else if (strcmp(arg[iarg],"fractional") == 0) fractionalWeighting = true;
+    else if (strcmp(arg[iarg],"molecular") == 0) fractionalWeighting = false;
     else error->all(FLERR,"Illegal pair_style command");
     iarg++;
   }
@@ -462,602 +436,6 @@ void PairTableRX::coeff(int narg, char **arg)
 
 }
 
-/* ----------------------------------------------------------------------
-   init for one type pair i,j and corresponding j,i
-------------------------------------------------------------------------- */
-
-double PairTableRX::init_one(int i, int j)
-{
-  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
-
-  tabindex[j][i] = tabindex[i][j];
-
-  return tables[tabindex[i][j]].cut;
-}
-
-/* ----------------------------------------------------------------------
-   read a table section from a tabulated potential file
-   only called by proc 0
-   this function sets these values in Table:
-     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi,ntablebits
-------------------------------------------------------------------------- */
-
-void PairTableRX::read_table(Table *tb, char *file, char *keyword)
-{
-  char line[MAXLINE];
-
-  // open file
-
-  FILE *fp = force->open_potential(file);
-  if (fp == NULL) {
-    char str[128];
-    sprintf(str,"Cannot open file %s",file);
-    error->one(FLERR,str);
-  }
-
-  // loop until section found with matching keyword
-
-  while (1) {
-    if (fgets(line,MAXLINE,fp) == NULL)
-      error->one(FLERR,"Did not find keyword in table file");
-    if (strspn(line," \t\n\r") == strlen(line)) continue;  // blank line
-    if (line[0] == '#') continue;                          // comment
-    char *word = strtok(line," \t\n\r");
-    if (strcmp(word,keyword) == 0) break;           // matching keyword
-    fgets(line,MAXLINE,fp);                         // no match, skip section
-    param_extract(tb,line);
-    fgets(line,MAXLINE,fp);
-    for (int i = 0; i < tb->ninput; i++) fgets(line,MAXLINE,fp);
-  }
-
-  // read args on 2nd line of section
-  // allocate table arrays for file values
-
-  fgets(line,MAXLINE,fp);
-  param_extract(tb,line);
-  memory->create(tb->rfile,tb->ninput,"pair:rfile");
-  memory->create(tb->efile,tb->ninput,"pair:efile");
-  memory->create(tb->ffile,tb->ninput,"pair:ffile");
-
-  // setup bitmap parameters for table to read in
-
-  tb->ntablebits = 0;
-  int masklo,maskhi,nmask,nshiftbits;
-  if (tb->rflag == BMP) {
-    while (1 << tb->ntablebits < tb->ninput) tb->ntablebits++;
-    if (1 << tb->ntablebits != tb->ninput)
-      error->one(FLERR,"Bitmapped table is incorrect length in table file");
-    init_bitmap(tb->rlo,tb->rhi,tb->ntablebits,masklo,maskhi,nmask,nshiftbits);
-  }
-
-  // read r,e,f table values from file
-  // if rflag set, compute r
-  // if rflag not set, use r from file
-
-  int itmp;
-  double rtmp;
-  union_int_float_t rsq_lookup;
-
-  fgets(line,MAXLINE,fp);
-  for (int i = 0; i < tb->ninput; i++) {
-    fgets(line,MAXLINE,fp);
-    sscanf(line,"%d %lg %lg %lg",&itmp,&rtmp,&tb->efile[i],&tb->ffile[i]);
-
-    if (tb->rflag == RLINEAR)
-      rtmp = tb->rlo + (tb->rhi - tb->rlo)*i/(tb->ninput-1);
-    else if (tb->rflag == RSQ) {
-      rtmp = tb->rlo*tb->rlo +
-        (tb->rhi*tb->rhi - tb->rlo*tb->rlo)*i/(tb->ninput-1);
-      rtmp = sqrt(rtmp);
-    } else if (tb->rflag == BMP) {
-      rsq_lookup.i = i << nshiftbits;
-      rsq_lookup.i |= masklo;
-      if (rsq_lookup.f < tb->rlo*tb->rlo) {
-        rsq_lookup.i = i << nshiftbits;
-        rsq_lookup.i |= maskhi;
-      }
-      rtmp = sqrtf(rsq_lookup.f);
-    }
-
-    tb->rfile[i] = rtmp;
-  }
-
-  // close file
-
-  fclose(fp);
-}
-
-/* ----------------------------------------------------------------------
-   broadcast read-in table info from proc 0 to other procs
-   this function communicates these values in Table:
-     ninput,rfile,efile,ffile,rflag,rlo,rhi,fpflag,fplo,fphi
-------------------------------------------------------------------------- */
-
-void PairTableRX::bcast_table(Table *tb)
-{
-  MPI_Bcast(&tb->ninput,1,MPI_INT,0,world);
-
-  int me;
-  MPI_Comm_rank(world,&me);
-  if (me > 0) {
-    memory->create(tb->rfile,tb->ninput,"pair:rfile");
-    memory->create(tb->efile,tb->ninput,"pair:efile");
-    memory->create(tb->ffile,tb->ninput,"pair:ffile");
-  }
-
-  MPI_Bcast(tb->rfile,tb->ninput,MPI_DOUBLE,0,world);
-  MPI_Bcast(tb->efile,tb->ninput,MPI_DOUBLE,0,world);
-  MPI_Bcast(tb->ffile,tb->ninput,MPI_DOUBLE,0,world);
-
-  MPI_Bcast(&tb->rflag,1,MPI_INT,0,world);
-  if (tb->rflag) {
-    MPI_Bcast(&tb->rlo,1,MPI_DOUBLE,0,world);
-    MPI_Bcast(&tb->rhi,1,MPI_DOUBLE,0,world);
-  }
-  MPI_Bcast(&tb->fpflag,1,MPI_INT,0,world);
-  if (tb->fpflag) {
-    MPI_Bcast(&tb->fplo,1,MPI_DOUBLE,0,world);
-    MPI_Bcast(&tb->fphi,1,MPI_DOUBLE,0,world);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   build spline representation of e,f over entire range of read-in table
-   this function sets these values in Table: e2file,f2file
-------------------------------------------------------------------------- */
-
-void PairTableRX::spline_table(Table *tb)
-{
-  memory->create(tb->e2file,tb->ninput,"pair:e2file");
-  memory->create(tb->f2file,tb->ninput,"pair:f2file");
-
-  double ep0 = - tb->ffile[0];
-  double epn = - tb->ffile[tb->ninput-1];
-  spline(tb->rfile,tb->efile,tb->ninput,ep0,epn,tb->e2file);
-
-  if (tb->fpflag == 0) {
-    tb->fplo = (tb->ffile[1] - tb->ffile[0]) / (tb->rfile[1] - tb->rfile[0]);
-    tb->fphi = (tb->ffile[tb->ninput-1] - tb->ffile[tb->ninput-2]) /
-      (tb->rfile[tb->ninput-1] - tb->rfile[tb->ninput-2]);
-  }
-
-  double fp0 = tb->fplo;
-  double fpn = tb->fphi;
-  spline(tb->rfile,tb->ffile,tb->ninput,fp0,fpn,tb->f2file);
-}
-
-/* ----------------------------------------------------------------------
-   extract attributes from parameter line in table section
-   format of line: N value R/RSQ/BITMAP lo hi FP fplo fphi
-   N is required, other params are optional
-------------------------------------------------------------------------- */
-
-void PairTableRX::param_extract(Table *tb, char *line)
-{
-  tb->ninput = 0;
-  tb->rflag = NONE;
-  tb->fpflag = 0;
-
-  char *word = strtok(line," \t\n\r\f");
-  while (word) {
-    if (strcmp(word,"N") == 0) {
-      word = strtok(NULL," \t\n\r\f");
-      tb->ninput = atoi(word);
-    } else if (strcmp(word,"R") == 0 || strcmp(word,"RSQ") == 0 ||
-               strcmp(word,"BITMAP") == 0) {
-      if (strcmp(word,"R") == 0) tb->rflag = RLINEAR;
-      else if (strcmp(word,"RSQ") == 0) tb->rflag = RSQ;
-      else if (strcmp(word,"BITMAP") == 0) tb->rflag = BMP;
-      word = strtok(NULL," \t\n\r\f");
-      tb->rlo = atof(word);
-      word = strtok(NULL," \t\n\r\f");
-      tb->rhi = atof(word);
-    } else if (strcmp(word,"FP") == 0) {
-      tb->fpflag = 1;
-      word = strtok(NULL," \t\n\r\f");
-      tb->fplo = atof(word);
-      word = strtok(NULL," \t\n\r\f");
-      tb->fphi = atof(word);
-    } else {
-      printf("WORD: %s\n",word);
-      error->one(FLERR,"Invalid keyword in pair table parameters");
-    }
-    word = strtok(NULL," \t\n\r\f");
-  }
-
-  if (tb->ninput == 0) error->one(FLERR,"Pair table parameters did not set N");
-}
-
-/* ----------------------------------------------------------------------
-   compute r,e,f vectors from splined values
-------------------------------------------------------------------------- */
-
-void PairTableRX::compute_table(Table *tb)
-{
-  int tlm1 = tablength-1;
-
-  // inner = inner table bound
-  // cut = outer table bound
-  // delta = table spacing in rsq for N-1 bins
-
-  double inner;
-  if (tb->rflag) inner = tb->rlo;
-  else inner = tb->rfile[0];
-  tb->innersq = double(inner)*double(inner);
-  tb->delta = double(tb->cut*tb->cut - double(tb->innersq)) / double(tlm1);
-  tb->invdelta = 1.0/double(tb->delta);
-
-  // direct lookup tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // e,f = value at midpt of bin
-  // e,f are N-1 in length since store 1 value at bin midpt
-  // f is converted to f/r when stored in f[i]
-  // e,f are never a match to read-in values, always computed via spline interp
-
-  if (tabstyle == LOOKUP) {
-    memory->create(tb->e,tlm1,"pair:e");
-    memory->create(tb->f,tlm1,"pair:f");
-
-    double r,rsq;
-    for (int i = 0; i < tlm1; i++) {
-      rsq = tb->innersq + (i+0.5)*tb->delta;
-      r = sqrt(rsq);
-      tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-      tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-    }
-  }
-
-  // linear tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // rsq,e,f = value at lower edge of bin
-  // de,df values = delta from lower edge to upper edge of bin
-  // rsq,e,f are N in length so de,df arrays can compute difference
-  // f is converted to f/r when stored in f[i]
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == LINEAR) {
-    memory->create(tb->rsq,tablength,"pair:rsq");
-    memory->create(tb->e,tablength,"pair:e");
-    memory->create(tb->f,tablength,"pair:f");
-    memory->create(tb->de,tlm1,"pair:de");
-    memory->create(tb->df,tlm1,"pair:df");
-
-    double r,rsq;
-    for (int i = 0; i < tablength; i++) {
-      rsq = tb->innersq + i*tb->delta;
-      r = sqrt(rsq);
-      tb->rsq[i] = rsq;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-      }
-    }
-
-    for (int i = 0; i < tlm1; i++) {
-      tb->de[i] = tb->e[i+1] - tb->e[i];
-      tb->df[i] = tb->f[i+1] - tb->f[i];
-    }
-  }
-
-  // cubic spline tables
-  // N-1 evenly spaced bins in rsq from inner to cut
-  // rsq,e,f = value at lower edge of bin
-  // e2,f2 = spline coefficient for each bin
-  // rsq,e,f,e2,f2 are N in length so have N-1 spline bins
-  // f is converted to f/r after e is splined
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == SPLINE) {
-    memory->create(tb->rsq,tablength,"pair:rsq");
-    memory->create(tb->e,tablength,"pair:e");
-    memory->create(tb->f,tablength,"pair:f");
-    memory->create(tb->e2,tablength,"pair:e2");
-    memory->create(tb->f2,tablength,"pair:f2");
-
-    tb->deltasq6 = tb->delta*tb->delta / 6.0;
-
-    double r,rsq;
-    for (int i = 0; i < tablength; i++) {
-      rsq = tb->innersq + i*tb->delta;
-      r = sqrt(rsq);
-      tb->rsq[i] = rsq;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r);
-      }
-    }
-
-    // ep0,epn = dh/dg at inner and at cut
-    // h(r) = e(r) and g(r) = r^2
-    // dh/dg = (de/dr) / 2r = -f/2r
-
-    double ep0 = - tb->f[0] / (2.0 * sqrt(tb->innersq));
-    double epn = - tb->f[tlm1] / (2.0 * tb->cut);
-    spline(tb->rsq,tb->e,tablength,ep0,epn,tb->e2);
-
-    // fp0,fpn = dh/dg at inner and at cut
-    // h(r) = f(r)/r and g(r) = r^2
-    // dh/dg = (1/r df/dr - f/r^2) / 2r
-    // dh/dg in secant approx = (f(r2)/r2 - f(r1)/r1) / (g(r2) - g(r1))
-
-    double fp0,fpn;
-    double secant_factor = 0.1;
-    if (tb->fpflag) fp0 = (tb->fplo/sqrt(tb->innersq) - tb->f[0]/tb->innersq) /
-      (2.0 * sqrt(tb->innersq));
-    else {
-      double rsq1 = tb->innersq;
-      double rsq2 = rsq1 + secant_factor*tb->delta;
-      fp0 = (splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq2)) /
-             sqrt(rsq2) - tb->f[0] / sqrt(rsq1)) / (secant_factor*tb->delta);
-    }
-
-    if (tb->fpflag && tb->cut == tb->rfile[tb->ninput-1]) fpn =
-      (tb->fphi/tb->cut - tb->f[tlm1]/(tb->cut*tb->cut)) / (2.0 * tb->cut);
-    else {
-      double rsq2 = tb->cut * tb->cut;
-      double rsq1 = rsq2 - secant_factor*tb->delta;
-      fpn = (tb->f[tlm1] / sqrt(rsq2) -
-             splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,sqrt(rsq1)) /
-             sqrt(rsq1)) / (secant_factor*tb->delta);
-    }
-
-    for (int i = 0; i < tablength; i++) tb->f[i] /= sqrt(tb->rsq[i]);
-    spline(tb->rsq,tb->f,tablength,fp0,fpn,tb->f2);
-  }
-
-  // bitmapped linear tables
-  // 2^N bins from inner to cut, spaced in bitmapped manner
-  // f is converted to f/r when stored in f[i]
-  // e,f can match read-in values, else compute via spline interp
-
-  if (tabstyle == BITMAP) {
-    double r;
-    union_int_float_t rsq_lookup;
-    int masklo,maskhi;
-
-    // linear lookup tables of length ntable = 2^n
-    // stored value = value at lower edge of bin
-
-    init_bitmap(inner,tb->cut,tablength,masklo,maskhi,tb->nmask,tb->nshiftbits);
-    int ntable = 1 << tablength;
-    int ntablem1 = ntable - 1;
-
-    memory->create(tb->rsq,ntable,"pair:rsq");
-    memory->create(tb->e,ntable,"pair:e");
-    memory->create(tb->f,ntable,"pair:f");
-    memory->create(tb->de,ntable,"pair:de");
-    memory->create(tb->df,ntable,"pair:df");
-    memory->create(tb->drsq,ntable,"pair:drsq");
-
-    union_int_float_t minrsq_lookup;
-    minrsq_lookup.i = 0 << tb->nshiftbits;
-    minrsq_lookup.i |= maskhi;
-
-    for (int i = 0; i < ntable; i++) {
-      rsq_lookup.i = i << tb->nshiftbits;
-      rsq_lookup.i |= masklo;
-      if (rsq_lookup.f < tb->innersq) {
-        rsq_lookup.i = i << tb->nshiftbits;
-        rsq_lookup.i |= maskhi;
-      }
-      r = sqrtf(rsq_lookup.f);
-      tb->rsq[i] = rsq_lookup.f;
-      if (tb->match) {
-        tb->e[i] = tb->efile[i];
-        tb->f[i] = tb->ffile[i]/r;
-      } else {
-        tb->e[i] = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        tb->f[i] = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-      }
-      minrsq_lookup.f = MIN(minrsq_lookup.f,rsq_lookup.f);
-    }
-
-    tb->innersq = minrsq_lookup.f;
-
-    for (int i = 0; i < ntablem1; i++) {
-      tb->de[i] = tb->e[i+1] - tb->e[i];
-      tb->df[i] = tb->f[i+1] - tb->f[i];
-      tb->drsq[i] = 1.0/(tb->rsq[i+1] - tb->rsq[i]);
-    }
-
-    // get the delta values for the last table entries
-    // tables are connected periodically between 0 and ntablem1
-
-    tb->de[ntablem1] = tb->e[0] - tb->e[ntablem1];
-    tb->df[ntablem1] = tb->f[0] - tb->f[ntablem1];
-    tb->drsq[ntablem1] = 1.0/(tb->rsq[0] - tb->rsq[ntablem1]);
-
-    // get the correct delta values at itablemax
-    // smallest r is in bin itablemin
-    // largest r is in bin itablemax, which is itablemin-1,
-    //   or ntablem1 if itablemin=0
-
-    // deltas at itablemax only needed if corresponding rsq < cut*cut
-    // if so, compute deltas between rsq and cut*cut
-    //   if tb->match, data at cut*cut is unavailable, so we'll take
-    //   deltas at itablemax-1 as a good approximation
-
-    double e_tmp,f_tmp;
-    int itablemin = minrsq_lookup.i & tb->nmask;
-    itablemin >>= tb->nshiftbits;
-    int itablemax = itablemin - 1;
-    if (itablemin == 0) itablemax = ntablem1;
-    int itablemaxm1 = itablemax - 1;
-    if (itablemax == 0) itablemaxm1 = ntablem1;
-    rsq_lookup.i = itablemax << tb->nshiftbits;
-    rsq_lookup.i |= maskhi;
-    if (rsq_lookup.f < tb->cut*tb->cut) {
-      if (tb->match) {
-        tb->de[itablemax] = tb->de[itablemaxm1];
-        tb->df[itablemax] = tb->df[itablemaxm1];
-        tb->drsq[itablemax] = tb->drsq[itablemaxm1];
-      } else {
-            rsq_lookup.f = tb->cut*tb->cut;
-        r = sqrtf(rsq_lookup.f);
-        e_tmp = splint(tb->rfile,tb->efile,tb->e2file,tb->ninput,r);
-        f_tmp = splint(tb->rfile,tb->ffile,tb->f2file,tb->ninput,r)/r;
-        tb->de[itablemax] = e_tmp - tb->e[itablemax];
-        tb->df[itablemax] = f_tmp - tb->f[itablemax];
-        tb->drsq[itablemax] = 1.0/(rsq_lookup.f - tb->rsq[itablemax]);
-      }
-    }
-  }
-}
-
-/* ----------------------------------------------------------------------
-   set all ptrs in a table to NULL, so can be freed safely
-------------------------------------------------------------------------- */
-
-void PairTableRX::null_table(Table *tb)
-{
-  tb->rfile = tb->efile = tb->ffile = NULL;
-  tb->e2file = tb->f2file = NULL;
-  tb->rsq = tb->drsq = tb->e = tb->de = NULL;
-  tb->f = tb->df = tb->e2 = tb->f2 = NULL;
-}
-
-/* ----------------------------------------------------------------------
-   free all arrays in a table
-------------------------------------------------------------------------- */
-
-void PairTableRX::free_table(Table *tb)
-{
-  memory->destroy(tb->rfile);
-  memory->destroy(tb->efile);
-  memory->destroy(tb->ffile);
-  memory->destroy(tb->e2file);
-  memory->destroy(tb->f2file);
-
-  memory->destroy(tb->rsq);
-  memory->destroy(tb->drsq);
-  memory->destroy(tb->e);
-  memory->destroy(tb->de);
-  memory->destroy(tb->f);
-  memory->destroy(tb->df);
-  memory->destroy(tb->e2);
-  memory->destroy(tb->f2);
-}
-
-/* ----------------------------------------------------------------------
-   spline and splint routines modified from Numerical Recipes
-------------------------------------------------------------------------- */
-
-void PairTableRX::spline(double *x, double *y, int n,
-                       double yp1, double ypn, double *y2)
-{
-  int i,k;
-  double p,qn,sig,un;
-  double *u = new double[n];
-
-  if (yp1 > 0.99e30) y2[0] = u[0] = 0.0;
-  else {
-    y2[0] = -0.5;
-    u[0] = (3.0/(x[1]-x[0])) * ((y[1]-y[0]) / (x[1]-x[0]) - yp1);
-  }
-  for (i = 1; i < n-1; i++) {
-    sig = (x[i]-x[i-1]) / (x[i+1]-x[i-1]);
-    p = sig*y2[i-1] + 2.0;
-    y2[i] = (sig-1.0) / p;
-    u[i] = (y[i+1]-y[i]) / (x[i+1]-x[i]) - (y[i]-y[i-1]) / (x[i]-x[i-1]);
-    u[i] = (6.0*u[i] / (x[i+1]-x[i-1]) - sig*u[i-1]) / p;
-  }
-  if (ypn > 0.99e30) qn = un = 0.0;
-  else {
-    qn = 0.5;
-    un = (3.0/(x[n-1]-x[n-2])) * (ypn - (y[n-1]-y[n-2]) / (x[n-1]-x[n-2]));
-  }
-  y2[n-1] = (un-qn*u[n-2]) / (qn*y2[n-2] + 1.0);
-  for (k = n-2; k >= 0; k--) y2[k] = y2[k]*y2[k+1] + u[k];
-
-  delete [] u;
-}
-
-/* ---------------------------------------------------------------------- */
-
-double PairTableRX::splint(double *xa, double *ya, double *y2a, int n, double x)
-{
-  int klo,khi,k;
-  double h,b,a,y;
-
-  klo = 0;
-  khi = n-1;
-  while (khi-klo > 1) {
-    k = (khi+klo) >> 1;
-    if (xa[k] > x) khi = k;
-    else klo = k;
-  }
-  h = xa[khi]-xa[klo];
-  a = (xa[khi]-x) / h;
-  b = (x-xa[klo]) / h;
-  y = a*ya[klo] + b*ya[khi] +
-    ((a*a*a-a)*y2a[klo] + (b*b*b-b)*y2a[khi]) * (h*h)/6.0;
-  return y;
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairTableRX::write_restart(FILE *fp)
-{
-  write_restart_settings(fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairTableRX::read_restart(FILE *fp)
-{
-  read_restart_settings(fp);
-  allocate();
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairTableRX::write_restart_settings(FILE *fp)
-{
-  fwrite(&tabstyle,sizeof(int),1,fp);
-  fwrite(&tablength,sizeof(int),1,fp);
-  fwrite(&ewaldflag,sizeof(int),1,fp);
-  fwrite(&pppmflag,sizeof(int),1,fp);
-  fwrite(&msmflag,sizeof(int),1,fp);
-  fwrite(&dispersionflag,sizeof(int),1,fp);
-  fwrite(&tip4pflag,sizeof(int),1,fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairTableRX::read_restart_settings(FILE *fp)
-{
-  if (comm->me == 0) {
-    fread(&tabstyle,sizeof(int),1,fp);
-    fread(&tablength,sizeof(int),1,fp);
-    fread(&ewaldflag,sizeof(int),1,fp);
-    fread(&pppmflag,sizeof(int),1,fp);
-    fread(&msmflag,sizeof(int),1,fp);
-    fread(&dispersionflag,sizeof(int),1,fp);
-    fread(&tip4pflag,sizeof(int),1,fp);
-  }
-  MPI_Bcast(&tabstyle,1,MPI_INT,0,world);
-  MPI_Bcast(&tablength,1,MPI_INT,0,world);
-  MPI_Bcast(&ewaldflag,1,MPI_INT,0,world);
-  MPI_Bcast(&pppmflag,1,MPI_INT,0,world);
-  MPI_Bcast(&msmflag,1,MPI_INT,0,world);
-  MPI_Bcast(&dispersionflag,1,MPI_INT,0,world);
-  MPI_Bcast(&tip4pflag,1,MPI_INT,0,world);
-}
-
 /* ---------------------------------------------------------------------- */
 
 double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
@@ -1129,26 +507,6 @@ double PairTableRX::single(int i, int j, int itype, int jtype, double rsq,
   return factor_lj*phi;
 }
 
-/* ----------------------------------------------------------------------
-   return the Coulomb cutoff for tabled potentials
-   called by KSpace solvers which require that all pairwise cutoffs be the same
-   loop over all tables not just those indexed by tabindex[i][j] since
-     no way to know which tables are active since pair::init() not yet called
-------------------------------------------------------------------------- */
-
-void *PairTableRX::extract(const char *str, int &dim)
-{
-  if (strcmp(str,"cut_coul") != 0) return NULL;
-  if (ntables == 0) error->all(FLERR,"All pair coeffs are not set");
-
-  double cut_coul = tables[0].cut;
-  for (int m = 1; m < ntables; m++)
-    if (tables[m].cut != cut_coul)
-      error->all(FLERR,"Pair table cutoffs must all be equal to use with KSpace");
-  dim = 0;
-  return &tables[0].cut;
-}
-
 /* ---------------------------------------------------------------------- */
 
 void PairTableRX::getMixingWeights(int id, double &mixWtSite1old, double &mixWtSite2old, double &mixWtSite1, double &mixWtSite2)
diff --git a/src/USER-DPD/pair_table_rx.h b/src/USER-DPD/pair_table_rx.h
index 34b9fd75ce..da7889e99a 100644
--- a/src/USER-DPD/pair_table_rx.h
+++ b/src/USER-DPD/pair_table_rx.h
@@ -20,11 +20,11 @@ PairStyle(table/rx,PairTableRX)
 #ifndef LMP_PAIR_TABLE_RX_H
 #define LMP_PAIR_TABLE_RX_H
 
-#include "pair.h"
+#include "pair_table.h"
 
 namespace LAMMPS_NS {
 
-class PairTableRX : public Pair {
+class PairTableRX : public PairTable {
  public:
   PairTableRX(class LAMMPS *);
   virtual ~PairTableRX();
@@ -32,42 +32,9 @@ class PairTableRX : public Pair {
   virtual void compute(int, int);
   void settings(int, char **);
   void coeff(int, char **);
-  double init_one(int, int);
-  void write_restart(FILE *);
-  void read_restart(FILE *);
-  void write_restart_settings(FILE *);
-  void read_restart_settings(FILE *);
-  double single(int, int, int, int, double, double, double, double &);
-  void *extract(const char *, int &);
+  virtual double single(int, int, int, int, double, double, double, double &);
 
  protected:
-  enum{LOOKUP,LINEAR,SPLINE,BITMAP};
-
-  int tabstyle,tablength;
-  struct Table {
-    int ninput,rflag,fpflag,match,ntablebits;
-    int nshiftbits,nmask;
-    double rlo,rhi,fplo,fphi,cut;
-    double *rfile,*efile,*ffile;
-    double *e2file,*f2file;
-    double innersq,delta,invdelta,deltasq6;
-    double *rsq,*drsq,*e,*de,*f,*df,*e2,*f2;
-  };
-  int ntables;
-  Table *tables;
-
-  int **tabindex;
-
-  void allocate();
-  void read_table(Table *, char *, char *);
-  void param_extract(Table *, char *);
-  void bcast_table(Table *);
-  void spline_table(Table *);
-  void compute_table(Table *);
-  void null_table(Table *);
-  void free_table(Table *);
-  void spline(double *, double *, int, double, double, double *);
-  double splint(double *, double *, double *, int, double);
 
   int nspecies;
   char *site1, *site2;
diff --git a/src/atom.cpp b/src/atom.cpp
index d4c00bc0a5..1191f0f2b5 100644
--- a/src/atom.cpp
+++ b/src/atom.cpp
@@ -103,7 +103,6 @@ Atom::Atom(LAMMPS *lmp) : Pointers(lmp)
   uCond = uMech = uChem = uCG = uCGnew = NULL;
   duChem = NULL;
   dpdTheta = NULL;
-  ssaAIR = NULL;
 
   // USER-MESO
 
@@ -305,7 +304,6 @@ Atom::~Atom()
   memory->destroy(uCG);
   memory->destroy(uCGnew);
   memory->destroy(duChem);
-  memory->destroy(ssaAIR);
 
   memory->destroy(cc);
   memory->destroy(cc_flux);
@@ -346,9 +344,11 @@ Atom::~Atom()
     delete [] iname[i];
     memory->destroy(ivector[i]);
   }
-  for (int i = 0; i < ndvector; i++) {
-    delete [] dname[i];
-    memory->destroy(dvector[i]);
+  if (dvector != NULL) {
+    for (int i = 0; i < ndvector; i++) {
+      delete [] dname[i];
+      memory->destroy(dvector[i]);
+    }
   }
 
   memory->sfree(iname);
diff --git a/src/atom.h b/src/atom.h
index 29a1c5d69e..007142a1c0 100644
--- a/src/atom.h
+++ b/src/atom.h
@@ -93,7 +93,6 @@ class Atom : protected Pointers {
   double *duChem;
   double *dpdTheta;
   int nspecies_dpd;
-  int *ssaAIR; // Shardlow Splitting Algorithm Active Interaction Region number
 
   // USER-MESO package
 
@@ -262,8 +261,8 @@ class Atom : protected Pointers {
   void update_callback(int);
 
   int find_custom(const char *, int &);
-  int add_custom(const char *, int);
-  void remove_custom(int, int);
+  virtual int add_custom(const char *, int);
+  virtual void remove_custom(int, int);
 
   virtual void sync_modify(ExecutionSpace, unsigned int, unsigned int) {}
 
diff --git a/src/atom_masks.h b/src/atom_masks.h
index 119f09f273..8e29448488 100644
--- a/src/atom_masks.h
+++ b/src/atom_masks.h
@@ -42,6 +42,18 @@
 #define ENERGY_MASK    0x00010000
 #define VIRIAL_MASK    0x00020000
 
+// DPD
+
+#define DPDRHO_MASK       0x00040000
+#define DPDTHETA_MASK     0x00080000
+#define UCOND_MASK        0x00100000
+#define UMECH_MASK        0x00200000
+#define UCHEM_MASK        0x00400000
+#define UCG_MASK          0x00800000
+#define UCGNEW_MASK       0x01000000
+#define DUCHEM_MASK       0x02000000
+#define DVECTOR_MASK      0x04000000
+
 // granular
 
 #define RADIUS_MASK    0x00100000
diff --git a/src/fix_property_atom.cpp b/src/fix_property_atom.cpp
index b83aadc95d..002260d8f0 100644
--- a/src/fix_property_atom.cpp
+++ b/src/fix_property_atom.cpp
@@ -134,7 +134,6 @@ FixPropertyAtom::FixPropertyAtom(LAMMPS *lmp, int narg, char **arg) :
   // register with Atom class
 
   nmax_old = 0;
-  grow_arrays(atom->nmax);
   atom->add_callback(0);
   atom->add_callback(1);
   if (border) atom->add_callback(2);
@@ -190,6 +189,8 @@ int FixPropertyAtom::setmask()
 
 void FixPropertyAtom::init()
 {
+  grow_arrays(atom->nmax);
+
   // error if atom style has changed since fix was defined
   // don't allow this b/c user could change to style that defines molecule,q
 
diff --git a/src/fix_property_atom.h b/src/fix_property_atom.h
index 77a41f393a..d923d76cac 100644
--- a/src/fix_property_atom.h
+++ b/src/fix_property_atom.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class FixPropertyAtom : public Fix {
  public:
   FixPropertyAtom(class LAMMPS *, int, char **);
-  ~FixPropertyAtom();
+  virtual ~FixPropertyAtom();
   int setmask();
   void init();
 
@@ -38,7 +38,7 @@ class FixPropertyAtom : public Fix {
   void write_data_section_keyword(int, FILE *);
   void write_data_section(int, FILE *, int, double **, int);
 
-  void grow_arrays(int);
+  virtual void grow_arrays(int);
   void copy_arrays(int, int, int);
   int pack_border(int, int *, double *);
   int unpack_border(int, int, double *);
@@ -50,7 +50,7 @@ class FixPropertyAtom : public Fix {
   int maxsize_restart();
   double memory_usage();
 
- private:
+ protected:
   int nvalue,border;
   int molecule_flag,q_flag,rmass_flag;
   int *style,*index;
diff --git a/src/fix_wall.cpp b/src/fix_wall.cpp
index 503b87f4a7..8b569cafc6 100644
--- a/src/fix_wall.cpp
+++ b/src/fix_wall.cpp
@@ -201,6 +201,8 @@ FixWall::FixWall(LAMMPS *lmp, int narg, char **arg) :
 
 FixWall::~FixWall()
 {
+  if (copymode) return;
+
   for (int m = 0; m < nwall; m++) {
     delete [] xstr[m];
     delete [] estr[m];
diff --git a/src/fix_wall_lj93.h b/src/fix_wall_lj93.h
index 40337a5176..3763a02910 100644
--- a/src/fix_wall_lj93.h
+++ b/src/fix_wall_lj93.h
@@ -28,9 +28,9 @@ class FixWallLJ93 : public FixWall {
  public:
   FixWallLJ93(class LAMMPS *, int, char **);
   void precompute(int);
-  void wall_particle(int, int, double);
+  virtual void wall_particle(int, int, double);
 
- private:
+ protected:
   double coeff1[6],coeff2[6],coeff3[6],coeff4[6],offset[6];
 };
 
diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp
index a5ca7a5366..dde544a69f 100644
--- a/src/neigh_list.cpp
+++ b/src/neigh_list.cpp
@@ -79,7 +79,8 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp)
 
   // USER-DPD package
 
-  ndxAIR_ssa = NULL;
+  for (int i = 0; i < 8; i++) AIRct_ssa[i] = 0;
+  np = NULL;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -99,10 +100,6 @@ NeighList::~NeighList()
 
   delete [] iskip;
   memory->destroy(ijskip);
-
-  if (ssa) {
-    memory->sfree(ndxAIR_ssa);
-  }
 }
 
 /* ----------------------------------------------------------------------
@@ -203,14 +200,16 @@ void NeighList::grow(int nlocal, int nall)
   if (listmiddle) listmiddle->grow(nlocal,nall);
 
   // skip if data structs are already big enough
-
-  if (ghost) {
+  if (ssa) {
+    if ((nlocal * 3) + nall <= maxatom) return;
+  } else if (ghost) {
     if (nall <= maxatom) return;
   } else {
     if (nlocal <= maxatom) return;
   }
 
-  maxatom = atom->nmax;
+  if (ssa) maxatom = (nlocal * 3) + nall;
+  else maxatom = atom->nmax;
 
   memory->destroy(ilist);
   memory->destroy(numneigh);
@@ -224,12 +223,6 @@ void NeighList::grow(int nlocal, int nall)
     firstdouble = (double **) memory->smalloc(maxatom*sizeof(double *),
                                               "neighlist:firstdouble");
   }
-
-  if (ssa) {
-    if (ndxAIR_ssa) memory->sfree(ndxAIR_ssa);
-    ndxAIR_ssa = (uint16_t (*)[8]) memory->smalloc(sizeof(uint16_t)*8*maxatom,
-      "neighlist:ndxAIR_ssa");
-  }
 }
 
 /* ----------------------------------------------------------------------
@@ -306,7 +299,5 @@ bigint NeighList::memory_usage()
     }
   }
 
-  if (ndxAIR_ssa) bytes += sizeof(uint16_t) * 8 * maxatom;
-
   return bytes;
 }
diff --git a/src/neigh_list.h b/src/neigh_list.h
index 3fb3868114..4010a68857 100644
--- a/src/neigh_list.h
+++ b/src/neigh_list.h
@@ -81,7 +81,8 @@ class NeighList : protected Pointers {
 
   // USER-DPD package and Shardlow Splitting Algorithm (SSA) support
 
-  uint16_t (*ndxAIR_ssa)[8]; // for each atom, last neighbor index of each AIR
+  int AIRct_ssa[8]; // count of how many atoms in each AIR
+  class NPair *np;           // ptr to NPair instance I depend on
 
   // methods
 
diff --git a/src/neighbor.cpp b/src/neighbor.cpp
index 3943d876f9..a460be0065 100644
--- a/src/neighbor.cpp
+++ b/src/neighbor.cpp
@@ -808,7 +808,7 @@ int Neighbor::init_pair()
     }
 
     PairCreator pair_creator = pairclass[flag-1];
-    neigh_pair[i] = pair_creator(lmp);
+    lists[i]->np = neigh_pair[i] = pair_creator(lmp);
     neigh_pair[i]->post_constructor(requests[i]);
     neigh_pair[i]->istyle = flag;
 
diff --git a/src/nstencil.h b/src/nstencil.h
index 7985d23202..a4c6a4af66 100644
--- a/src/nstencil.h
+++ b/src/nstencil.h
@@ -30,6 +30,7 @@ class NStencil : protected Pointers {
   int *nstencil_multi;             // # bins in each type-based multi stencil
   int **stencil_multi;             // list of bin offsets in each stencil
   double **distsq_multi;           // sq distances to bins in each stencil
+  int sx,sy,sz;                    // extent of stencil in each dim
 
   double cutoff_custom;            // cutoff set by requestor
 
@@ -64,7 +65,6 @@ class NStencil : protected Pointers {
   int xyzflag;                     // 1 if stencilxyz is allocated
   int maxstencil;                  // max size of stencil
   int maxstencil_multi;            // max sizes of stencils
-  int sx,sy,sz;                    // extent of stencil in each dim
 
   int dimension;
 
diff --git a/src/pair.h b/src/pair.h
index b57004d965..0f7b0f85b6 100644
--- a/src/pair.h
+++ b/src/pair.h
@@ -211,10 +211,12 @@ class Pair : protected Pointers {
   double tabinner;                     // inner cutoff for Coulomb table
   double tabinner_disp;                 // inner cutoff for dispersion table
 
+ public:
   // custom data type for accessing Coulomb tables
 
   typedef union {int i; float f;} union_int_float_t;
 
+ protected:
   int vflag_fdotr;
   int maxeatom,maxvatom;
 
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index 31360d13ff..751560deff 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -39,9 +39,6 @@ PairHybrid::PairHybrid(LAMMPS *lmp) : Pair(lmp),
 
   outerflag = 0;
   respaflag = 0;
-
-  if (lmp->kokkos)
-    error->all(FLERR,"Cannot yet use pair hybrid with Kokkos");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -382,6 +379,9 @@ void PairHybrid::coeff(int narg, char **arg)
   if (narg < 3) error->all(FLERR,"Incorrect args for pair coefficients");
   if (!allocated) allocate();
 
+  if (lmp->kokkos)
+    error->all(FLERR,"Cannot yet use pair hybrid with Kokkos");
+
   int ilo,ihi,jlo,jhi;
   force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
   force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
diff --git a/src/pair_hybrid.h b/src/pair_hybrid.h
index b8b9af5f40..463ae00eca 100644
--- a/src/pair_hybrid.h
+++ b/src/pair_hybrid.h
@@ -35,7 +35,7 @@ class PairHybrid : public Pair {
  public:
   PairHybrid(class LAMMPS *);
   virtual ~PairHybrid();
-  void compute(int, int);
+  virtual void compute(int, int);
   void settings(int, char **);
   virtual void coeff(int, char **);
   void init_style();
diff --git a/src/pair_hybrid_overlay.h b/src/pair_hybrid_overlay.h
index 7fd0e3347f..934be05365 100644
--- a/src/pair_hybrid_overlay.h
+++ b/src/pair_hybrid_overlay.h
@@ -27,7 +27,7 @@ namespace LAMMPS_NS {
 class PairHybridOverlay : public PairHybrid {
  public:
   PairHybridOverlay(class LAMMPS *);
-  ~PairHybridOverlay() {}
+  virtual ~PairHybridOverlay() {}
   void coeff(int, char **);
 };
 
diff --git a/src/pair_table.h b/src/pair_table.h
index 2de4b6ea99..b723fd2d98 100644
--- a/src/pair_table.h
+++ b/src/pair_table.h
@@ -30,19 +30,19 @@ class PairTable : public Pair {
   virtual ~PairTable();
 
   virtual void compute(int, int);
-  void settings(int, char **);
+  virtual void settings(int, char **);
   void coeff(int, char **);
-  double init_one(int, int);
+  virtual double init_one(int, int);
   void write_restart(FILE *);
   void read_restart(FILE *);
   void write_restart_settings(FILE *);
   void read_restart_settings(FILE *);
-  double single(int, int, int, int, double, double, double, double &);
+  virtual double single(int, int, int, int, double, double, double, double &);
   void *extract(const char *, int &);
 
- protected:
   enum{LOOKUP,LINEAR,SPLINE,BITMAP};
 
+ protected:
   int tabstyle,tablength;
   struct Table {
     int ninput,rflag,fpflag,match,ntablebits;
@@ -66,8 +66,8 @@ class PairTable : public Pair {
   virtual void compute_table(Table *);
   void null_table(Table *);
   void free_table(Table *);
-  void spline(double *, double *, int, double, double, double *);
-  double splint(double *, double *, double *, int, double);
+  static void spline(double *, double *, int, double, double, double *);
+  static double splint(double *, double *, double *, int, double);
 };
 
 }
diff --git a/src/read_restart.cpp b/src/read_restart.cpp
index 6a950353ef..82583bfe01 100644
--- a/src/read_restart.cpp
+++ b/src/read_restart.cpp
@@ -207,7 +207,13 @@ void ReadRestart::command(int narg, char **arg)
     memory->create(buf,assignedChunkSize,"read_restart:buf");
     mpiio->read((headerOffset+assignedChunkOffset),assignedChunkSize,buf);
     mpiio->close();
-
+    if (!nextra) { // We can actually calculate number of atoms from assignedChunkSize
+      atom->nlocal = 1; // temporarily claim there is one atom...
+      int perAtomSize = avec->size_restart(); // ...so we can get its size
+      atom->nlocal = 0; // restore nlocal to zero atoms
+      int atomCt = (int) (assignedChunkSize / perAtomSize);
+      if (atomCt > atom->nmax) avec->grow(atomCt);
+    }
     m = 0;
     while (m < assignedChunkSize) m += avec->unpack_restart(&buf[m]);
   }
@@ -1010,6 +1016,7 @@ void ReadRestart::file_layout()
         // if the number of ranks that did the writing is different
 
         if (me == 0) {
+          int ndx;
           int *all_written_send_sizes;
           memory->create(all_written_send_sizes,nprocs_file,
                          "write_restart:all_written_send_sizes");
@@ -1019,30 +1026,61 @@ void ReadRestart::file_layout()
 
           fread(all_written_send_sizes,sizeof(int),nprocs_file,fp);
 
-          int init_chunk_number = nprocs_file/nprocs;
-          int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
+          if ((nprocs != nprocs_file) && !(atom->nextra_store)) {
+            // nprocs differ, but per-atom restart records are fixed length, so split the atoms evenly over nprocs
+            atom->nlocal = 1; // temporarily claim there is one atom...
+            int perAtomSize = atom->avec->size_restart(); // ...so we can get its size
+            atom->nlocal = 0; // restore nlocal to zero atoms
 
-          for (int i = 0; i < nprocs; i++) {
-            if (i < num_extra_chunks)
-              nproc_chunk_number[i] = init_chunk_number+1;
-            else
-              nproc_chunk_number[i] = init_chunk_number;
-          }
+            bigint total_size = 0;
+            for (int i = 0; i < nprocs_file; ++i) {
+              total_size += all_written_send_sizes[i];
+            }
+            bigint total_ct = total_size / perAtomSize;
 
-          int all_written_send_sizes_index = 0;
-          bigint current_offset = 0;
-          for (int i=0;i<nprocs;i++) {
-            nproc_chunk_offsets[i] = current_offset;
-            nproc_chunk_sizes[i] = 0;
-            for (int j=0;j<nproc_chunk_number[i];j++) {
-              nproc_chunk_sizes[i] +=
-                all_written_send_sizes[all_written_send_sizes_index];
-              current_offset +=
-                (all_written_send_sizes[all_written_send_sizes_index] *
-                 sizeof(double));
-              all_written_send_sizes_index++;
+            bigint base_ct = total_ct / nprocs;
+            bigint leftover_ct = total_ct  - (base_ct * nprocs);
+            bigint current_ByteOffset = 0;
+            base_ct += 1;
+            bigint base_ByteOffset = base_ct * (perAtomSize * sizeof(double));
+            for (ndx = 0; ndx < leftover_ct; ++ndx) {
+              nproc_chunk_offsets[ndx] = current_ByteOffset;
+              nproc_chunk_sizes[ndx] = base_ct * perAtomSize;
+              current_ByteOffset += base_ByteOffset;
+            }
+            base_ct -= 1;
+            base_ByteOffset -= (perAtomSize * sizeof(double));
+            for (; ndx < nprocs; ++ndx) {
+              nproc_chunk_offsets[ndx] = current_ByteOffset;
+              nproc_chunk_sizes[ndx] = base_ct * perAtomSize;
+              current_ByteOffset += base_ByteOffset;
+            }
+          } else { // we have to read in based on how it was written
+            int init_chunk_number = nprocs_file/nprocs;
+            int num_extra_chunks = nprocs_file - (nprocs*init_chunk_number);
+
+            for (int i = 0; i < nprocs; i++) {
+              if (i < num_extra_chunks)
+                nproc_chunk_number[i] = init_chunk_number+1;
+              else
+                nproc_chunk_number[i] = init_chunk_number;
             }
 
+            int all_written_send_sizes_index = 0;
+            bigint current_offset = 0;
+            for (int i=0;i<nprocs;i++) {
+              nproc_chunk_offsets[i] = current_offset;
+              nproc_chunk_sizes[i] = 0;
+              for (int j=0;j<nproc_chunk_number[i];j++) {
+                nproc_chunk_sizes[i] +=
+                  all_written_send_sizes[all_written_send_sizes_index];
+                current_offset +=
+                  (all_written_send_sizes[all_written_send_sizes_index] *
+                   sizeof(double));
+                all_written_send_sizes_index++;
+              }
+
+            }
           }
           memory->destroy(all_written_send_sizes);
           memory->destroy(nproc_chunk_number);