diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst index 0169826751..9dfb28fa8b 100644 --- a/doc/src/Commands_compute.rst +++ b/doc/src/Commands_compute.rst @@ -72,6 +72,7 @@ KOKKOS, o = OPENMP, t = OPT. * :doc:`gyration/shape/chunk ` * :doc:`heat/flux ` * :doc:`heat/flux/tally ` + * :doc:`heat/flux/virial/tally ` * :doc:`hexorder/atom ` * :doc:`hma ` * :doc:`improper ` diff --git a/doc/src/compute.rst b/doc/src/compute.rst index c2b2b92aa1..7a89c6cc87 100644 --- a/doc/src/compute.rst +++ b/doc/src/compute.rst @@ -208,7 +208,7 @@ The individual style names on the :doc:`Commands compute ` doc * :doc:`event/displace ` - detect event on atom displacement * :doc:`fabric ` - calculates fabric tensors from pair interactions * :doc:`fep ` - -* :doc:`force/tally ` - +* :doc:`force/tally ` - force between two groups of atoms via the tally callback mechanism * :doc:`fragment/atom ` - fragment ID for each atom * :doc:`global/atom ` - * :doc:`group/group ` - energy/force between two groups of atoms @@ -217,7 +217,8 @@ The individual style names on the :doc:`Commands compute ` doc * :doc:`gyration/shape ` - shape parameters from gyration tensor * :doc:`gyration/shape/chunk ` - shape parameters from gyration tensor for each chunk * :doc:`heat/flux ` - heat flux through a group of atoms -* :doc:`heat/flux/tally ` - +* :doc:`heat/flux/tally ` - heat flux through a group of atoms via the tally callback mechanism +* :doc:`heat/flux/virial/tally ` - virial heat flux between two groups via the tally callback mechanism * :doc:`hexorder/atom ` - bond orientational order parameter q6 * :doc:`hma ` - harmonically mapped averaging for atomic crystals * :doc:`improper ` - energy of each improper sub-style @@ -240,8 +241,8 @@ The individual style names on the :doc:`Commands compute ` doc * :doc:`pe ` - potential energy * :doc:`pe/atom ` - potential energy for each atom * :doc:`mesont ` - Nanotube bending, stretching, and intertube energies -* :doc:`pe/mol/tally ` - -* :doc:`pe/tally ` - +* :doc:`pe/mol/tally ` - potential energy between two groups of atoms separated into intermolecular and intramolecular components via the tally callback mechanism +* :doc:`pe/tally ` - potential energy between two groups of atoms via the tally callback mechanism * :doc:`plasticity/atom ` - Peridynamic plasticity for each atom * :doc:`pressure ` - total pressure and pressure tensor * :doc:`pressure/cylinder ` - pressure tensor in cylindrical coordinates @@ -289,7 +290,7 @@ The individual style names on the :doc:`Commands compute ` doc * :doc:`stress/atom ` - stress tensor for each atom * :doc:`stress/mop ` - normal components of the local stress tensor using the method of planes * :doc:`stress/mop/profile ` - profile of the normal components of the local stress tensor using the method of planes -* :doc:`stress/tally ` - +* :doc:`stress/tally ` - stress between two groups of atoms via the tally callback mechanism * :doc:`tdpd/cc/atom ` - per-atom chemical concentration of a specified species for each tDPD particle * :doc:`temp ` - temperature of group of atoms * :doc:`temp/asphere ` - temperature of aspherical particles diff --git a/doc/src/compute_tally.rst b/doc/src/compute_tally.rst index 32e3e31030..81756dda7c 100644 --- a/doc/src/compute_tally.rst +++ b/doc/src/compute_tally.rst @@ -1,5 +1,6 @@ .. index:: compute force/tally .. index:: compute heat/flux/tally +.. index:: compute heat/flux/virial/tally .. index:: compute pe/tally .. index:: compute pe/mol/tally ..
index:: compute stress/tally @@ -10,6 +11,9 @@ compute force/tally command compute heat/flux/tally command =============================== +compute heat/flux/virial/tally command +====================================== + compute pe/tally command ======================== @@ -27,7 +31,7 @@ Syntax compute ID group-ID style group2-ID * ID, group-ID are documented in :doc:`compute ` command -* style = *force/tally* or *pe/tally* or *pe/mol/tally* or *stress/tally* +* style = *force/tally* or *heat/flux/tally* or *heat/flux/virial/tally* or *pe/tally* or *pe/mol/tally* or *stress/tally* * group2-ID = group ID of second (or same) group Examples @@ -38,13 +42,17 @@ Examples compute 1 lower force/tally upper compute 1 left pe/tally right compute 1 lower stress/tally lower + compute 1 subregion heat/flux/tally all + compute 1 liquid heat/flux/virial/tally solid Description """"""""""" Define a computation that calculates properties between two groups of -atoms by accumulating them from pairwise non-bonded computations. The -two groups can be the same. This is similar to :doc:`compute group/group ` only that the data is +atoms by accumulating them from pairwise non-bonded computations. +Except for *heat/flux/virial/tally*, the two groups can be the same. +This is similar to :doc:`compute group/group `, +except that the data is accumulated directly during the non-bonded force computation. The computes *force/tally*\ , *pe/tally*\ , *stress/tally*\ , and *heat/flux/tally* are primarily provided as examples of how to program @@ -57,6 +65,76 @@ the base classes of LAMMPS. ---------- +Compute *heat/flux/tally* obtains the heat flux +(strictly speaking, heat flow) inside the first group, +which is the sum of the convective contribution +due to atoms in the first group and the virial contribution +due to interaction between the first and second groups: + +.. math:: + + \mathbf{Q} = \sum_{i \in \text{group 1}} e_i \mathbf{v}_i + \frac{1}{2} \sum_{i \in \text{group 1}} \sum_{\substack{j \in \text{group 2} \\ j \neq i}} \left( \mathbf{F}_{ij} \cdot \mathbf{v}_j \right) \mathbf{r}_{ij} + +When the second group in *heat/flux/tally* is set to "all", +the resulting values will be identical +to those obtained by :doc:`compute heat/flux `, +provided only pairwise interactions exist. + +Compute *heat/flux/virial/tally* obtains the total virial heat flux +(strictly speaking, heat flow) into the first group due to interaction +with the second group, which is defined as: + +.. math:: + + Q = \frac{1}{2} \sum_{i \in \text{group 1}} \sum_{j \in \text{group 2}} \mathbf{F}_{ij} \cdot \left(\mathbf{v}_i + \mathbf{v}_j \right) + +Although the *heat/flux/virial/tally* compute +does not include the convective term, +it can be used to obtain the total heat flux across control surfaces +when no particles cross them, +as is often the case at solid-solid and solid-liquid interfaces. +In that case it is identical to the method of planes. +Note that the *heat/flux/virial/tally* compute is distinctly different +from the *heat/flux* and *heat/flux/tally* computes, +which are essentially volume-averaging methods. +The following example demonstrates the difference: + +.. code-block:: LAMMPS + + # System with only pairwise interactions. + # Non-periodic boundaries in the x direction. + # Has LeftLiquid and RightWall groups along x direction.
+ + # Heat flux over the solid-liquid interface + compute hflow_hfvt LeftLiquid heat/flux/virial/tally RightWall + variable hflux_hfvt equal c_hflow_hfvt/(ly*lz) + + # x component of approximate heat flux vector inside the liquid region, + # two approaches. + # + compute myKE all ke/atom + compute myPE all pe/atom + compute myStress all stress/atom NULL virial + compute hflow_hf LeftLiquid heat/flux myKE myPE myStress + variable hflux_hf equal c_hflow_hf[1]/${volLiq} + # + compute hflow_hft LeftLiquid heat/flux/tally all + variable hflux_hft equal c_hflow_hft[1]/${volLiq} + + # Pressure over the solid-liquid interface, three approaches. + # + compute force_gg RightWall group/group LeftLiquid + variable press_gg equal c_force_gg[1]/(ly*lz) + # + compute force_ft RightWall force/tally LeftLiquid + compute rforce_ft RightWall reduce sum c_force_ft[1] + variable press_ft equal c_rforce_ft/(ly*lz) + # + compute rforce_hfvt all reduce sum c_hflow_hfvt[1] + variable press_hfvt equal -c_rforce_hfvt/(ly*lz) + +---------- + The pairwise contributions are computed via a callback that the compute registers with the non-bonded pairwise force computation. This limits the use to systems that have no bonds, no Kspace, and no @@ -83,7 +161,17 @@ magnitude) and a per atom 3-element vector (force contribution from each atom). Compute *stress/tally* calculates a global scalar (average of the diagonal elements of the stress tensor) and a per atom vector (the 6 elements of stress tensor contributions from the -individual atom). +individual atom). As in :doc:`compute heat/flux `, +compute *heat/flux/tally* calculates a global vector of length 6, +where the first 3 components are the :math:`x`, :math:`y`, :math:`z` +components of the full heat flow vector, +and the next 3 components are the corresponding components +of just the convective portion of the flow, i.e. the +first term in the equation for :math:`\mathbf{Q}`. +Compute *heat/flux/virial/tally* calculates a global scalar (heat flow) +and a per atom 3-element vector +(contribution to the force acting on atoms in the first group +from individual atoms in both groups). Both the scalar and vector values calculated by this compute are "extensive".
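All of the *\*/tally* styles documented above share one registration pattern, implemented in full for the new style in src/TALLY/compute_heat_flux_virial_tally.cpp later in this patch. The following minimal sketch shows only that pattern; the class name and the callback body are placeholders, not part of this patch, while the hook names (add_tally_callback, del_tally_callback, pair_tally_callback, peflag) come from the Compute and Pair base classes used by the sources below:

.. code-block:: c++

   #include "compute.h"
   #include "force.h"
   #include "pair.h"

   namespace LAMMPS_NS {

   // skeleton of a tally-style compute: it registers itself with the
   // pair style, which then calls pair_tally_callback() from
   // Pair::ev_tally() once per pairwise interaction
   class ComputeMyTally : public Compute {
    public:
     ComputeMyTally(class LAMMPS *lmp, int narg, char **arg) :
       Compute(lmp, narg, arg)
     {
       scalar_flag = 1;
       peflag = 1;    // request that Pair::ev_tally() be run
     }

     ~ComputeMyTally()
     {
       // deregister so the pair style holds no dangling callback
       if (force && force->pair) force->pair->del_tally_callback(this);
     }

     void init()
     {
       force->pair->add_tally_callback(this);
     }

     void pair_tally_callback(int i, int j, int nlocal, int newton,
                              double evdwl, double ecoul, double fpair,
                              double dx, double dy, double dz)
     {
       // accumulate the pairwise quantity of interest here
     }
   };

   }    // namespace LAMMPS_NS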
diff --git a/src/BOCS/fix_bocs.cpp b/src/BOCS/fix_bocs.cpp index d372d607e8..89ff3d8e4e 100644 --- a/src/BOCS/fix_bocs.cpp +++ b/src/BOCS/fix_bocs.cpp @@ -1100,7 +1100,12 @@ void FixBocs::initial_integrate_respa(int /*vflag*/, int ilevel, int /*iloop*/) nve_x(); if (pstat_flag) remap(); } +} +/* ---------------------------------------------------------------------- */ + +void FixBocs::pre_force_respa(int /*vflag*/, int ilevel, int /*iloop*/) +{ // if barostat, redo KSpace coeffs at outermost level, // since volume has changed diff --git a/src/BOCS/fix_bocs.h b/src/BOCS/fix_bocs.h index 4edf670fa5..69b32d4cd0 100644 --- a/src/BOCS/fix_bocs.h +++ b/src/BOCS/fix_bocs.h @@ -37,6 +37,7 @@ class FixBocs : public Fix { virtual void initial_integrate(int); virtual void final_integrate(); void initial_integrate_respa(int, int, int); + void pre_force_respa(int, int, int); void final_integrate_respa(int, int); virtual void pre_exchange(); double compute_scalar(); diff --git a/src/DRUDE/fix_tgnh_drude.cpp b/src/DRUDE/fix_tgnh_drude.cpp index 9f9bcada78..475a81e843 100644 --- a/src/DRUDE/fix_tgnh_drude.cpp +++ b/src/DRUDE/fix_tgnh_drude.cpp @@ -589,6 +589,7 @@ int FixTGNHDrude::setmask() mask |= INITIAL_INTEGRATE; mask |= FINAL_INTEGRATE; mask |= INITIAL_INTEGRATE_RESPA; + mask |= PRE_FORCE_RESPA; mask |= FINAL_INTEGRATE_RESPA; if (pre_exchange_flag) mask |= PRE_EXCHANGE; return mask; @@ -1023,7 +1024,12 @@ void FixTGNHDrude::initial_integrate_respa(int /*vflag*/, int ilevel, int /*iloo nve_x(); if (pstat_flag) remap(); } +} +/* ---------------------------------------------------------------------- */ + +void FixTGNHDrude::pre_force_respa(int /*vflag*/, int ilevel, int /*iloop*/) +{ // if barostat, redo KSpace coeffs at outermost level, // since volume has changed diff --git a/src/DRUDE/fix_tgnh_drude.h b/src/DRUDE/fix_tgnh_drude.h index 3549317655..617a99f856 100644 --- a/src/DRUDE/fix_tgnh_drude.h +++ b/src/DRUDE/fix_tgnh_drude.h @@ -28,6 +28,7 @@ class FixTGNHDrude : public Fix { virtual void setup(int); virtual void initial_integrate(int); virtual void final_integrate(); + void pre_force_respa(int, int, int); void initial_integrate_respa(int, int, int); void final_integrate_respa(int, int); virtual void pre_exchange(); diff --git a/src/INTEL/intel_intrinsics.h b/src/INTEL/intel_intrinsics.h index a17cc2859a..295310283d 100644 --- a/src/INTEL/intel_intrinsics.h +++ b/src/INTEL/intel_intrinsics.h @@ -85,7 +85,11 @@ struct vector_ops {}; // Intrinsic routines for IMCI and AVX-512 #if defined(__MIC__) || defined(__AVX512F__) // Integer vector class +#ifdef __INTEL_LLVM_COMPILER +#pragma pack(push,16) +#else #pragma pack(push,64) +#endif struct ivec32x16 { __m512i vec; ivec32x16() {} @@ -113,7 +117,7 @@ struct vector_ops { typedef double fscal; typedef F64vec8 fvec; typedef ivec32x16 ivec; - typedef __mmask bvec; + typedef __mmask16 bvec; typedef double farr[8] __attribute__((aligned(64))); typedef int iarr[16] __attribute__((aligned(64))); static fvec recip(const fvec &a) { return _mm512_recip_pd(a); } @@ -250,7 +254,7 @@ struct vector_ops { typedef float fscal; typedef F32vec16 fvec; typedef ivec32x16 ivec; - typedef __mmask bvec; + typedef __mmask16 bvec; typedef float farr[16] __attribute__((aligned(64))); typedef int iarr[16] __attribute__((aligned(64))); static const bvec full_mask = 0xFFFF; @@ -380,16 +384,18 @@ struct vector_ops { *r3 = gather<4>(*r3, mask, idxs, reinterpret_cast(base) + 12); } // Additional routines needed for the implementation of mixed precision - static fvec cvtdown(const 
vector_ops::fvec &lo, const vector_ops::fvec &hi) { + static fvec cvtdown(const vector_ops::fvec &lo, + const vector_ops::fvec &hi) { __m512 t1 = _mm512_cvtpd_pslo(lo); __m512 t2 = _mm512_cvtpd_pslo(hi); - return _mm512_mask_permute4f128_ps(t1, 0xFF00, t2, _MM_PERM_BADC); + return _mm512_mask_shuffle_f32x4(_mm512_undefined_ps(), 0xFF00, t2, t2, + 0x4E); } static vector_ops::fvec cvtup_lo(const fvec &a) { return _mm512_cvtpslo_pd(a); } static vector_ops::fvec cvtup_hi(const fvec &a) { - return _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a, _MM_PERM_BADC)); // permute DCBA -> BADC + return _mm512_cvtpslo_pd(_mm512_shuffle_f32x4(a, a, 0x4E)); } static void mask_cvtup(const bvec &a, vector_ops::bvec *blo, vector_ops::bvec *bhi) { *blo = a & 0xFF; @@ -1692,7 +1698,7 @@ struct vector_ops { typedef flt_t fscal; typedef flt_t fvec; typedef int ivec; - typedef bool bvec; + typedef int bvec; typedef flt_t farr[1]; typedef int iarr[1]; static fvec recip(const fvec &a) { diff --git a/src/INTEL/intel_intrinsics_airebo.h b/src/INTEL/intel_intrinsics_airebo.h index f49abbaf3d..ac58ca2438 100644 --- a/src/INTEL/intel_intrinsics_airebo.h +++ b/src/INTEL/intel_intrinsics_airebo.h @@ -651,12 +651,12 @@ class avec16pd { return a >> 8; } VEC_INLINE static __m512i get_ivec_hi(__m512i a) { - return _mm512_permute4f128_epi32(a, _MM_PERM_BADC); + return _mm512_shuffle_i32x4(a, a, 0x4E); } public: VEC_INLINE avec16pd(const FVEC_NAME &a) { lo_ = _mm512_cvtpslo_pd(a.val_); - hi_ = _mm512_cvtpslo_pd(_mm512_permute4f128_ps(a.val_, _MM_PERM_BADC)); + hi_ = _mm512_cvtpslo_pd(_mm512_shuffle_f32x4(a.val_, a.val_, 0x4E)); } VEC_INLINE static avec16pd undefined() { return avec16pd(_mm512_undefined_pd(), _mm512_undefined_pd()); diff --git a/src/INTEL/intel_preprocess.h b/src/INTEL/intel_preprocess.h index 6f9b37a700..0bec9935db 100644 --- a/src/INTEL/intel_preprocess.h +++ b/src/INTEL/intel_preprocess.h @@ -16,6 +16,11 @@ Contributing author: W. 
Michael Brown (Intel) ------------------------------------------------------------------------- */ +#ifdef __INTEL_LLVM_COMPILER +#define __INTEL_COMPILER __INTEL_LLVM_COMPILER +#define __INTEL_COMPILER_BUILD_DATE __INTEL_LLVM_COMPILER +#endif + #ifdef __INTEL_COMPILER #define LMP_SIMD_COMPILER #if (__INTEL_COMPILER_BUILD_DATE > 20160720) diff --git a/src/INTEL/pair_airebo_intel.cpp b/src/INTEL/pair_airebo_intel.cpp index 94874153aa..5ea0d3168b 100644 --- a/src/INTEL/pair_airebo_intel.cpp +++ b/src/INTEL/pair_airebo_intel.cpp @@ -2332,7 +2332,7 @@ static void aut_rebo_neigh(KernelArgsAIREBOT * ka) { int n_skin = 0; int lowest_idx; - #pragma unroll(4) + //#pragma unroll(4) for (lowest_idx = 0; lowest_idx < jnum; lowest_idx += fvec::VL) { bvec j_mask = bvec::full(); if (lowest_idx + fvec::VL > jnum) j_mask = bvec::only(jnum - lowest_idx); diff --git a/src/INTEL/pair_dpd_intel.cpp b/src/INTEL/pair_dpd_intel.cpp index 089396afa3..e7514a1f95 100644 --- a/src/INTEL/pair_dpd_intel.cpp +++ b/src/INTEL/pair_dpd_intel.cpp @@ -289,7 +289,7 @@ void PairDPDIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) #endif diff --git a/src/INTEL/pair_eam_intel.cpp b/src/INTEL/pair_eam_intel.cpp index a86040b6b3..dcff8957fd 100644 --- a/src/INTEL/pair_eam_intel.cpp +++ b/src/INTEL/pair_eam_intel.cpp @@ -308,7 +308,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, acc_t rhoi = (acc_t)0.0; int ej = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { @@ -327,7 +327,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma simd reduction(+:rhoi) #endif for (int jj = 0; jj < ej; jj++) { @@ -414,7 +414,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, if (EFLAG) tevdwl = (acc_t)0.0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma simd reduction(+:tevdwl) #endif for (int ii = iifrom; ii < iito; ++ii) { @@ -488,7 +488,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, int ej = 0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma ivdep #endif for (int jj = 0; jj < jnum; jj++) { @@ -510,7 +510,7 @@ void PairEAMIntel::eval(const int offload, const int vflag, } #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) #endif diff --git a/src/INTEL/pair_lj_cut_intel.cpp b/src/INTEL/pair_lj_cut_intel.cpp index eb60e0442f..84bc664e18 100644 --- a/src/INTEL/pair_lj_cut_intel.cpp +++ b/src/INTEL/pair_lj_cut_intel.cpp @@ -241,7 +241,7 @@ void PairLJCutIntel::eval(const int offload, const int vflag, if (vflag == VIRIAL_PAIR) sv0 = sv1 = sv2 = sv3 = sv4 = sv5 = (acc_t)0; #if defined(LMP_SIMD_COMPILER) - #pragma vector aligned nog2s + #pragma vector aligned #pragma simd reduction(+:fxtmp, fytmp, fztmp, fwtmp, sevdwl, \ sv0, sv1, sv2, sv3, sv4, sv5) #endif diff --git a/src/INTEL/pair_tersoff_intel.cpp b/src/INTEL/pair_tersoff_intel.cpp index 248c3420e6..d4b8f7d499 100644 --- a/src/INTEL/pair_tersoff_intel.cpp +++ b/src/INTEL/pair_tersoff_intel.cpp @@ -349,11 +349,11 @@ void PairTersoffIntel::eval(const int 
offload, const int vflag, lmp_intel::vector_traits::support_integer_and_gather_ops; bool use_scalar = VL < 4; if (use_scalar) { - IntelKernelTersoff::kernel(ARGS); + IntelKernelTersoff::template kernel(ARGS); } else if (pack_i) { - IntelKernelTersoff::kernel(ARGS); + IntelKernelTersoff::template kernel(ARGS); } else { - IntelKernelTersoff::kernel(ARGS); + IntelKernelTersoff::template kernel(ARGS); } if (EFLAG) oevdwl += sevdwl; } @@ -673,7 +673,8 @@ void IntelKernelTersoff::kernel_step( fvec vrijsq = vdx_ij * vdx_ij + vdy_ij * vdy_ij + vdz_ij * vdz_ij; fvec vrij = sqrt(vrijsq); ivec vis_orig = v::int_load_vl(is); - ivec vnumneigh_i = v::int_gather<4>(v_i0, vmask, vis_orig, numneigh); + ivec vnumneigh_i = v::template int_gather<4>(v_i0, vmask, vis_orig, + numneigh); ivec vc_idx_ij = v::int_mullo(v_i4floats, vw_j + v::int_mullo(v_i_ntypes, vw_i)); fvec vzeta = v::zero(); @@ -700,14 +701,16 @@ void IntelKernelTersoff::kernel_step( while (! v::mask_testz(vactive_mask) && cache_idx < N_CACHE) { bvec vnew_mask = vactive_mask & ~ veff_old_mask; vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & - v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh)); + (v::template int_gather<4>(vks, vactive_mask, + vkks + vcnumneigh_i, + firstneigh))); v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k); fvec vdx_ik = (vx_k - vx_i); fvec vdy_ik = (vy_k - vy_i); fvec vdz_ik = (vz_k - vz_i); fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik; ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij); - vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner); + vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner); bvec vcutoff_mask = v::cmplt(vrsq, vcutsq); bvec vsame_mask = v::int_cmpneq(vjs, vks); bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask; @@ -751,14 +754,16 @@ void IntelKernelTersoff::kernel_step( while (! v::mask_testz(vactive_mask)) { bvec vnew_mask = vactive_mask & ~ veff_old_mask; vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & - v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh)); + (v::template int_gather<4>(vks, vactive_mask, + vkks + vcnumneigh_i, + firstneigh))); v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k); fvec vdx_ik = (vx_k - vx_i); fvec vdy_ik = (vy_k - vy_i); fvec vdz_ik = (vz_k - vz_i); fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik; ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij); - vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner); + vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner); bvec vcutoff_mask = v::cmplt(vrsq, vcutsq); bvec vsame_mask = v::int_cmpneq(vjs, vks); bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask; @@ -818,14 +823,16 @@ void IntelKernelTersoff::kernel_step( while (! 
v::mask_testz(vactive_mask)) { bvec vnew_mask = vactive_mask & ~ veff_old_mask; vks = v::int_mullo(v_i4floats, v_i_NEIGHMASK & - v::int_gather<4>(vks, vactive_mask, vkks + vcnumneigh_i, firstneigh)); + (v::template int_gather<4>(vks, vactive_mask, + vkks + vcnumneigh_i, + firstneigh))); v::gather_x(vks, vnew_mask, x, &vx_k, &vy_k, &vz_k, &vw_k); fvec vdx_ik = vx_k - vx_i; fvec vdy_ik = vy_k - vy_i; fvec vdz_ik = vz_k - vz_i; fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik; ivec vc_idx = v::int_mullo(v_i4floats, vw_k) + v::int_mullo(v_i_ntypes, vc_idx_ij); - vcutsq = v::gather<4>(vcutsq, vnew_mask, vc_idx, c_inner); + vcutsq = v::template gather<4>(vcutsq, vnew_mask, vc_idx, c_inner); bvec vcutoff_mask = v::cmplt(vrsq, vcutsq); bvec vsame_mask = v::int_cmpneq(vjs, vks); bvec veff_mask = vcutoff_mask & vsame_mask & vactive_mask; @@ -973,7 +980,7 @@ void IntelKernelTersoff::kernel_step_const_i( fvec vdy_ik = vy_k - vy_i; fvec vdz_ik = vz_k - vz_i; fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik; - fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]); + fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]); bvec vcutoff_mask = v::cmplt(vrsq, vcutsq); bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k))); bvec veff_mask = vcutoff_mask & vsame_mask & vmask; @@ -1017,7 +1024,7 @@ void IntelKernelTersoff::kernel_step_const_i( fvec vdy_ik = vy_k - vy_i; fvec vdz_ik = vz_k - vz_i; fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik; - fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]); + fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k]); bvec vcutoff_mask = v::cmplt(vrsq, vcutsq); bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k))); bvec veff_mask = vcutoff_mask & vsame_mask & vmask; @@ -1064,7 +1071,7 @@ void IntelKernelTersoff::kernel_step_const_i( fvec vdy_ik = vy_k - vy_i; fvec vdz_ik = vz_k - vz_i; fvec vrsq = vdx_ik * vdx_ik + vdy_ik * vdy_ik + vdz_ik * vdz_ik; - fvec vcutsq = v::gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq); + fvec vcutsq = v::template gather<4>(v::zero(), vmask, vc_idx_j_ntypes, &c_inner[ntypes * ntypes * w_i + w_k].cutsq); bvec vcutoff_mask = v::cmplt(vrsq, vcutsq); bvec vsame_mask = v::int_cmpneq(vjs, ivec(static_cast(4 * sizeof(typename v::fscal) * k))); bvec veff_mask = vcutoff_mask & vsame_mask & vmask; @@ -1210,7 +1217,7 @@ void IntelKernelTersoff::kernel( template -IntelKernelTersoff::fvec IntelKernelTersoff::zeta_vector( +typename IntelKernelTersoff::fvec IntelKernelTersoff::zeta_vector( const c_inner_t * param, ivec xjw, bvec mask, fvec vrij, fvec rsq2, @@ -1336,6 +1343,8 @@ void IntelKernelTersoff::force_zeta_vector( } } +#define BCF lmp_intel::vector_routines + template template void IntelKernelTersoff::attractive_vector( @@ -1375,7 +1384,7 @@ void IntelKernelTersoff::attractive_vector( fvec varg3 = varg1 * varg1 * varg1; bvec mask_ex = v::cmpeq(vppowermint, fvec(3.)); fvec varg = v::blend(mask_ex, varg1, varg3); - fvec vex_delr = min(fvec(1.e30), exp(varg)); + fvec vex_delr = BCF::min(fvec(1.e30), exp(varg)); fvec vex_delr_d_factor = v::blend(mask_ex, v_1_0, fvec(3.0) * varg1 * varg1); fvec vex_delr_d = vplam3 * vex_delr_d_factor * vex_delr; bvec vmask_need_sine = v::cmpnle(vrik, vpbigr - 
vpbigd) & mask; @@ -1395,12 +1404,12 @@ void IntelKernelTersoff::attractive_vector( if (ZETA) *zeta = vfc * vgijk * vex_delr; fvec vminus_costheta = - vcostheta; - fvec vdcosdrjx = vrijinv * fmadd(vminus_costheta, vrij_hatx, rik_hatx); - fvec vdcosdrjy = vrijinv * fmadd(vminus_costheta, vrij_haty, rik_haty); - fvec vdcosdrjz = vrijinv * fmadd(vminus_costheta, vrij_hatz, rik_hatz); - fvec vdcosdrkx = rikinv * fmadd(vminus_costheta, rik_hatx, vrij_hatx); - fvec vdcosdrky = rikinv * fmadd(vminus_costheta, rik_haty, vrij_haty); - fvec vdcosdrkz = rikinv * fmadd(vminus_costheta, rik_hatz, vrij_hatz); + fvec vdcosdrjx = vrijinv * BCF::fmadd(vminus_costheta, vrij_hatx, rik_hatx); + fvec vdcosdrjy = vrijinv * BCF::fmadd(vminus_costheta, vrij_haty, rik_haty); + fvec vdcosdrjz = vrijinv * BCF::fmadd(vminus_costheta, vrij_hatz, rik_hatz); + fvec vdcosdrkx = rikinv * BCF::fmadd(vminus_costheta, rik_hatx, vrij_hatx); + fvec vdcosdrky = rikinv * BCF::fmadd(vminus_costheta, rik_haty, vrij_haty); + fvec vdcosdrkz = rikinv * BCF::fmadd(vminus_costheta, rik_hatz, vrij_hatz); fvec vdcosdrix = -(vdcosdrjx + vdcosdrkx); fvec vdcosdriy = -(vdcosdrjy + vdcosdrky); fvec vdcosdriz = -(vdcosdrjz + vdcosdrkz); diff --git a/src/INTEL/pair_tersoff_intel.h b/src/INTEL/pair_tersoff_intel.h index 0a40844622..b40ce19787 100644 --- a/src/INTEL/pair_tersoff_intel.h +++ b/src/INTEL/pair_tersoff_intel.h @@ -24,12 +24,12 @@ PairStyle(tersoff/intel,PairTersoffIntel); #ifndef LMP_PAIR_TERSOFF_INTEL_H #define LMP_PAIR_TERSOFF_INTEL_H -#ifdef __INTEL_COMPILER - #include "pair.h" #include "fix_intel.h" #include "pair_tersoff.h" +#ifdef __INTEL_COMPILER + namespace LAMMPS_NS { class PairTersoffIntel : public PairTersoff { diff --git a/src/TALLY/compute_heat_flux_virial_tally.cpp b/src/TALLY/compute_heat_flux_virial_tally.cpp new file mode 100644 index 0000000000..1a594c1b36 --- /dev/null +++ b/src/TALLY/compute_heat_flux_virial_tally.cpp @@ -0,0 +1,238 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. 
+------------------------------------------------------------------------- */ + +#include "compute_heat_flux_virial_tally.h" + +#include "atom.h" +#include "comm.h" +#include "error.h" +#include "force.h" +#include "group.h" +#include "memory.h" +#include "pair.h" +#include "update.h" + +#include + +using namespace LAMMPS_NS; + +/* ---------------------------------------------------------------------- */ + +ComputeHeatFluxVirialTally::ComputeHeatFluxVirialTally(LAMMPS *lmp, int narg, char **arg) : + Compute(lmp, narg, arg) +{ + if (narg < 4) error->all(FLERR, "Illegal compute heat/flux/virial/tally command"); + + igroup2 = group->find(arg[3]); + if (igroup2 == -1) + error->all(FLERR, "Could not find compute heat/flux/virial/tally second group ID"); + groupbit2 = group->bitmask[igroup2]; + + scalar_flag = 1; + vector_flag = 0; + peratom_flag = 1; + timeflag = 1; + dynamic_group_allow = 0; + + comm_reverse = size_peratom_cols = 3; + extscalar = 1; + peflag = 1; // we need Pair::ev_tally() to be run + + did_setup = invoked_peratom = invoked_scalar = -1; + nmax = -1; + fatom = nullptr; +} + +/* ---------------------------------------------------------------------- */ + +ComputeHeatFluxVirialTally::~ComputeHeatFluxVirialTally() +{ + if (force && force->pair) force->pair->del_tally_callback(this); + memory->destroy(fatom); +} + +/* ---------------------------------------------------------------------- */ + +void ComputeHeatFluxVirialTally::init() +{ + if (force->pair == nullptr) + error->all(FLERR, "Trying to use compute heat/flux/virial/tally without pair style"); + else + force->pair->add_tally_callback(this); + + if (comm->me == 0) { + if (force->pair->single_enable == 0 || force->pair->manybody_flag) + error->warning(FLERR, "Compute heat/flux/virial/tally used with incompatible pair style"); + + if (force->bond || force->angle || force->dihedral || force->improper || force->kspace) + error->warning(FLERR, "Compute heat/flux/virial/tally only called from pair style"); + } + + // error out if any atoms are in both groups + for (int i = 0; i < atom->nlocal; i++) { + if ((atom->mask[i] & groupbit) && (atom->mask[i] & groupbit2)) { + if (atom->tag_enable) { + error->all(FLERR, + "Atom {} belonging to both groups is not allowed" + " with compute heat/flux/virial/tally", + atom->tag[i]); + } else { + error->all(FLERR, + "Atom belonging to both groups is not allowed" + " with compute heat/flux/virial/tally"); + } + } + } + + did_setup = -1; +} + +/* ---------------------------------------------------------------------- */ + +void ComputeHeatFluxVirialTally::pair_setup_callback(int, int) +{ + // run setup only once per time step. 
+ // we may be called from multiple pair styles + + if (did_setup == update->ntimestep) return; + + const int ntotal = atom->nlocal + atom->nghost; + + // grow per-atom storage, if needed + + if (atom->nmax > nmax) { + memory->destroy(fatom); + nmax = atom->nmax; + memory->create(fatom, nmax, size_peratom_cols, "heat/flux/virial/tally:fatom"); + array_atom = fatom; + } + + // clear storage + + for (int i = 0; i < ntotal; ++i) + for (int j = 0; j < size_peratom_cols; ++j) fatom[i][j] = 0.0; + + did_setup = update->ntimestep; +} + +/* ---------------------------------------------------------------------- */ +void ComputeHeatFluxVirialTally::pair_tally_callback(int i, int j, int nlocal, int newton, double, + double, double fpair, double dx, double dy, + double dz) +{ + const int *const mask = atom->mask; + + const bool ingroup1 = (mask[i] & groupbit); + if ((ingroup1 && (mask[j] & groupbit2)) || ((mask[i] & groupbit2) && (mask[j] & groupbit))) { + + // signs set to calculate heat flux from group2 into group1 + const double fx = (ingroup1 ? 0.5 : -0.5) * dx * fpair; + const double fy = (ingroup1 ? 0.5 : -0.5) * dy * fpair; + const double fz = (ingroup1 ? 0.5 : -0.5) * dz * fpair; + + if (newton || i < nlocal) { + fatom[i][0] += fx; + fatom[i][1] += fy; + fatom[i][2] += fz; + } + if (newton || j < nlocal) { + fatom[j][0] += fx; + fatom[j][1] += fy; + fatom[j][2] += fz; + } + } +} + +/* ---------------------------------------------------------------------- */ + +int ComputeHeatFluxVirialTally::pack_reverse_comm(int n, int first, double *buf) +{ + int i, m, last; + + m = 0; + last = first + n; + for (i = first; i < last; i++) { + buf[m++] = fatom[i][0]; + buf[m++] = fatom[i][1]; + buf[m++] = fatom[i][2]; + } + return m; +} + +/* ---------------------------------------------------------------------- */ + +void ComputeHeatFluxVirialTally::unpack_reverse_comm(int n, int *list, double *buf) +{ + int i, j, m; + + m = 0; + for (i = 0; i < n; i++) { + j = list[i]; + fatom[j][0] += buf[m++]; + fatom[j][1] += buf[m++]; + fatom[j][2] += buf[m++]; + } +} + +/* ---------------------------------------------------------------------- */ + +double ComputeHeatFluxVirialTally::compute_scalar() +{ + // need to collect contributions from ghost atoms + // because we need to use local velocities to compute heat flux + if (invoked_peratom != update->ntimestep) compute_peratom(); + + invoked_scalar = update->ntimestep; + if ((did_setup != invoked_scalar) || (update->eflag_global != invoked_scalar)) + error->all(FLERR, "Energy was not tallied on needed timestep"); + + // sum heat flux across procs + double hflux = 0.0; + for (int i = 0; i < atom->nlocal; i++) + hflux += + fatom[i][0] * atom->v[i][0] + fatom[i][1] * atom->v[i][1] + fatom[i][2] * atom->v[i][2]; + + MPI_Allreduce(&hflux, &scalar, 1, MPI_DOUBLE, MPI_SUM, world); + + return scalar; +} + +/* ---------------------------------------------------------------------- */ + +void ComputeHeatFluxVirialTally::compute_peratom() +{ + invoked_peratom = update->ntimestep; + if ((did_setup != invoked_peratom) || (update->eflag_global != invoked_peratom)) + error->all(FLERR, "Energy was not tallied on needed timestep"); + + // collect contributions from ghost atoms + + if (force->newton_pair) { + comm->reverse_comm_compute(this); + + // clear out ghost atom data after it has been collected to local atoms + const int nall = atom->nlocal + atom->nghost; + for (int i = atom->nlocal; i < nall; ++i) + for (int j = 0; j < size_peratom_cols; ++j) fatom[i][j] = 0.0; + } +} + +/* 
---------------------------------------------------------------------- + memory usage of local atom-based array +------------------------------------------------------------------------- */ + +double ComputeHeatFluxVirialTally::memory_usage() +{ + double bytes = (nmax < 0) ? 0 : nmax * size_peratom_cols * sizeof(double); + return bytes; +} diff --git a/src/TALLY/compute_heat_flux_virial_tally.h b/src/TALLY/compute_heat_flux_virial_tally.h new file mode 100644 index 0000000000..d2a3d39a7d --- /dev/null +++ b/src/TALLY/compute_heat_flux_virial_tally.h @@ -0,0 +1,64 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + https://www.lammps.org/, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef COMPUTE_CLASS +// clang-format off +ComputeStyle(heat/flux/virial/tally,ComputeHeatFluxVirialTally); +// clang-format on +#else + +#ifndef LMP_COMPUTE_HEAT_FLUX_VIRIAL_TALLY_H +#define LMP_COMPUTE_HEAT_FLUX_VIRIAL_TALLY_H + +#include "compute.h" + +namespace LAMMPS_NS { + +class ComputeHeatFluxVirialTally : public Compute { + + public: + ComputeHeatFluxVirialTally(class LAMMPS *, int, char **); + virtual ~ComputeHeatFluxVirialTally(); + + void init(); + + double compute_scalar(); + void compute_peratom(); + + int pack_reverse_comm(int, int, double *); + void unpack_reverse_comm(int, int *, double *); + double memory_usage(); + + void pair_setup_callback(int, int); + void pair_tally_callback(int, int, int, int, double, double, double, double, double, double); + + private: + bigint did_setup; + int nmax, igroup2, groupbit2; + double **fatom; +}; + +} // namespace LAMMPS_NS + +#endif +#endif + +/* ERROR/WARNING messages: + +E: Illegal ... command + +Self-explanatory. Check the input script syntax and compare to the +documentation for the command. You can use -echo screen as a +command-line option when running LAMMPS to see the offending line. 
+ +*/ diff --git a/src/USER-MISC/fix_npt_cauchy.cpp b/src/USER-MISC/fix_npt_cauchy.cpp index 2390872631..e58632635f 100644 --- a/src/USER-MISC/fix_npt_cauchy.cpp +++ b/src/USER-MISC/fix_npt_cauchy.cpp @@ -671,6 +671,7 @@ int FixNPTCauchy::setmask() mask |= INITIAL_INTEGRATE; mask |= FINAL_INTEGRATE; mask |= INITIAL_INTEGRATE_RESPA; + mask |= PRE_FORCE_RESPA; mask |= FINAL_INTEGRATE_RESPA; if (pre_exchange_flag) mask |= PRE_EXCHANGE; return mask; @@ -1032,7 +1033,12 @@ void FixNPTCauchy::initial_integrate_respa(int /*vflag*/, int ilevel, int /*iloo nve_x(); if (pstat_flag) remap(); } +} +/* ---------------------------------------------------------------------- */ + +void FixNPTCauchy::pre_force_respa(int /*vflag*/, int ilevel, int /*iloop*/) +{ // if barostat, redo KSpace coeffs at outermost level, // since volume has changed diff --git a/src/USER-MISC/fix_npt_cauchy.h b/src/USER-MISC/fix_npt_cauchy.h index 1348b1bce4..80122ad0ac 100644 --- a/src/USER-MISC/fix_npt_cauchy.h +++ b/src/USER-MISC/fix_npt_cauchy.h @@ -35,6 +35,7 @@ class FixNPTCauchy : public Fix { virtual void initial_integrate(int); virtual void final_integrate(); void initial_integrate_respa(int, int, int); + void pre_force_respa(int, int, int); void final_integrate_respa(int, int); virtual void pre_exchange(); double compute_scalar(); diff --git a/src/fix_nh.cpp b/src/fix_nh.cpp index 40940977a6..c906000ec5 100644 --- a/src/fix_nh.cpp +++ b/src/fix_nh.cpp @@ -635,6 +635,7 @@ int FixNH::setmask() mask |= FINAL_INTEGRATE; mask |= INITIAL_INTEGRATE_RESPA; mask |= FINAL_INTEGRATE_RESPA; + mask |= PRE_FORCE_RESPA; if (pre_exchange_flag) mask |= PRE_EXCHANGE; return mask; } @@ -1006,7 +1007,12 @@ void FixNH::initial_integrate_respa(int /*vflag*/, int ilevel, int /*iloop*/) nve_x(); if (pstat_flag) remap(); } +} +/* ---------------------------------------------------------------------- */ + +void FixNH::pre_force_respa(int /*vflag*/, int ilevel, int /*iloop*/) +{ // if barostat, redo KSpace coeffs at outermost level, // since volume has changed diff --git a/src/fix_nh.h b/src/fix_nh.h index ef541a0e05..1ec6bec2f8 100644 --- a/src/fix_nh.h +++ b/src/fix_nh.h @@ -28,6 +28,7 @@ class FixNH : public Fix { virtual void initial_integrate(int); virtual void final_integrate(); void initial_integrate_respa(int, int, int); + void pre_force_respa(int, int, int); void final_integrate_respa(int, int); virtual void pre_exchange(); double compute_scalar(); diff --git a/src/lmptype.h b/src/lmptype.h index 6a7a7ef1b9..871bf5ff6c 100644 --- a/src/lmptype.h +++ b/src/lmptype.h @@ -242,9 +242,9 @@ union ubuf { // define stack variable alignment -#if defined(__INTEL_LLVM_COMPILER) || defined(__INTEL_COMPILER) +#if defined(__INTEL_COMPILER) #define _alignvar(expr, val) __declspec(align(val)) expr -#elif defined(__GNUC__) || defined(__PGI) +#elif defined(__GNUC__) || defined(__PGI) || defined(__INTEL_LLVM_COMPILER) #define _alignvar(expr, val) expr __attribute((aligned(val))) #else #define _alignvar(expr, val) expr @@ -266,7 +266,7 @@ union ubuf { #if defined(__clang__) #define _noopt __attribute__((optnone)) -#elif defined(__INTEL_COMPILER) +#elif defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER) #define _noopt #elif defined(__PGI) #define _noopt diff --git a/src/memory.cpp b/src/memory.cpp index 3e67ed1496..8f7faad545 100644 --- a/src/memory.cpp +++ b/src/memory.cpp @@ -16,7 +16,8 @@ #include "error.h" -#if defined(LMP_USER_INTEL) && defined(__INTEL_COMPILER) +#if defined(LMP_USER_INTEL) && \ + ((defined(__INTEL_COMPILER) || 
defined(__INTEL_LLVM_COMPILER))) #ifndef LMP_INTEL_NO_TBB #define LMP_USE_TBB_ALLOCATOR #include "tbb/scalable_allocator.h" @@ -81,7 +82,7 @@ void *Memory::srealloc(void *ptr, bigint nbytes, const char *name) #if defined(LMP_USE_TBB_ALLOCATOR) ptr = scalable_aligned_realloc(ptr, nbytes, LAMMPS_MEMALIGN); #elif defined(LMP_INTEL_NO_TBB) && defined(LAMMPS_MEMALIGN) && \ - defined(__INTEL_COMPILER) + (defined(__INTEL_COMPILER) || defined(__INTEL_LLVM_COMPILER)) ptr = realloc(ptr, nbytes); uintptr_t offset = ((uintptr_t)(const void *)(ptr)) % LAMMPS_MEMALIGN;
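The preprocessor changes in intel_preprocess.h, lmptype.h, and memory.cpp all hinge on which identification macros each compiler family defines: the LLVM-based oneAPI compilers (icx/icpx) define __INTEL_LLVM_COMPILER and, being clang-derived, also __clang__ and __GNUC__, while classic icc defines __INTEL_COMPILER. A standalone sketch (illustrative only, not part of this patch) of why the LLVM macro must be tested before the generic clang/gcc branches:

.. code-block:: c++

   #include <cstdio>

   int main()
   {
   // icx also defines __clang__ and __GNUC__, so test
   // __INTEL_LLVM_COMPILER first; otherwise an icx build would be
   // misidentified as plain clang or gcc
   #if defined(__INTEL_LLVM_COMPILER)
     std::printf("Intel oneAPI (LLVM) compiler: %d\n", __INTEL_LLVM_COMPILER);
   #elif defined(__INTEL_COMPILER)
     std::printf("Intel classic compiler: %d\n", __INTEL_COMPILER);
   #elif defined(__clang__)
     std::printf("Clang\n");
   #elif defined(__GNUC__)
     std::printf("GCC\n");
   #else
     std::printf("unknown compiler\n");
   #endif
     return 0;
   }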