Merge branch 'develop' of github.com:lammps/lammps into comm_tiled

2024-02-27 14:15:58 -07:00
parent 86b7560740 4d89741d8c
commit 7593f0df22
31 changed files with 1581 additions and 123 deletions
--- a/doc/src/Bibliography.rst
+++ b/doc/src/Bibliography.rst
@ -877,6 +877,9 @@ Bibliography
 **(PLUMED)**
   G.A. Tribello, M. Bonomi, D. Branduardi, C. Camilloni and G. Bussi, Comp. Phys. Comm 185, 604 (2014)

+**(Pavlov)**
+D Pavlov, V Galigerov, D Kolotinskii, V Nikolskiy, V Stegailov, International Journal of High Performance Computing Applications, 38, 34-49 (2024).
+
 **(Paquay)**
   Paquay and Kusters, Biophys. J., 110, 6, (2016). preprint available at `arXiv:1411.3019 <https://arxiv.org/abs/1411.3019/>`_.

--- a/doc/src/Commands_fix.rst
+++ b/doc/src/Commands_fix.rst
@ -263,6 +263,7 @@ OPT.
   * :doc:`wall/body/polyhedron <fix_wall_body_polyhedron>`
   * :doc:`wall/colloid <fix_wall>`
   * :doc:`wall/ees <fix_wall_ees>`
+   * :doc:`wall/flow (k) <fix_wall_flow>`
   * :doc:`wall/gran (k) <fix_wall_gran>`
   * :doc:`wall/gran/region <fix_wall_gran_region>`
   * :doc:`wall/harmonic <fix_wall>`
--- a/doc/src/Howto_granular.rst
+++ b/doc/src/Howto_granular.rst
@ -45,10 +45,15 @@ atoms, and should be used for granular system instead of the fix style

 To model heat conduction, one must add the temperature and heatflow
 atom variables with:
+
 * :doc:`fix property/atom <fix_property_atom>`
+
 a temperature integration fix
+
 * :doc:`fix heat/flow <fix_heat_flow>`
+
 and a heat conduction option defined in both
+
 * :doc:`pair_style granular <pair_granular>`
 * :doc:`fix wall/gran <fix_wall_gran>`

--- a/doc/src/compute_fabric.rst
+++ b/doc/src/compute_fabric.rst
@ -64,7 +64,7 @@ tangential force tensor. The contact tensor is calculated as

 .. math::

-   C_{ab}  =  \frac{15}{2} (\phi_{ab} - \mathrm{Tr}(\phi) \delta_{ab})
+   C_{ab}  =  \frac{15}{2} (\phi_{ab} - \frac{1}{3} \mathrm{Tr}(\phi) \delta_{ab})

 where :math:`a` and :math:`b` are the :math:`x`, :math:`y`, :math:`z`
 directions, :math:`\delta_{ab}` is the Kronecker delta function, and
@ -83,7 +83,7 @@ The branch tensor is calculated as

 .. math::

-   B_{ab}  =  \frac{15}{6 \mathrm{Tr}(D)} (D_{ab} - \mathrm{Tr}(D) \delta_{ab})
+   B_{ab}  =  \frac{15}{2\, \mathrm{Tr}(D)} (D_{ab} - \frac{1}{3} \mathrm{Tr}(D) \delta_{ab})

 where the tensor :math:`D` is defined as

@ -101,7 +101,7 @@ The normal force fabric tensor is calculated as

 .. math::

-   F^n_{ab}  =  \frac{15}{6\, \mathrm{Tr}(N)} (N_{ab} - \mathrm{Tr}(N) \delta_{ab})
+   F^n_{ab}  =  \frac{15}{2\, \mathrm{Tr}(N)} (N_{ab} - \frac{1}{3} \mathrm{Tr}(N) \delta_{ab})

 where the tensor :math:`N` is defined as

@ -119,7 +119,7 @@ as

 .. math::

-   F^t_{ab}  =  \frac{15}{9\, \mathrm{Tr}(N)} (T_{ab} - \mathrm{Tr}(T) \delta_{ab})
+   F^t_{ab}  =  \frac{5}{\mathrm{Tr}(N)} (T_{ab} - \frac{1}{3} \mathrm{Tr}(T) \delta_{ab})

 where the tensor :math:`T` is defined as

--- a/doc/src/compute_property_atom.rst
+++ b/doc/src/compute_property_atom.rst
@ -23,8 +23,9 @@ Syntax
                             spx, spy, spz, sp, fmx, fmy, fmz,
                             nbonds,
                             radius, diameter, omegax, omegay, omegaz,
+                             temperature, heatflow,
                             angmomx, angmomy, angmomz,
-                             shapex,shapey, shapez,
+                             shapex, shapey, shapez,
                             quatw, quati, quatj, quatk, tqx, tqy, tqz,
                             end1x, end1y, end1z, end2x, end2y, end2z,
                             corner1x, corner1y, corner1z,
@ -56,6 +57,8 @@ Syntax
           *nbonds* = number of bonds assigned to an atom
           *radius,diameter* = radius,diameter of spherical particle
           *omegax,omegay,omegaz* = angular velocity of spherical particle
+           *temperature* = internal temperature of spherical particle
+           *heatflow* = internal heat flow of spherical particle
           *angmomx,angmomy,angmomz* = angular momentum of aspherical particle
           *shapex,shapey,shapez* = 3 diameters of aspherical particle
           *quatw,quati,quatj,quatk* = quaternion components for aspherical or body particles
--- a/doc/src/dump.rst
+++ b/doc/src/dump.rst
@ -104,7 +104,6 @@ Syntax
                               q, mux, muy, muz, mu,
                               radius, diameter, omegax, omegay, omegaz,
                               angmomx, angmomy, angmomz, tqx, tqy, tqz,
-                               heatflow, temperature,
                               c_ID, c_ID[I], f_ID, f_ID[I], v_name,
                               i_name, d_name, i2_name[I], d2_name[I]

@ -131,8 +130,6 @@ Syntax
           omegax,omegay,omegaz = angular velocity of spherical particle
           angmomx,angmomy,angmomz = angular momentum of aspherical particle
           tqx,tqy,tqz = torque on finite-size particles
-           heatflow = rate of heat flow into particle
-           temperature = temperature of particle
           c_ID = per-atom vector calculated by a compute with ID
           c_ID[I] = Ith column of per-atom array calculated by a compute with ID, I can include wildcard (see below)
           f_ID = per-atom vector calculated by a fix with ID
--- a/doc/src/fix.rst
+++ b/doc/src/fix.rst
@ -428,6 +428,7 @@ accelerated styles exist.
 * :doc:`wall/body/polyhedron <fix_wall_body_polyhedron>` - time integration for body particles of style :doc:`rounded/polyhedron <Howto_body>`
 * :doc:`wall/colloid <fix_wall>` - Lennard-Jones wall interacting with finite-size particles
 * :doc:`wall/ees <fix_wall_ees>` - wall for ellipsoidal particles
+* :doc:`wall/flow <fix_wall_flow>` - flow boundary conditions
 * :doc:`wall/gran <fix_wall_gran>` - frictional wall(s) for granular simulations
 * :doc:`wall/gran/region <fix_wall_gran_region>` - :doc:`fix wall/region <fix_wall_region>` equivalent for use with granular particles
 * :doc:`wall/harmonic <fix_wall>` - harmonic spring wall
--- a/doc/src/fix_property_atom.rst
+++ b/doc/src/fix_property_atom.rst
@ -22,6 +22,8 @@ Syntax
       *mol* = molecule IDs
       *q* = charge
       *rmass* = per-atom mass
+       *temperature* = internal temperature of atom
+       *heatflow* = internal heat flow of atom
       i_name = new integer vector referenced by name
       d_name = new floating-point vector referenced by name
       i2_name = new integer array referenced by name
@ -59,14 +61,18 @@ these properties for each atom in the system when a data file is read.
 This fix augments the set of per-atom properties with new custom
 ones. This can be useful in several scenarios.

-If the atom style does not define molecule IDs, per-atom charge, or
-per-atom mass, they can be added using the *mol*\ , *q* or *rmass*
+If the atom style does not define molecule IDs, per-atom charge,
+per-atom mass, internal temperature, or internal heat flow, they can
+be added using the *mol*\ , *q*, *rmass*, *temperature*, or *heatflow*
 keywords.  This could be useful to define "molecules" to use as rigid
 bodies with the :doc:`fix rigid <fix_rigid>` command, or to carry
 around an extra flag with atoms (stored as a molecule ID) that can be
 used by various commands like :doc:`compute chunk/atom
 <compute_chunk_atom>` to group atoms without having to use the group
 command (which is limited to a total of 32 groups including *all*\ ).
+For finite-size particles, an internal temperature and heat flow can
+be used to model heat conduction as in the
+:doc:`GRANULAR package <Howto_granular>`.

 Another application is to use the *rmass* flag in order to have
 per-atom masses instead of per-type masses.  This could be used to
@ -85,9 +91,10 @@ properties that are not needed such as bond lists, which incurs some
 overhead when there are no bonds.

 In the future, we may add additional existing per-atom properties to
-fix property/atom, similar to *mol*\ , *q* or *rmass*\ , which
-"turn-on" specific properties defined by some atom styles, so they can
-be easily used by atom styles that do not define them.
+fix property/atom, similar to *mol*\ , *q*, *rmass*\ , *temperature*\ ,
+or *heatflow* which "turn-on" specific properties defined by some atom
+styles, so they can be easily used by atom styles that do not define
+them.

 More generally, the *i_name* and *d_name* options allow one or more
 new custom per-atom vectors to be defined.  Likewise the *i2_name* and
--- a/doc/src/fix_wall_flow.rst
+++ b/doc/src/fix_wall_flow.rst
@ -0,0 +1,175 @@
+.. index:: fix wall/flow
+.. index:: fix wall/flow/kk
+
+fix wall/flow command
+=====================
+
+Accelerator Variants: *wall/flow/kk*
+
+Syntax
+""""""
+
+.. code-block:: LAMMPS
+
+   fix ID group-ID wall/flow axis vflow T seed N coords ... keyword value
+
+* ID, group-ID are documented in :doc:`fix <fix>` command
+* wall/flow = style name of this fix command
+* axis = flow axis (*x*, *y*, or *z*)
+* vflow = generated flow velocity in *axis* direction (velocity units)
+* T = flow temperature (temperature units)
+* seed = random seed for stochasticity (positive integer)
+* N = number of walls
+* coords = list of N wall positions along the *axis* direction in ascending order (distance units)
+* zero or more keyword/value pairs may be appended
+* keyword = *units*
+
+  .. parsed-literal::
+
+       *units* value = *lattice* or *box*
+         *lattice* = wall positions are defined in lattice units
+         *box* = the wall positions are defined in simulation box units
+
+Examples
+""""""""
+
+.. code-block:: LAMMPS
+
+   fix 1 all wall/flow x 0.4 1.5 593894 4 2.0 4.0 6.0 8.0
+
+Description
+"""""""""""
+
+.. versionadded:: TBD
+
+This fix implements flow boundary conditions (FBC) introduced in
+:ref:`(Pavlov1) <fbc-Pavlov1>` and :ref:`(Pavlov2) <fbc-Pavlov2>`.
+The goal is to generate a stationary flow with a shifted Maxwell
+velocity distribution:
+
+.. math::
+
+   f_a(v_a) \propto \exp{\left(-\frac{m (v_a-v_{\text{flow}})^2}{2 kB T}\right)}
+
+where :math:`v_a` is the component of velocity along the specified
+*axis* argument (a = x,y,z), :math:`v_{\text{flow}}` is the flow
+velocity specified as the *vflow* argument, *T* is the specified flow
+temperature, *m* is the particle mass, and *kB* is the Boltzmann
+constant.
+
+This is achieved by defining a series of *N* transparent walls along
+the flow *axis* direction.  Each wall is at the specified position
+listed in the *coords* argument.  Note that an additional transparent
+wall is defined by the code at the boundary of the (periodic)
+simulation domain in the *axis* direction.  So there are effectively
+N+1 walls.
+
+Each time a particle in the specified group passes through one of the
+transparent walls, its velocity is re-assigned.  Particles not in the
+group do not interact with the wall. This can be used, for example, to
+add obstacles composed of atoms, or to simulate a solution of complex
+molecules in a one-atom liquid (note that the fix has been tested for
+one-atom systems only).
+
+Conceptually, the velocity re-assignment represents creation of a new
+particle within the system with simultaneous removal of the particle
+which passed through the wall.  The velocity components in directions
+parallel to the wall are re-assigned according to the standard Maxwell
+velocity distribution for the specified temperature *T*.  The velocity
+component perpendicular to the wall is re-assigned according to the
+shifted Maxwell distribution defined above:
+
+.. math::
+
+   f_{\text{a generated}}(v_a) \propto v_a f_a(v_a)
+
+It can be shown that for an ideal-gas scenario this procedure makes
+the velocity distribution of particles between walls exactly as
+desired.
+
+Since in most cases simulated systems are not an ideal gas, multiple
+walls can be defined, since a single wall may not be sufficient for
+maintaining a stationary flow without "congestion" which can manifest
+itself as regions in the flow with increased particle density located
+upstream from static obstacles.
+
+For the same reason, the actual temperature and velocity of the
+generated flow may differ from what is requested.  The degree of
+discrepancy is determined by how different from an ideal gas the
+simulated system is.  Therefore, a calibration procedure may be
+required for such a system as described in :ref:`(Pavlov)
+<fbc-Pavlov2>`.
+
+Note that the interactions between particles on different sides of a
+transparent wall are not disabled or neglected.  Likewise particle
+positions are not altered by the velocity reassignment.  This removes
+the need to modify the force field to work correctly in cases when a
+particle is close to a wall.
+
+For example, if particle positions were uniformly redistributed across
+the surface of a wall, two particles could end up too close to each
+other, potentially causing the simulation to explode.  However due to
+this compromise, some collective phenomena such as regions with
+increased/decreased density or collective movements are not fully
+removed when particles cross a wall.  This unwanted consequence can
+also be potentially mitigated by using more multiple walls.
+
+.. note::
+
+  When the specified flow has a high velocity, a lost atoms error can
+  occur (see :doc:`error messages <Errors_messages>`).  If this
+  happens, you should ensure the checks for neighbor list rebuilds,
+  set via the :doc:`neigh_modify <neigh_modify>` command, are as
+  conservative as possible (every timestep if needed).  Those are the
+  default settings.
+
+Restart, fix_modify, output, run start/stop, minimize info
+"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
+
+No information about this fix is written to :doc:`binary restart files
+<restart>`.
+
+None of the :doc:`fix_modify <fix_modify>` options are relevant to
+this fix.
+
+No global or per-atom quantities are stored by this fix for access by
+various :doc:`output commands <Howto_output>`.
+
+No parameter of this fix can be used with the *start/stop* keywords of
+the :doc:`run <run>` command.
+
+This fix is not invoked during :doc:`energy minimization <minimize>`.
+
+Restrictions
+""""""""""""
+
+Fix *wall_flow* is part of the EXTRA-FIX package.  It is only enabled
+if LAMMPS was built with that package.  See the :doc:`Build package
+<Build_package>` page for more info.
+
+Flow boundary conditions should not be used with rigid bodies such as
+those defined by a "fix rigid" command.
+
+This fix can only be used with periodic boundary conditions along the
+flow axis. The size of the box in this direction must not change. Also,
+the fix is designed to work only in an orthogonal simulation box.
+
+Related commands
+""""""""""""""""
+
+:doc:`fix wall/reflect <fix_wall>` command
+
+Default
+"""""""
+
+The default for the units keyword is lattice.
+
+----------
+
+.. _fbc-Pavlov1:
+
+**(Pavlov1)** Pavlov, Kolotinskii, Stegailov, "GPU-Based Molecular Dynamics of Turbulent Liquid Flows with OpenMM", Proceedings of PPAM-2022, LNCS (Springer), vol. 13826, pp. 346-358 (2023)
+
+.. _fbc-Pavlov2:
+
+**(Pavlov2)** Pavlov, Galigerov, Kolotinskii, Nikolskiy, Stegailov, "GPU-based Molecular Dynamics of Fluid Flows: Reaching for Turbulence", Int. J. High Perf. Comp. Appl., (2024)
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@ -1770,6 +1770,7 @@ Kolafa
 Kollman
 kolmogorov
 Kolmogorov
+Kolotinskii
 Kondor
 konglt
 Koning
@ -2775,6 +2776,7 @@ PEigenDense
 Peng
 peptide
 peratom
+Perf
 Pergamon
 pergrid
 peri
@ -3887,7 +3889,9 @@ Verlet
 versa
 Verstraelen
 ves
+vf
 vflag
+vflow
 vfrac
 vhi
 vibrational
--- a/examples/granular/in.pour.heat
+++ b/examples/granular/in.pour.heat
@ -73,7 +73,8 @@ thermo          100

 timestep        0.001

-#dump           1 all custom 1000 ${name}.dump id type radius mass x y z temperature heatflow
+compute        1 all property/atom temperature heatflow
+#dump           1 all custom 1000 ${name}.dump id type radius mass x y z c_1[*]

 run             100000

--- a/examples/wall/in.wall.flow
+++ b/examples/wall/in.wall.flow
@ -0,0 +1,79 @@
+variable nrun equal 1000
+variable dump_count equal 10
+
+variable nwall equal 4
+variable w1 equal 67
+variable w2 equal 71
+variable w3 equal 75
+variable w4 equal 79
+
+variable x_cylinder equal 20
+variable y_cylinder equal 17
+variable r_cylinder equal 4
+
+variable MASS  equal 1
+variable TEMP  equal 0.4
+variable VFLOW equal 0.5
+
+units         lj
+atom_style    atomic
+
+lattice       fcc 0.3
+region        sim_box block 0 84 0 34 0 10
+
+boundary      p p p
+
+create_box    2 sim_box
+region        reg_cylinder cylinder z ${x_cylinder} ${y_cylinder} ${r_cylinder} EDGE EDGE
+
+create_atoms  1 box
+
+## setup obstacle ##
+group  g_obst region reg_cylinder
+group  g_flow subtract all g_obst
+set    group g_obst type 2 
+
+mass          1 ${MASS}
+mass          2 ${MASS}
+
+velocity  g_flow create ${TEMP} 4928459 rot yes dist gaussian
+velocity  g_obst set    0.0 0.0 0.0
+
+pair_style  lj/cut 1.122462
+pair_coeff  1 1 1.0 1.0
+pair_coeff  1 2 1.0 1.0
+pair_coeff  2 2 1.0 1.0
+pair_modify shift yes
+
+neighbor      0.3 bin
+neigh_modify  delay 0 every 20 check no
+
+fix    1     g_flow nve
+fix    2     g_flow wall/flow  x ${VFLOW} ${TEMP} 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+
+variable dump_every equal ${nrun}/${dump_count}
+variable thermo_every equal ${dump_every}
+variable restart_every equal ${nrun}/10
+
+##### uncomment for grid aggregation #####
+#variable gr_Nx equal 42
+#variable gr_Ny equal 17
+#variable gr_Nz equal 1
+#variable gr_Nevery equal ${dump_every}
+#variable gr_Nrepeat equal 1
+#variable gr_Nfreq equal ${dump_every}
+#fix    3     g_flow ave/grid ${gr_Nevery} ${gr_Nrepeat} ${gr_Nfreq} ${gr_Nx} ${gr_Ny} ${gr_Nz} vx vy vz density/mass norm all ave one
+#compute ct_gridId g_flow property/grid ${gr_Nx} ${gr_Ny} ${gr_Nz} id
+#dump   dmp_grid  g_flow grid ${dump_every} grid.lammpstrj c_ct_gridId:grid:data f_3:grid:data[*]
+##########################################
+
+#dump   dmp_coord all atom ${dump_every} dump.lammpstrj
+
+#compute ct_Temp   g_flow temp/com
+#thermo_style custom step temp epair emol etotal press c_ct_Temp
+
+#restart  ${restart_every} flow.restart
+
+timestep 0.005
+thermo   ${thermo_every}
+run      ${nrun}
--- a/examples/wall/log.7Feb24.wall.flow.g++.1
+++ b/examples/wall/log.7Feb24.wall.flow.g++.1
@ -0,0 +1,182 @@
+LAMMPS (21 Nov 2023 - Development - patch_21Nov2023-758-ge33590b2fc-modified)
+  using 1 OpenMP thread(s) per MPI task
+variable nrun equal 1000
+variable dump_count equal 10
+
+variable nwall equal 4
+variable w1 equal 67
+variable w2 equal 71
+variable w3 equal 75
+variable w4 equal 79
+
+variable x_cylinder equal 20
+variable y_cylinder equal 17
+variable r_cylinder equal 4
+
+variable MASS  equal 1
+variable TEMP  equal 0.4
+variable VFLOW equal 0.5
+
+units         lj
+atom_style    atomic
+
+lattice       fcc 0.3
+Lattice spacing in x,y,z = 2.3712622 2.3712622 2.3712622
+region        sim_box block 0 84 0 34 0 10
+
+boundary      p p p
+
+create_box    2 sim_box
+Created orthogonal box = (0 0 0) to (199.18603 80.622915 23.712622)
+  1 by 1 by 1 MPI processor grid
+region        reg_cylinder cylinder z ${x_cylinder} ${y_cylinder} ${r_cylinder} EDGE EDGE
+region        reg_cylinder cylinder z 20 ${y_cylinder} ${r_cylinder} EDGE EDGE
+region        reg_cylinder cylinder z 20 17 ${r_cylinder} EDGE EDGE
+region        reg_cylinder cylinder z 20 17 4 EDGE EDGE
+
+create_atoms  1 box
+Created 114240 atoms
+  using lattice units in orthogonal box = (0 0 0) to (199.18603 80.622915 23.712622)
+  create_atoms CPU = 0.010 seconds
+
+## setup obstacle ##
+group  g_obst region reg_cylinder
+1950 atoms in group g_obst
+group  g_flow subtract all g_obst
+112290 atoms in group g_flow
+set    group g_obst type 2
+Setting atom values ...
+  1950 settings made for type
+
+mass          1 ${MASS}
+mass          1 1
+mass          2 ${MASS}
+mass          2 1
+
+velocity  g_flow create ${TEMP} 4928459 rot yes dist gaussian
+velocity  g_flow create 0.4 4928459 rot yes dist gaussian
+velocity  g_obst set    0.0 0.0 0.0
+
+pair_style  lj/cut 1.122462
+pair_coeff  1 1 1.0 1.0
+pair_coeff  1 2 1.0 1.0
+pair_coeff  2 2 1.0 1.0
+pair_modify shift yes
+
+neighbor      0.3 bin
+neigh_modify  delay 0 every 20 check no
+
+fix    1     g_flow nve
+fix    2     g_flow wall/flow  x ${VFLOW} ${TEMP} 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 ${TEMP} 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 71 ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 71 75 ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 71 75 79
+
+variable dump_every equal ${nrun}/${dump_count}
+variable dump_every equal 1000/${dump_count}
+variable dump_every equal 1000/10
+variable thermo_every equal ${dump_every}
+variable thermo_every equal 100
+variable restart_every equal ${nrun}/10
+variable restart_every equal 1000/10
+
+##### uncomment for grid aggregation #####
+#variable gr_Nx equal 42
+#variable gr_Ny equal 17
+#variable gr_Nz equal 1
+#variable gr_Nevery equal ${dump_every}
+#variable gr_Nrepeat equal 1
+#variable gr_Nfreq equal ${dump_every}
+#fix    3     g_flow ave/grid ${gr_Nevery} ${gr_Nrepeat} ${gr_Nfreq} ${gr_Nx} ${gr_Ny} ${gr_Nz} vx vy vz density/mass norm all ave one
+#compute ct_gridId g_flow property/grid ${gr_Nx} ${gr_Ny} ${gr_Nz} id
+#dump   dmp_grid  g_flow grid ${dump_every} grid.lammpstrj c_ct_gridId:grid:data f_3:grid:data[*]
+##########################################
+
+#dump   dmp_coord all atom ${dump_every} dump.lammpstrj
+
+#compute ct_Temp   g_flow temp/com
+#thermo_style custom step temp epair emol etotal press c_ct_Temp
+
+#restart  ${restart_every} flow.restart
+
+timestep 0.005
+thermo   ${thermo_every}
+thermo   100
+run      ${nrun}
+run      1000
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- fix wall/flow command: doi:10.1177/10943420231213013
+
+@Article{Pavlov-etal-IJHPCA-2024,
+ author = {Daniil Pavlov and Vladislav Galigerov and Daniil Kolotinskii and Vsevolod Nikolskiy and Vladimir Stegailov},
+ title = {GPU-based molecular dynamics of fluid flows: Reaching for turbulence},
+ journal = {The International Journal of High Performance Computing Applications},
+ year =    2024,
+ volume =  38,
+ number =  1,
+ pages =   34-49
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 20 steps, delay = 0 steps, check = no
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 1.422462
+  ghost atom cutoff = 1.422462
+  binsize = 0.711231, bins = 281 114 34
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 26.69 | 26.69 | 26.69 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0.39317221     0              0              0.58975315     0.11795063   
+       100   0.3671684      0.045118445    0              0.59586622     0.27378331   
+       200   0.3732041      0.036897471    0              0.59669873     0.24917809   
+       300   0.37432305     0.036501844    0              0.5979815      0.24715194   
+       400   0.37603886     0.035350565    0              0.59940392     0.24480762   
+       500   0.37617142     0.036949771    0              0.60120196     0.24862985   
+       600   0.37751983     0.036484268    0              0.60275905     0.24784635   
+       700   0.37787831     0.037327783    0              0.60414029     0.25060427   
+       800   0.37959242     0.036206184    0              0.60558983     0.2476903    
+       900   0.38019033     0.036874395    0              0.6071549      0.24984211   
+      1000   0.38070666     0.037068948    0              0.60812395     0.25041936   
+Loop time of 5.61598 on 1 procs for 1000 steps with 114240 atoms
+
+Performance: 76923.319 tau/day, 178.063 timesteps/s, 20.342 Matom-step/s
+99.7% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 2.6351     | 2.6351     | 2.6351     |   0.0 | 46.92
+Neigh   | 1.2994     | 1.2994     | 1.2994     |   0.0 | 23.14
+Comm    | 0.26576    | 0.26576    | 0.26576    |   0.0 |  4.73
+Output  | 0.0030531  | 0.0030531  | 0.0030531  |   0.0 |  0.05
+Modify  | 1.3019     | 1.3019     | 1.3019     |   0.0 | 23.18
+Other   |            | 0.1107     |            |       |  1.97
+
+Nlocal:         114240 ave      114240 max      114240 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:          20119 ave       20119 max       20119 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:         164018 ave      164018 max      164018 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 164018
+Ave neighs/atom = 1.4357318
+Neighbor list builds = 50
+Dangerous builds not checked
+Total wall time: 0:00:05
--- a/examples/wall/log.7Feb24.wall.flow.g++.4
+++ b/examples/wall/log.7Feb24.wall.flow.g++.4
@ -0,0 +1,182 @@
+LAMMPS (21 Nov 2023 - Development - patch_21Nov2023-758-ge33590b2fc-modified)
+  using 1 OpenMP thread(s) per MPI task
+variable nrun equal 1000
+variable dump_count equal 10
+
+variable nwall equal 4
+variable w1 equal 67
+variable w2 equal 71
+variable w3 equal 75
+variable w4 equal 79
+
+variable x_cylinder equal 20
+variable y_cylinder equal 17
+variable r_cylinder equal 4
+
+variable MASS  equal 1
+variable TEMP  equal 0.4
+variable VFLOW equal 0.5
+
+units         lj
+atom_style    atomic
+
+lattice       fcc 0.3
+Lattice spacing in x,y,z = 2.3712622 2.3712622 2.3712622
+region        sim_box block 0 84 0 34 0 10
+
+boundary      p p p
+
+create_box    2 sim_box
+Created orthogonal box = (0 0 0) to (199.18603 80.622915 23.712622)
+  4 by 1 by 1 MPI processor grid
+region        reg_cylinder cylinder z ${x_cylinder} ${y_cylinder} ${r_cylinder} EDGE EDGE
+region        reg_cylinder cylinder z 20 ${y_cylinder} ${r_cylinder} EDGE EDGE
+region        reg_cylinder cylinder z 20 17 ${r_cylinder} EDGE EDGE
+region        reg_cylinder cylinder z 20 17 4 EDGE EDGE
+
+create_atoms  1 box
+Created 114240 atoms
+  using lattice units in orthogonal box = (0 0 0) to (199.18603 80.622915 23.712622)
+  create_atoms CPU = 0.003 seconds
+
+## setup obstacle ##
+group  g_obst region reg_cylinder
+1950 atoms in group g_obst
+group  g_flow subtract all g_obst
+112290 atoms in group g_flow
+set    group g_obst type 2
+Setting atom values ...
+  1950 settings made for type
+
+mass          1 ${MASS}
+mass          1 1
+mass          2 ${MASS}
+mass          2 1
+
+velocity  g_flow create ${TEMP} 4928459 rot yes dist gaussian
+velocity  g_flow create 0.4 4928459 rot yes dist gaussian
+velocity  g_obst set    0.0 0.0 0.0
+
+pair_style  lj/cut 1.122462
+pair_coeff  1 1 1.0 1.0
+pair_coeff  1 2 1.0 1.0
+pair_coeff  2 2 1.0 1.0
+pair_modify shift yes
+
+neighbor      0.3 bin
+neigh_modify  delay 0 every 20 check no
+
+fix    1     g_flow nve
+fix    2     g_flow wall/flow  x ${VFLOW} ${TEMP} 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 ${TEMP} 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 ${nwall} ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 ${w1} ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 ${w2} ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 71 ${w3} ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 71 75 ${w4}
+fix    2     g_flow wall/flow  x 0.5 0.4 123 4 67 71 75 79
+
+variable dump_every equal ${nrun}/${dump_count}
+variable dump_every equal 1000/${dump_count}
+variable dump_every equal 1000/10
+variable thermo_every equal ${dump_every}
+variable thermo_every equal 100
+variable restart_every equal ${nrun}/10
+variable restart_every equal 1000/10
+
+##### uncomment for grid aggregation #####
+#variable gr_Nx equal 42
+#variable gr_Ny equal 17
+#variable gr_Nz equal 1
+#variable gr_Nevery equal ${dump_every}
+#variable gr_Nrepeat equal 1
+#variable gr_Nfreq equal ${dump_every}
+#fix    3     g_flow ave/grid ${gr_Nevery} ${gr_Nrepeat} ${gr_Nfreq} ${gr_Nx} ${gr_Ny} ${gr_Nz} vx vy vz density/mass norm all ave one
+#compute ct_gridId g_flow property/grid ${gr_Nx} ${gr_Ny} ${gr_Nz} id
+#dump   dmp_grid  g_flow grid ${dump_every} grid.lammpstrj c_ct_gridId:grid:data f_3:grid:data[*]
+##########################################
+
+#dump   dmp_coord all atom ${dump_every} dump.lammpstrj
+
+#compute ct_Temp   g_flow temp/com
+#thermo_style custom step temp epair emol etotal press c_ct_Temp
+
+#restart  ${restart_every} flow.restart
+
+timestep 0.005
+thermo   ${thermo_every}
+thermo   100
+run      ${nrun}
+run      1000
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Your simulation uses code contributions which should be cited:
+
+- fix wall/flow command: doi:10.1177/10943420231213013
+
+@Article{Pavlov-etal-IJHPCA-2024,
+ author = {Daniil Pavlov and Vladislav Galigerov and Daniil Kolotinskii and Vsevolod Nikolskiy and Vladimir Stegailov},
+ title = {GPU-based molecular dynamics of fluid flows: Reaching for turbulence},
+ journal = {The International Journal of High Performance Computing Applications},
+ year =    2024,
+ volume =  38,
+ number =  1,
+ pages =   34-49
+}
+
+CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE-CITE
+
+Generated 0 of 1 mixed pair_coeff terms from geometric mixing rule
+Neighbor list info ...
+  update: every = 20 steps, delay = 0 steps, check = no
+  max neighbors/atom: 2000, page size: 100000
+  master list distance cutoff = 1.422462
+  ghost atom cutoff = 1.422462
+  binsize = 0.711231, bins = 281 114 34
+  1 neighbor lists, perpetual/occasional/extra = 1 0 0
+  (1) pair lj/cut, perpetual
+      attributes: half, newton on
+      pair build: half/bin/atomonly/newton
+      stencil: half/bin/3d
+      bin: standard
+Per MPI rank memory allocation (min/avg/max) = 8.496 | 8.496 | 8.496 Mbytes
+   Step          Temp          E_pair         E_mol          TotEng         Press     
+         0   0.39317221     0              0              0.58975315     0.11795063   
+       100   0.36726398     0.045386014    0              0.59627716     0.27402111   
+       200   0.37384538     0.036574547    0              0.5973377      0.24836729   
+       300   0.37487455     0.036519645    0              0.59882654     0.24691726   
+       400   0.37591417     0.036405755    0              0.60027207     0.24700641   
+       500   0.37654714     0.037008829    0              0.60182459     0.24883444   
+       600   0.3778008      0.03663706     0              0.6033333      0.24874392   
+       700   0.37851338     0.036714175    0              0.60447928     0.24881829   
+       800   0.37984876     0.036237049    0              0.6060052      0.24843003   
+       900   0.38022763     0.036847615    0              0.60718407     0.24987198   
+      1000   0.38084717     0.037139994    0              0.60840575     0.25070072   
+Loop time of 2.20347 on 4 procs for 1000 steps with 114240 atoms
+
+Performance: 196054.093 tau/day, 453.829 timesteps/s, 51.845 Matom-step/s
+95.6% CPU use with 4 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 0.67927    | 0.70882    | 0.73473    |   2.4 | 32.17
+Neigh   | 0.32928    | 0.34467    | 0.36084    |   2.0 | 15.64
+Comm    | 0.3211     | 0.36609    | 0.40741    |   6.1 | 16.61
+Output  | 0.0017748  | 0.0032465  | 0.0046508  |   2.1 |  0.15
+Modify  | 0.71135    | 0.74424    | 0.76001    |   2.3 | 33.78
+Other   |            | 0.03641    |            |       |  1.65
+
+Nlocal:          28560 ave       29169 max       27884 min
+Histogram: 1 0 0 0 0 2 0 0 0 1
+Nghost:        6452.25 ave        6546 max        6368 min
+Histogram: 1 0 0 0 2 0 0 0 0 1
+Neighs:          40893 ave       42032 max       39445 min
+Histogram: 1 0 0 0 1 0 0 1 0 1
+
+Total # of neighbors = 163572
+Ave neighs/atom = 1.4318277
+Neighbor list builds = 50
+Dangerous builds not checked
+Total wall time: 0:00:02
--- a/src/.gitignore
+++ b/src/.gitignore
@ -1025,6 +1025,8 @@
 /fix_wall_colloid.h
 /fix_wall_ees.cpp
 /fix_wall_ees.h
+/fix_wall_flow.cpp
+/fix_wall_flow.h
 /fix_wall_region_ees.cpp
 /fix_wall_region_ees.h
 /fix_wall_reflect_stochastic.cpp
--- a/src/BPM/bond_bpm.cpp
+++ b/src/BPM/bond_bpm.cpp
@ -357,7 +357,6 @@ void BondBPM::process_broken(int i, int j)
  if (i < nlocal) {
    for (m = 0; m < num_bond[i]; m++) {
      if (bond_atom[i][m] == tag[j]) {
-        bond_type[i][m] = 0;
        n = num_bond[i];
        bond_type[i][m] = bond_type[i][n - 1];
        bond_atom[i][m] = bond_atom[i][n - 1];
@ -372,7 +371,6 @@ void BondBPM::process_broken(int i, int j)
  if (j < nlocal) {
    for (m = 0; m < num_bond[j]; m++) {
      if (bond_atom[j][m] == tag[i]) {
-        bond_type[j][m] = 0;
        n = num_bond[j];
        bond_type[j][m] = bond_type[j][n - 1];
        bond_atom[j][m] = bond_atom[j][n - 1];
--- a/src/EXTRA-FIX/fix_wall_flow.cpp
+++ b/src/EXTRA-FIX/fix_wall_flow.cpp
@ -0,0 +1,325 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Vladislav Galigerov (HSE),
+                         Daniil Pavlov (MIPT)
+------------------------------------------------------------------------- */
+
+#include "fix_wall_flow.h"
+
+#include "atom.h"
+#include "citeme.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
+#include "force.h"
+#include "input.h"
+#include "lattice.h"
+#include "math_const.h"
+#include "memory.h"
+#include "modify.h"
+#include "random_mars.h"
+#include "update.h"
+#include "variable.h"
+
+#include <algorithm>
+#include <cstring>
+#include <functional>
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+static const char cite_fix_wall_flow_c[] =
+    "fix wall/flow command: doi:10.1177/10943420231213013\n\n"
+    "@Article{Pavlov-etal-IJHPCA-2024,\n"
+    " author = {Daniil Pavlov and Vladislav Galigerov and Daniil Kolotinskii and Vsevolod "
+    "Nikolskiy and Vladimir Stegailov},\n"
+    " title = {GPU-based molecular dynamics of fluid flows: Reaching for turbulence},\n"
+    " journal = {The International Journal of High Performance Computing Applications},\n"
+    " year =    2024,\n"
+    " volume =  38,\n"
+    " number =  1,\n"
+    " pages =   34-49\n"
+    "}\n\n";
+
+FixWallFlow::FixWallFlow(LAMMPS *lmp, int narg, char **arg) :
+    Fix(lmp, narg, arg), flowax(FlowAxis::AX_X), flowvel(0.0), flowdir(0), rndseed(0),
+    current_segment(nullptr)
+{
+  if (lmp->citeme) lmp->citeme->add(cite_fix_wall_flow_c);
+  if (narg < 9) utils::missing_cmd_args(FLERR, "fix wall/flow", error);
+
+  if (domain->triclinic != 0)
+    error->all(FLERR, "Fix wall/flow cannot be used with triclinic simulation box");
+
+  dynamic_group_allow = 1;
+  bool do_abort = false;
+
+  int iarg = 3;
+  // parsing axis
+  if (strcmp(arg[iarg], "x") == 0)
+    flowax = FlowAxis::AX_X;
+  else if (strcmp(arg[iarg], "y") == 0)
+    flowax = FlowAxis::AX_Y;
+  else if (strcmp(arg[iarg], "z") == 0)
+    flowax = FlowAxis::AX_Z;
+  else
+    error->all(FLERR, "Illegal fix wall/flow argument: axis must by x or y or z, but {} specified",
+               arg[iarg]);
+
+  if (domain->periodicity[flowax] != 1)
+    error->all(FLERR,
+               "Fix wall/flow cannot be used with a non-periodic boundary along the flow axis");
+
+  ++iarg;
+  // parsing velocity
+  flowvel = utils::numeric(FLERR, arg[iarg], do_abort, lmp);
+  if (flowvel == 0.0) error->all(FLERR, "Illegal fix wall/flow argument: velocity cannot be 0");
+  if (flowvel > 0.0)
+    flowdir = 1;
+  else
+    flowdir = -1;
+  if (flowdir < 0)
+    error->all(FLERR, "Illegal fix wall/flow argument: negative direction is not supported yet");
+
+  ++iarg;
+  // parsing temperature
+  double flowtemp = utils::numeric(FLERR, arg[iarg], do_abort, lmp);
+  kT = lmp->force->boltz * flowtemp / force->mvv2e;
+
+  ++iarg;
+  // parsing seed
+  rndseed = utils::inumeric(FLERR, arg[iarg], do_abort, lmp);
+  if (rndseed <= 0)
+    error->all(FLERR, "Illegal fix wall/flow argument: random seed must be positive integer");
+
+  ++iarg;
+  // parsing wall count
+  int wallcount = utils::inumeric(FLERR, arg[iarg], do_abort, lmp);
+  if (wallcount <= 0)
+    error->all(FLERR, "Illegal fix wall/flow argument: wall count must be positive integer");
+
+  ++iarg;
+  // parsing walls
+  if (narg - iarg != wallcount && narg - iarg != wallcount + 2)
+    error->all(FLERR, "Wrong fix wall/flow wall count");
+
+  double scale = 0.0;
+  if (flowax == FlowAxis::AX_X)
+    scale = domain->lattice->xlattice;
+  else if (flowax == FlowAxis::AX_Y)
+    scale = domain->lattice->ylattice;
+  else if (flowax == FlowAxis::AX_Z)
+    scale = domain->lattice->zlattice;
+
+  if (narg - iarg == wallcount + 2) {
+    if (strcmp(arg[narg - 2], "units") != 0) error->all(FLERR, "Wrong fix wall/flow units command");
+    if (strcmp(arg[narg - 1], "box") == 0)
+      scale = 1.0;
+    else if (strcmp(arg[narg - 1], "lattice") != 0)
+      error->all(FLERR, "Wrong fix wall/flow units command");
+  }
+
+  walls.resize(wallcount + 2);
+  walls.front() = domain->boxlo[flowax];
+  for (int w = 1; w <= wallcount; ++w, ++iarg) {
+    walls[w] = utils::numeric(FLERR, arg[iarg], do_abort, lmp) * scale;
+  }
+  walls.back() = domain->boxhi[flowax];
+  if (!std::is_sorted(walls.begin(), walls.end(), std::less_equal<double>())) {
+    error->all(FLERR,
+               "Wrong fix wall/flow wall ordering or some walls are outside simulation domain");
+  }
+
+  if (std::adjacent_find(walls.begin(), walls.end()) != walls.end()) {
+    error->all(FLERR,
+               "Wrong fix wall/flow wall coordinates: some walls have the same coordinates or lie "
+               "on the boundary");
+  }
+
+  memory->grow(current_segment, atom->nmax, "WallFlow::current_segment");
+  atom->add_callback(Atom::GROW);
+  if (restart_peratom) atom->add_callback(Atom::RESTART);
+
+  maxexchange = 1;
+
+  random = new RanMars(lmp, rndseed + comm->me);
+}
+
+/* ---------------------------------------------------------------------- */
+
+FixWallFlow::~FixWallFlow()
+{
+  if (copymode) return;
+  atom->delete_callback(id, Atom::GROW);
+  if (restart_peratom) atom->delete_callback(id, Atom::RESTART);
+  memory->destroy(current_segment);
+
+  delete random;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixWallFlow::setmask()
+{
+  int mask = 0;
+
+  mask |= END_OF_STEP;
+
+  return mask;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallFlow::init()
+{
+  if (domain->triclinic != 0)
+    error->all(FLERR, "Fix wall/flow cannot be used with triclinic simulation box");
+
+  int nrigid = 0;
+  int box_change_flowax = 0;
+  for (const auto &ifix : modify->get_fix_list()) {
+    if (ifix->rigid_flag) nrigid++;
+    switch (flowax) {
+      case FlowAxis::AX_X:
+        if (ifix->box_change & Fix::BOX_CHANGE_X) box_change_flowax++;
+        break;
+      case FlowAxis::AX_Y:
+        if (ifix->box_change & Fix::BOX_CHANGE_Y) box_change_flowax++;
+        break;
+      case FlowAxis::AX_Z:
+        if (ifix->box_change & Fix::BOX_CHANGE_Z) box_change_flowax++;
+        break;
+    }
+  }
+
+  if (nrigid) error->all(FLERR, "Fix wall/flow is not compatible with rigid bodies");
+  if (box_change_flowax)
+    error->all(
+        FLERR,
+        "Fix wall/flow is not compatible with simulation box size changing along flow direction");
+
+  for (int i = 0; i < atom->nlocal; ++i) {
+    double pos = atom->x[i][flowax];
+    current_segment[i] = compute_current_segment(pos);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallFlow::end_of_step()
+{
+  double **x = atom->x;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; ++i) {
+    if (mask[i] & groupbit) {
+      double pos = x[i][flowax];
+      int prev_segment = current_segment[i];
+      current_segment[i] = compute_current_segment(pos);
+
+      if (prev_segment != current_segment[i]) generate_velocity(i);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallFlow::generate_velocity(int atom_i)
+{
+  const int newton_iteration_count = 10;
+  double *vel = atom->v[atom_i];
+
+  double *prmass = atom->rmass;
+  double *pmass = atom->mass;
+  double mass = 0.0;
+  if (prmass)
+    mass = prmass[atom_i];
+  else
+    mass = pmass[atom->type[atom_i]];
+
+  const double gamma = 1.0 / std::sqrt(2.0 * kT / mass);
+  double delta = gamma * flowvel;
+
+  const double edd = std::exp(-delta * delta) / MathConst::MY_PIS + delta * std::erf(delta);
+  const double probability_threshold = 0.5f * (1.f + delta / edd);
+
+  double direction = 1.0;
+
+  if (random->uniform() > probability_threshold) {
+    delta = -delta;
+    direction = -direction;
+  }
+
+  const double xi_0 = random->uniform();
+  const double F_inf = edd + delta;
+  const double xi = xi_0 * F_inf;
+  const double x_0 = (std::sqrt(delta * delta + 2) - delta) * 0.5;
+  double x = x_0;
+  for (int i = 0; i < newton_iteration_count; ++i) {
+    x -= (std::exp(x * x) * MathConst::MY_PIS * (xi - delta * std::erfc(x)) - 1.0) / (x + delta) *
+        0.5;
+  }
+
+  const double nu = x + delta;
+  const double v = nu / gamma;
+
+  vel[flowax] = v * direction;
+  vel[(flowax + 1) % 3] = random->gaussian() / (gamma * MathConst::MY_SQRT2);
+  vel[(flowax + 2) % 3] = random->gaussian() / (gamma * MathConst::MY_SQRT2);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixWallFlow::compute_current_segment(double pos) const
+{
+  int result = 0;
+  for (; result < (int)walls.size() - 1; ++result) {
+    if (pos >= walls[result] && pos < walls[result + 1]) { return result; }
+  }
+  return -1;    // -1 is "out of box" region
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallFlow::grow_arrays(int nmax)
+{
+  memory->grow(current_segment, nmax, "WallFlow::current_segment");
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixWallFlow::copy_arrays(int i, int j, int)
+{
+  current_segment[j] = current_segment[i];
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixWallFlow::pack_exchange(int i, double *buf)
+{
+  buf[0] = static_cast<double>(current_segment[i]);
+  return 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int FixWallFlow::unpack_exchange(int i, double *buf)
+{
+  current_segment[i] = static_cast<int>(buf[0]);
+  return 1;
+}
--- a/src/EXTRA-FIX/fix_wall_flow.h
+++ b/src/EXTRA-FIX/fix_wall_flow.h
@ -0,0 +1,61 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// clang-format off
+FixStyle(wall/flow,FixWallFlow);
+// clang-format on
+#else
+
+#ifndef LMP_FIX_WALL_FLOW_H
+#define LMP_FIX_WALL_FLOW_H
+
+#include "fix.h"
+
+namespace LAMMPS_NS {
+
+class FixWallFlow : public Fix {
+ public:
+  enum FlowAxis { AX_X = 0, AX_Y = 1, AX_Z = 2 };
+
+  FixWallFlow(class LAMMPS *, int, char **);
+  ~FixWallFlow() override;
+  int setmask() override;
+  void init() override;
+  void end_of_step() override;
+
+  void grow_arrays(int) override;
+  void copy_arrays(int, int, int) override;
+
+  int pack_exchange(int, double *) override;
+  int unpack_exchange(int, double *) override;
+
+ protected:
+  FlowAxis flowax;
+  double flowvel;
+  double kT;
+  std::vector<double> walls;
+
+  int flowdir;
+  int rndseed;
+  class RanMars *random;
+  int *current_segment;
+
+  int compute_current_segment(double pos) const;
+  void generate_velocity(int i);
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
--- a/src/GRANULAR/compute_fabric.cpp
+++ b/src/GRANULAR/compute_fabric.cpp
@ -184,7 +184,7 @@ void ComputeFabric::compute_vector()
  double nx, ny, nz;
  double ncinv, denom, fn, ft, prefactor;
  double br_tensor[6], ft_tensor[6], fn_tensor[6];
-  double trace_phi, trace_D, trace_Xfn, trace_Xft;
+  double trace_third_phi, trace_third_D, trace_third_Xfn, trace_third_Xft;
  double phi_ij[6] = {0.0};
  double Ac_ij[6] = {0.0};
  double D_ij[6] = {0.0};
@ -295,11 +295,11 @@ void ComputeFabric::compute_vector()
  MPI_Allreduce(phi_ij, temp_dbl, 6, MPI_DOUBLE, MPI_SUM, world);
  for (i = 0; i < 6; i++) phi_ij[i] = temp_dbl[i] * ncinv;

-  trace_phi = (1.0 / 3.0) * (phi_ij[0] + phi_ij[1] + phi_ij[2]);
+  trace_third_phi = (1.0 / 3.0) * (phi_ij[0] + phi_ij[1] + phi_ij[2]);

-  Ac_ij[0] = (15.0 / 2.0) * (phi_ij[0] - trace_phi);
-  Ac_ij[1] = (15.0 / 2.0) * (phi_ij[1] - trace_phi);
-  Ac_ij[2] = (15.0 / 2.0) * (phi_ij[2] - trace_phi);
+  Ac_ij[0] = (15.0 / 2.0) * (phi_ij[0] - trace_third_phi);
+  Ac_ij[1] = (15.0 / 2.0) * (phi_ij[1] - trace_third_phi);
+  Ac_ij[2] = (15.0 / 2.0) * (phi_ij[2] - trace_third_phi);
  Ac_ij[3] = (15.0 / 2.0) * (phi_ij[3]);
  Ac_ij[4] = (15.0 / 2.0) * (phi_ij[4]);
  Ac_ij[5] = (15.0 / 2.0) * (phi_ij[5]);
@ -419,14 +419,14 @@ void ComputeFabric::compute_vector()
    MPI_Allreduce(D_ij, temp_dbl, 6, MPI_DOUBLE, MPI_SUM, world);
    for (i = 0; i < 6; i++) D_ij[i] = temp_dbl[i];

-    trace_D = (1.0 / 3.0) * (D_ij[0] + D_ij[1] + D_ij[2]);
+    trace_third_D = (1.0 / 3.0) * (D_ij[0] + D_ij[1] + D_ij[2]);

-    br_tensor[0] = (15.0 / (6.0 * trace_D)) * (D_ij[0] - trace_D);
-    br_tensor[1] = (15.0 / (6.0 * trace_D)) * (D_ij[1] - trace_D);
-    br_tensor[2] = (15.0 / (6.0 * trace_D)) * (D_ij[2] - trace_D);
-    br_tensor[3] = (15.0 / (6.0 * trace_D)) * (D_ij[3]);
-    br_tensor[4] = (15.0 / (6.0 * trace_D)) * (D_ij[4]);
-    br_tensor[5] = (15.0 / (6.0 * trace_D)) * (D_ij[5]);
+    br_tensor[0] = (15.0 / (6.0 * trace_third_D)) * (D_ij[0] - trace_third_D);
+    br_tensor[1] = (15.0 / (6.0 * trace_third_D)) * (D_ij[1] - trace_third_D);
+    br_tensor[2] = (15.0 / (6.0 * trace_third_D)) * (D_ij[2] - trace_third_D);
+    br_tensor[3] = (15.0 / (6.0 * trace_third_D)) * (D_ij[3]);
+    br_tensor[4] = (15.0 / (6.0 * trace_third_D)) * (D_ij[4]);
+    br_tensor[5] = (15.0 / (6.0 * trace_third_D)) * (D_ij[5]);

    for (i = 0; i < ntensors; i++) {
      if (tensor_style[i] == BR) {
@ -439,17 +439,17 @@ void ComputeFabric::compute_vector()
    MPI_Allreduce(Xfn_ij, temp_dbl, 6, MPI_DOUBLE, MPI_SUM, world);
    for (i = 0; i < 6; i++) Xfn_ij[i] = temp_dbl[i];

-    trace_Xfn = (1.0 / 3.0) * (Xfn_ij[0] + Xfn_ij[1] + Xfn_ij[2]);
+    trace_third_Xfn = (1.0 / 3.0) * (Xfn_ij[0] + Xfn_ij[1] + Xfn_ij[2]);
  }

  if (fn_flag) {

-    fn_tensor[0] = (15.0 / (6.0 * trace_Xfn)) * (Xfn_ij[0] - trace_Xfn);
-    fn_tensor[1] = (15.0 / (6.0 * trace_Xfn)) * (Xfn_ij[1] - trace_Xfn);
-    fn_tensor[2] = (15.0 / (6.0 * trace_Xfn)) * (Xfn_ij[2] - trace_Xfn);
-    fn_tensor[3] = (15.0 / (6.0 * trace_Xfn)) * (Xfn_ij[3]);
-    fn_tensor[4] = (15.0 / (6.0 * trace_Xfn)) * (Xfn_ij[4]);
-    fn_tensor[5] = (15.0 / (6.0 * trace_Xfn)) * (Xfn_ij[5]);
+    fn_tensor[0] = (15.0 / (6.0 * trace_third_Xfn)) * (Xfn_ij[0] - trace_third_Xfn);
+    fn_tensor[1] = (15.0 / (6.0 * trace_third_Xfn)) * (Xfn_ij[1] - trace_third_Xfn);
+    fn_tensor[2] = (15.0 / (6.0 * trace_third_Xfn)) * (Xfn_ij[2] - trace_third_Xfn);
+    fn_tensor[3] = (15.0 / (6.0 * trace_third_Xfn)) * (Xfn_ij[3]);
+    fn_tensor[4] = (15.0 / (6.0 * trace_third_Xfn)) * (Xfn_ij[4]);
+    fn_tensor[5] = (15.0 / (6.0 * trace_third_Xfn)) * (Xfn_ij[5]);

    for (i = 0; i < ntensors; i++) {
      if (tensor_style[i] == FN) {
@ -462,14 +462,14 @@ void ComputeFabric::compute_vector()
    MPI_Allreduce(Xft_ij, temp_dbl, 6, MPI_DOUBLE, MPI_SUM, world);
    for (i = 0; i < 6; i++) Xft_ij[i] = temp_dbl[i];

-    trace_Xft = (1.0 / 3.0) * (Xft_ij[0] + Xft_ij[1] + Xft_ij[2]);
+    trace_third_Xft = (1.0 / 3.0) * (Xft_ij[0] + Xft_ij[1] + Xft_ij[2]);

-    ft_tensor[0] = (15.0 / (9.0 * trace_Xfn)) * (Xft_ij[0] - trace_Xft);
-    ft_tensor[1] = (15.0 / (9.0 * trace_Xfn)) * (Xft_ij[1] - trace_Xft);
-    ft_tensor[2] = (15.0 / (9.0 * trace_Xfn)) * (Xft_ij[2] - trace_Xft);
-    ft_tensor[3] = (15.0 / (9.0 * trace_Xfn)) * (Xft_ij[3]);
-    ft_tensor[4] = (15.0 / (9.0 * trace_Xfn)) * (Xft_ij[4]);
-    ft_tensor[5] = (15.0 / (9.0 * trace_Xfn)) * (Xft_ij[5]);
+    ft_tensor[0] = (15.0 / (9.0 * trace_third_Xfn)) * (Xft_ij[0] - trace_third_Xft);
+    ft_tensor[1] = (15.0 / (9.0 * trace_third_Xfn)) * (Xft_ij[1] - trace_third_Xft);
+    ft_tensor[2] = (15.0 / (9.0 * trace_third_Xfn)) * (Xft_ij[2] - trace_third_Xft);
+    ft_tensor[3] = (15.0 / (9.0 * trace_third_Xfn)) * (Xft_ij[3]);
+    ft_tensor[4] = (15.0 / (9.0 * trace_third_Xfn)) * (Xft_ij[4]);
+    ft_tensor[5] = (15.0 / (9.0 * trace_third_Xfn)) * (Xft_ij[5]);

    for (i = 0; i < ntensors; i++) {
      if (tensor_style[i] == FT) {
--- a/src/GRANULAR/gran_sub_mod_damping.cpp
+++ b/src/GRANULAR/gran_sub_mod_damping.cpp
@ -130,6 +130,12 @@ void GranSubModDampingTsuji::init()

 double GranSubModDampingTsuji::calculate_forces()
 {
-  damp_prefactor = damp * sqrt(gm->meff * gm->Fnormal / gm->delta);
+  // in case argument <= 0 due to precision issues
+  double sqrt1;
+  if (gm->delta > 0.0)
+    sqrt1 = MAX(0.0, gm->meff * gm->Fnormal / gm->delta);
+  else
+    sqrt1 = 0.0;
+  damp_prefactor = damp * sqrt(sqrt1);
  return -damp_prefactor * gm->vnnr;
 }
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@ -187,6 +187,8 @@ action fix_temp_rescale_kokkos.cpp
 action fix_temp_rescale_kokkos.h
 action fix_viscous_kokkos.cpp
 action fix_viscous_kokkos.h
+action fix_wall_flow_kokkos.cpp fix_wall_flow.cpp
+action fix_wall_flow_kokkos.h fix_wall_flow.h
 action fix_wall_gran_kokkos.cpp fix_wall_gran.cpp
 action fix_wall_gran_kokkos.h fix_wall_gran.h
 action fix_wall_gran_old.cpp fix_wall_gran.cpp
--- a/src/KOKKOS/fix_wall_flow_kokkos.cpp
+++ b/src/KOKKOS/fix_wall_flow_kokkos.cpp
@ -0,0 +1,291 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Vladislav Galigerov (HSE),
+                         Daniil Pavlov (MIPT)
+------------------------------------------------------------------------- */
+
+#include "fix_wall_flow_kokkos.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "force.h"
+#include "math_const.h"
+#include "memory_kokkos.h"
+
+using namespace LAMMPS_NS;
+
+template <class DeviceType>
+FixWallFlowKokkos<DeviceType>::FixWallFlowKokkos(LAMMPS *lmp, int narg, char **arg) :
+    FixWallFlow(lmp, narg, arg), rand_pool(rndseed + comm->me)
+{
+  kokkosable = 1;
+  exchange_comm_device = sort_device = 1;
+  atomKK = (AtomKokkos *) atom;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+  datamask_read = X_MASK | RMASS_MASK | TYPE_MASK | MASK_MASK;
+  datamask_modify = V_MASK;
+
+  memory->destroy(current_segment);
+  current_segment = nullptr;
+  grow_arrays(atomKK->nmax);
+
+  d_walls = d_walls_t("FixWallFlowKokkos::walls", walls.size());
+  auto h_walls = Kokkos::create_mirror_view(d_walls);
+  for (int i = 0; i < (int) walls.size(); ++i) h_walls(i) = walls[i];
+  Kokkos::deep_copy(d_walls, h_walls);
+}
+
+template <class DeviceType> FixWallFlowKokkos<DeviceType>::~FixWallFlowKokkos()
+{
+  if (copymode) return;
+  memoryKK->destroy_kokkos(k_current_segment, current_segment);
+}
+
+template <class DeviceType> void FixWallFlowKokkos<DeviceType>::init()
+{
+  atomKK->sync(execution_space, datamask_read);
+  k_current_segment.template sync<DeviceType>();
+  d_x = atomKK->k_x.template view<DeviceType>();
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixWallFlowInit>(0, atom->nlocal), *this);
+  copymode = 0;
+
+  k_current_segment.template modify<DeviceType>();
+}
+
+template <class DeviceType>
+KOKKOS_INLINE_FUNCTION void FixWallFlowKokkos<DeviceType>::operator()(TagFixWallFlowInit,
+                                                                      const int &i) const
+{
+  double pos = d_x(i, flowax);
+  d_current_segment(i) = compute_current_segment_kk(pos);
+}
+
+template <class DeviceType> void FixWallFlowKokkos<DeviceType>::end_of_step()
+{
+  atomKK->sync(execution_space, datamask_read);
+  k_current_segment.template sync<DeviceType>();
+
+  d_x = atomKK->k_x.template view<DeviceType>();
+  d_v = atomKK->k_v.template view<DeviceType>();
+  d_type = atomKK->k_type.template view<DeviceType>();
+  d_mask = atomKK->k_mask.template view<DeviceType>();
+  d_mass = atomKK->k_mass.template view<DeviceType>();
+  d_rmass = atomKK->k_rmass.template view<DeviceType>();
+
+  copymode = 1;
+  if (d_rmass.data()) {
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<DeviceType, TagFixWallFlowEndOfStep<RMassTag>>(0, atom->nlocal), *this);
+  } else {
+    Kokkos::parallel_for(
+        Kokkos::RangePolicy<DeviceType, TagFixWallFlowEndOfStep<MassTag>>(0, atom->nlocal), *this);
+  }
+  copymode = 0;
+  atomKK->modified(execution_space, datamask_modify);
+  k_current_segment.template modify<DeviceType>();
+}
+
+template <class DeviceType>
+template <class MTag>
+KOKKOS_INLINE_FUNCTION void FixWallFlowKokkos<DeviceType>::operator()(TagFixWallFlowEndOfStep<MTag>,
+                                                                      const int &atom_i) const
+{
+  if (d_mask[atom_i] & groupbit) {
+    double pos = d_x(atom_i, flowax);
+    int prev_segment = d_current_segment(atom_i);
+    d_current_segment(atom_i) = compute_current_segment_kk(pos);
+    if (prev_segment != d_current_segment(atom_i)) { generate_velocity_kk<MTag>(atom_i); }
+  }
+}
+
+template <class DeviceType>
+template <class MTag>
+KOKKOS_INLINE_FUNCTION void FixWallFlowKokkos<DeviceType>::generate_velocity_kk(int atom_i) const
+{
+  const int newton_iteration_count = 10;
+  double mass = get_mass(MTag(), atom_i);
+  const double gamma = 1.0 / std::sqrt(2.0 * kT / mass);
+  double delta = gamma * flowvel;
+
+  const double edd = std::exp(-delta * delta) / MathConst::MY_PIS + delta * std::erf(delta);
+  const double probability_threshold = 0.5 * (1. + delta / edd);
+
+  double direction = 1.0;
+
+  rand_type_t rand_gen = rand_pool.get_state();
+
+  if (/*random->uniform()*/ rand_gen.drand() > probability_threshold) {
+    delta = -delta;
+    direction = -direction;
+  }
+
+  const double xi_0 = rand_gen.drand();    //random->uniform();
+  const double F_inf = edd + delta;
+  const double xi = xi_0 * F_inf;
+  const double x_0 = (std::sqrt(delta * delta + 2) - delta) * 0.5;
+  double x = x_0;
+  for (int i = 0; i < newton_iteration_count; ++i) {
+    x -= (std::exp(x * x) * MathConst::MY_PIS * (xi - delta * std::erfc(x)) - 1.0) / (x + delta) *
+        0.5;
+  }
+
+  const double nu = x + delta;
+  const double v = nu / gamma;
+
+  d_v(atom_i, flowax) = v * direction;
+  d_v(atom_i, (flowax + 1) % 3) =
+      /*random->gaussian()*/ rand_gen.normal() / (gamma * MathConst::MY_SQRT2);
+  d_v(atom_i, (flowax + 2) % 3) =
+      /*random->gaussian()*/ rand_gen.normal() / (gamma * MathConst::MY_SQRT2);
+
+  rand_pool.free_state(rand_gen);
+}
+
+template <class DeviceType>
+KOKKOS_INLINE_FUNCTION int
+FixWallFlowKokkos<DeviceType>::compute_current_segment_kk(double pos) const
+{
+  int result = 0;
+  for (; result < (int) d_walls.extent(0) - 1; ++result) {
+    if (pos >= d_walls[result] && pos < d_walls[result + 1]) { return result; }
+  }
+  return -1;    // -1 is "out of box" region
+}
+
+template <class DeviceType> void FixWallFlowKokkos<DeviceType>::grow_arrays(int nmax)
+{
+  k_current_segment.template sync<DeviceType>();
+  memoryKK->grow_kokkos(k_current_segment, current_segment, nmax, "WallFlowKK::current_segment");
+  k_current_segment.template modify<DeviceType>();
+
+  d_current_segment = k_current_segment.template view<DeviceType>();
+  h_current_segment = k_current_segment.template view<LMPHostType>();
+}
+
+template <class DeviceType> void FixWallFlowKokkos<DeviceType>::copy_arrays(int i, int j, int)
+{
+  k_current_segment.template sync<LMPHostType>();
+  h_current_segment(j) = h_current_segment(i);
+  k_current_segment.template modify<LMPHostType>();
+}
+
+/* ----------------------------------------------------------------------
+   sort local atom-based arrays
+------------------------------------------------------------------------- */
+
+template <class DeviceType>
+void FixWallFlowKokkos<DeviceType>::sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorter)
+{
+  // always sort on the device
+
+  k_current_segment.sync_device();
+
+  Sorter.sort(LMPDeviceType(), k_current_segment.d_view);
+
+  k_current_segment.modify_device();
+}
+
+template <class DeviceType> int FixWallFlowKokkos<DeviceType>::pack_exchange(int i, double *buf)
+{
+  k_current_segment.sync_host();
+  buf[0] = static_cast<double>(h_current_segment(i));
+  return 1;
+}
+
+template <class DeviceType>
+KOKKOS_INLINE_FUNCTION void FixWallFlowKokkos<DeviceType>::operator()(TagFixWallFlowPackExchange,
+                                                                      const int &mysend) const
+{
+  const int send_i = d_sendlist(mysend);
+  const int segment = d_current_segment(send_i);
+  d_buf(mysend) = static_cast<double>(segment);
+
+  const int copy_i = d_copylist(mysend);
+  if (copy_i > -1) { d_current_segment(send_i) = d_current_segment(copy_i); }
+}
+
+template <class DeviceType>
+int FixWallFlowKokkos<DeviceType>::pack_exchange_kokkos(const int &nsend,
+                                                        DAT::tdual_xfloat_2d &k_buf,
+                                                        DAT::tdual_int_1d k_sendlist,
+                                                        DAT::tdual_int_1d k_copylist,
+                                                        ExecutionSpace /*space*/)
+{
+  k_current_segment.template sync<DeviceType>();
+
+  k_buf.template sync<DeviceType>();
+  k_sendlist.template sync<DeviceType>();
+  k_copylist.template sync<DeviceType>();
+
+  d_sendlist = k_sendlist.view<DeviceType>();
+  d_copylist = k_copylist.view<DeviceType>();
+
+  d_buf = typename ArrayTypes<DeviceType>::t_xfloat_1d_um(k_buf.template view<DeviceType>().data(),
+                                                          k_buf.extent(0) * k_buf.extent(1));
+
+  copymode = 1;
+
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixWallFlowPackExchange>(0, nsend),
+                       *this);
+
+  copymode = 0;
+
+  k_buf.template modify<DeviceType>();
+  k_current_segment.template modify<DeviceType>();
+
+  return nsend;
+}
+
+template <class DeviceType> int FixWallFlowKokkos<DeviceType>::unpack_exchange(int i, double *buf)
+{
+  k_current_segment.sync_host();
+  h_current_segment(i) = static_cast<int>(buf[0]);
+  k_current_segment.modify_host();
+  return 1;
+}
+
+template <class DeviceType>
+KOKKOS_INLINE_FUNCTION void FixWallFlowKokkos<DeviceType>::operator()(TagFixWallFlowUnpackExchange,
+                                                                      const int &i) const
+{
+  int index = d_indices(i);
+  if (index > -1) { d_current_segment(index) = static_cast<int>(d_buf(i)); }
+}
+
+template <class DeviceType>
+void FixWallFlowKokkos<DeviceType>::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
+                                                           DAT::tdual_int_1d &k_indices, int nrecv,
+                                                           int /*nrecv1*/, int /*nextrarecv1*/,
+                                                           ExecutionSpace /*space*/)
+{
+  d_buf = typename ArrayTypes<DeviceType>::t_xfloat_1d_um(k_buf.template view<DeviceType>().data(),
+                                                          k_buf.extent(0) * k_buf.extent(1));
+  d_indices = k_indices.view<DeviceType>();
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixWallFlowUnpackExchange>(0, nrecv),
+                       *this);
+  copymode = 0;
+
+  k_current_segment.template modify<DeviceType>();
+}
+
+namespace LAMMPS_NS {
+template class FixWallFlowKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class FixWallFlowKokkos<LMPHostType>;
+#endif
+}    // namespace LAMMPS_NS
--- a/src/KOKKOS/fix_wall_flow_kokkos.h
+++ b/src/KOKKOS/fix_wall_flow_kokkos.h
@ -0,0 +1,130 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   LAMMPS development team: developers@lammps.org
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// clang-format off
+FixStyle(wall/flow/kk,FixWallFlowKokkos<LMPDeviceType>);
+FixStyle(wall/flow/kk/device,FixWallFlowKokkos<LMPDeviceType>);
+FixStyle(wall/flow/kk/host,FixWallFlowKokkos<LMPHostType>);
+// clang-format on
+#else
+
+// clang-format off
+#ifndef LMP_FIX_WALL_FLOW_KOKKOS_H
+#define LMP_FIX_WALL_FLOW_KOKKOS_H
+
+#include "fix_wall_flow.h"
+#include "kokkos_type.h"
+#include "kokkos_base.h"
+#include "Kokkos_Random.hpp"
+#include "comm_kokkos.h"
+
+namespace LAMMPS_NS {
+
+struct TagFixWallFlowInit{};
+template<class MTag>
+struct TagFixWallFlowEndOfStep{};
+struct TagFixWallFlowPackExchange{};
+struct TagFixWallFlowUnpackExchange{};
+
+template<class DeviceType>
+class FixWallFlowKokkos : public FixWallFlow, public KokkosBase {
+ public:
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  struct MassTag{};
+  struct RMassTag{};
+  FixWallFlowKokkos(class LAMMPS *, int, char **);
+  ~FixWallFlowKokkos();
+
+  void init() override;
+  void end_of_step() override;
+  void grow_arrays(int) override;
+  void copy_arrays(int, int, int) override;
+  void sort_kokkos(Kokkos::BinSort<KeyViewType, BinOp> &Sorter) override;
+  int pack_exchange(int, double *) override;
+  int unpack_exchange(int, double *) override;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (TagFixWallFlowInit, const int&) const;
+
+  template<class MTag>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixWallFlowEndOfStep<MTag>, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixWallFlowPackExchange, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixWallFlowUnpackExchange, const int&) const;
+
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space) override;
+
+  void unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,
+                              DAT::tdual_int_1d &indices,int nrecv,
+                              int /*nrecv1*/, int /*nextrarecv1*/,
+                              ExecutionSpace space) override;
+ protected:
+  typename AT::t_x_array d_x;
+  typename AT::t_v_array d_v;
+  typename AT::t_int_1d d_type;
+  typename AT::t_int_1d d_mask;
+
+  typename AT::t_float_1d d_mass;
+  typename AT::t_float_1d d_rmass;
+
+  typedef typename AT::t_xfloat_1d d_walls_t;
+  typedef Kokkos::Random_XorShift64_Pool<DeviceType> rand_pool_t;
+  typedef typename rand_pool_t::generator_type rand_type_t;
+
+  typename AT::tdual_int_1d k_current_segment;
+  typename AT::t_int_1d d_current_segment;
+  typename HAT::t_int_1d h_current_segment;
+
+  typename AT::t_int_1d d_sendlist;
+  typename AT::t_xfloat_1d d_buf;
+  typename AT::t_int_1d d_copylist;
+  typename AT::t_int_1d d_indices;
+
+  d_walls_t d_walls;
+
+  rand_pool_t rand_pool;
+
+  template<class MTag>
+  KOKKOS_INLINE_FUNCTION
+  void generate_velocity_kk(int atom_i) const;
+
+  KOKKOS_INLINE_FUNCTION
+  int compute_current_segment_kk(double pos) const;
+
+  KOKKOS_INLINE_FUNCTION
+  double get_mass(MassTag, int atom_i) const
+  {
+    return d_mass(d_type(atom_i));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  double get_mass(RMassTag, int atom_i) const
+  {
+    return d_rmass(atom_i);
+  }
+};
+
+}
+
+#endif
+#endif
+
--- a/src/KOKKOS/mliap_unified_couple_kokkos.pyx
+++ b/src/KOKKOS/mliap_unified_couple_kokkos.pyx
@ -13,6 +13,7 @@ from libc.stdint cimport uintptr_t
 cimport cython
 from cpython.ref cimport PyObject
 from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy


 cdef extern from "lammps.h" namespace "LAMMPS_NS":
@ -451,15 +452,24 @@ cdef public object mliap_unified_connect_kokkos(char *fname, MLIAPDummyModel * m

    cdef int nelements = <int>len(unified.element_types)
    cdef char **elements = <char**>malloc(nelements * sizeof(char*))
+    cdef char * c_str
+    cdef char * s
+    cdef ssize_t slen

    if not elements:
        raise MemoryError("failed to allocate memory for element names")

-    cdef char *elem_name
    for i, elem in enumerate(unified.element_types):
-        elem_name_bytes = elem.encode('UTF-8')
-        elem_name = elem_name_bytes
-        elements[i] = &elem_name[0]
+        py_str = elem.encode('UTF-8')
+        s = py_str
+        slen = len(py_str)
+        c_str = <char *>malloc((slen+1)*sizeof(char))
+        if not c_str:
+            raise MemoryError("failed to allocate memory for element names")
+        memcpy(c_str, s, slen)
+        c_str[slen] = 0
+        elements[i] = c_str
+
    unified_int.descriptor.set_elements(elements, nelements)
    unified_int.model.nelements = nelements

--- a/src/KOKKOS/mliap_unified_kokkos.cpp
+++ b/src/KOKKOS/mliap_unified_kokkos.cpp
@ -315,7 +315,6 @@ void LAMMPS_NS::update_pair_forces(MLIAPDataKokkosDevice *data, double *fij)
    int i = pair_i[ii];
    int j = j_atoms[ii];
    // must not count any contribution where i is not a local atom
-    if (i < nlocal) {
      Kokkos::atomic_add(&f[i*3+0], fij[ii3+0]);
      Kokkos::atomic_add(&f[i*3+1], fij[ii3+1]);
      Kokkos::atomic_add(&f[i*3+2], fij[ii3+2]);
@ -352,7 +351,6 @@ void LAMMPS_NS::update_pair_forces(MLIAPDataKokkosDevice *data, double *fij)
          Kokkos::atomic_add(&d_vatom(j,3), 0.5*v[3]);
          Kokkos::atomic_add(&d_vatom(j,4), 0.5*v[4]);
          Kokkos::atomic_add(&d_vatom(j,5), 0.5*v[5]);
-        }
      }
    }
  });
@ -382,11 +380,9 @@ void LAMMPS_NS::update_atom_energy(MLIAPDataKokkosDevice *data, double *ei)

  Kokkos::parallel_reduce(nlocal, KOKKOS_LAMBDA(int i, double &local_sum){
    double e = ei[i];
-    // must not count any contribution where i is not a local atom
-    if (i < nlocal) {
-      d_eatoms[i] = e;
-      local_sum += e;
-    }
+    
+    d_eatoms[i] = e;
+    local_sum += e;
  },*data->energy);
 }

--- a/src/ML-IAP/mliap_unified.cpp
+++ b/src/ML-IAP/mliap_unified.cpp
@ -254,10 +254,8 @@ void LAMMPS_NS::update_pair_energy(MLIAPData *data, double *eij)
    double e = 0.5 * eij[ii];

    // must not count any contribution where i is not a local atom
-    if (i < nlocal) {
-      data->eatoms[i] += e;
-      e_total += e;
-    }
+    data->eatoms[i] += e;
+    e_total += e;
  }
  data->energy = e_total;
 }
@ -277,17 +275,14 @@ void LAMMPS_NS::update_pair_forces(MLIAPData *data, double *fij)
    int i = data->pair_i[ii];
    int j = data->jatoms[ii];

-    // must not count any contribution where i is not a local atom
-    if (i < nlocal) {
-      f[i][0] += fij[ii3];
-      f[i][1] += fij[ii3 + 1];
-      f[i][2] += fij[ii3 + 2];
-      f[j][0] -= fij[ii3];
-      f[j][1] -= fij[ii3 + 1];
-      f[j][2] -= fij[ii3 + 2];
+    f[i][0] += fij[ii3];
+    f[i][1] += fij[ii3 + 1];
+    f[i][2] += fij[ii3 + 2];
+    f[j][0] -= fij[ii3];
+    f[j][1] -= fij[ii3 + 1];
+    f[j][2] -= fij[ii3 + 2];

-      if (data->vflag) data->pairmliap->v_tally(i, j, &fij[ii3], data->rij[ii]);
-    }
+    if (data->vflag) data->pairmliap->v_tally(i, j, &fij[ii3], data->rij[ii]);
  }
 }

--- a/src/ML-IAP/mliap_unified_couple.pyx
+++ b/src/ML-IAP/mliap_unified_couple.pyx
@ -8,6 +8,7 @@ import lammps.mliap
 cimport cython
 from cpython.ref cimport PyObject
 from libc.stdlib cimport malloc, free
+from libc.string cimport memcpy


 cdef extern from "lammps.h" namespace "LAMMPS_NS":
@ -387,15 +388,26 @@ cdef public object mliap_unified_connect(char *fname, MLIAPDummyModel * model,

    cdef int nelements = <int>len(unified.element_types)
    cdef char **elements = <char**>malloc(nelements * sizeof(char*))
+    cdef char * c_str
+    cdef char * s
+    cdef ssize_t slen

    if not elements:
        raise MemoryError("failed to allocate memory for element names")

-    cdef char *elem_name
    for i, elem in enumerate(unified.element_types):
-        elem_name_bytes = elem.encode('UTF-8')
-        elem_name = elem_name_bytes
-        elements[i] = &elem_name[0]
+        py_str = elem.encode('UTF-8')
+
+        s = py_str
+        slen = len(py_str)
+        c_str = <char *>malloc((slen+1)*sizeof(char))
+        if not c_str:
+            raise MemoryError("failed to allocate memory for element names")
+        memcpy(c_str, s, slen)
+        c_str[slen] = 0
+        
+        elements[i] = c_str
+    
    unified_int.descriptor.set_elements(elements, nelements)
    unified_int.model.nelements = nelements

--- a/src/compute_property_atom.cpp
+++ b/src/compute_property_atom.cpp
@ -205,6 +205,14 @@ ComputePropertyAtom::ComputePropertyAtom(LAMMPS *lmp, int narg, char **arg) :
      if (!atom->omega_flag)
        error->all(FLERR,"Compute property/atom {} is not available", arg[iarg]);
      pack_choice[i] = &ComputePropertyAtom::pack_omegaz;
+    } else if (strcmp(arg[iarg],"temperature") == 0) {
+      if (!atom->temperature_flag)
+        error->all(FLERR,"Compute property/atom {} is not available", arg[iarg]);
+      pack_choice[i] = &ComputePropertyAtom::pack_temperature;
+    } else if (strcmp(arg[iarg],"heatflow") == 0) {
+      if (!atom->heatflow_flag)
+        error->all(FLERR,"Compute property/atom {} is not available", arg[iarg]);
+      pack_choice[i] = &ComputePropertyAtom::pack_heatflow;
    } else if (strcmp(arg[iarg],"angmomx") == 0) {
      if (!atom->angmom_flag)
        error->all(FLERR,"Compute property/atom {} is not available", arg[iarg]);
@ -1213,6 +1221,36 @@ void ComputePropertyAtom::pack_omegaz(int n)

 /* ---------------------------------------------------------------------- */

+void ComputePropertyAtom::pack_temperature(int n)
+{
+  double *temperature = atom->temperature;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+    if (mask[i] & groupbit) buf[n] = temperature[i];
+    else buf[n] = 0.0;
+    n += nvalues;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void ComputePropertyAtom::pack_heatflow(int n)
+{
+  double *heatflow = atom->heatflow;
+  int *mask = atom->mask;
+  int nlocal = atom->nlocal;
+
+  for (int i = 0; i < nlocal; i++) {
+    if (mask[i] & groupbit) buf[n] = heatflow[i];
+    else buf[n] = 0.0;
+    n += nvalues;
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
 void ComputePropertyAtom::pack_angmomx(int n)
 {
  double **angmom = atom->angmom;
--- a/src/compute_property_atom.h
+++ b/src/compute_property_atom.h
@ -95,6 +95,8 @@ class ComputePropertyAtom : public Compute {
  void pack_omegax(int);
  void pack_omegay(int);
  void pack_omegaz(int);
+  void pack_temperature(int);
+  void pack_heatflow(int);
  void pack_angmomx(int);
  void pack_angmomy(int);
  void pack_angmomz(int);
--- a/src/dump_custom.cpp
+++ b/src/dump_custom.cpp
@ -41,7 +41,7 @@ enum{ID,MOL,PROC,PROCP1,TYPE,ELEMENT,MASS,
     XSU,YSU,ZSU,XSUTRI,YSUTRI,ZSUTRI,
     IX,IY,IZ,
     VX,VY,VZ,FX,FY,FZ,
-     Q,MUX,MUY,MUZ,MU,RADIUS,DIAMETER,HEATFLOW,TEMPERATURE,
+     Q,MUX,MUY,MUZ,MU,RADIUS,DIAMETER,
     OMEGAX,OMEGAY,OMEGAZ,ANGMOMX,ANGMOMY,ANGMOMZ,
     TQX,TQY,TQZ,
     COMPUTE,FIX,VARIABLE,IVEC,DVEC,IARRAY,DARRAY};
@ -929,18 +929,6 @@ int DumpCustom::count()
        for (i = 0; i < nlocal; i++) dchoose[i] = 2.0*radius[i];
        ptr = dchoose;
        nstride = 1;
-      } else if (thresh_array[ithresh] == HEATFLOW) {
-        if (!atom->heatflow_flag)
-          error->all(FLERR,
-                     "Threshold for an atom property that isn't allocated");
-        ptr = atom->heatflow;
-        nstride = 1;
-      } else if (thresh_array[ithresh] == TEMPERATURE) {
-        if (!atom->temperature_flag)
-          error->all(FLERR,
-                     "Threshold for an atom property that isn't allocated");
-        ptr = atom->temperature;
-        nstride = 1;
      } else if (thresh_array[ithresh] == OMEGAX) {
        if (!atom->omega_flag)
          error->all(FLERR,
@ -1395,16 +1383,6 @@ int DumpCustom::parse_fields(int narg, char **arg)
        error->all(FLERR,"Dumping an atom property that isn't allocated");
      pack_choice[iarg] = &DumpCustom::pack_diameter;
      vtype[iarg] = Dump::DOUBLE;
-    } else if (strcmp(arg[iarg],"heatflow") == 0) {
-      if (!atom->heatflow_flag)
-        error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[iarg] = &DumpCustom::pack_heatflow;
-      vtype[iarg] = Dump::DOUBLE;
-    } else if (strcmp(arg[iarg],"temperature") == 0) {
-      if (!atom->temperature_flag)
-        error->all(FLERR,"Dumping an atom property that isn't allocated");
-      pack_choice[iarg] = &DumpCustom::pack_temperature;
-      vtype[iarg] = Dump::DOUBLE;
    } else if (strcmp(arg[iarg],"omegax") == 0) {
      if (!atom->omega_flag)
        error->all(FLERR,"Dumping an atom property that isn't allocated");
@ -1875,8 +1853,6 @@ int DumpCustom::modify_param(int narg, char **arg)

    else if (strcmp(arg[1],"radius") == 0) thresh_array[nthresh] = RADIUS;
    else if (strcmp(arg[1],"diameter") == 0) thresh_array[nthresh] = DIAMETER;
-    else if (strcmp(arg[1],"heatflow") == 0) thresh_array[nthresh] = HEATFLOW;
-    else if (strcmp(arg[1],"temperature") == 0) thresh_array[nthresh] = TEMPERATURE;
    else if (strcmp(arg[1],"omegax") == 0) thresh_array[nthresh] = OMEGAX;
    else if (strcmp(arg[1],"omegay") == 0) thresh_array[nthresh] = OMEGAY;
    else if (strcmp(arg[1],"omegaz") == 0) thresh_array[nthresh] = OMEGAZ;
@ -2791,30 +2767,6 @@ void DumpCustom::pack_diameter(int n)

 /* ---------------------------------------------------------------------- */

-void DumpCustom::pack_heatflow(int n)
-{
-  double *heatflow = atom->heatflow;
-
-  for (int i = 0; i < nchoose; i++) {
-    buf[n] = heatflow[clist[i]];
-    n += size_one;
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void DumpCustom::pack_temperature(int n)
-{
-  double *temperature = atom->temperature;
-
-  for (int i = 0; i < nchoose; i++) {
-    buf[n] = temperature[clist[i]];
-    n += size_one;
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
 void DumpCustom::pack_omegax(int n)
 {
  double **omega = atom->omega;
--- a/src/dump_custom.h
+++ b/src/dump_custom.h
@ -188,8 +188,6 @@ class DumpCustom : public Dump {
  void pack_mu(int);
  void pack_radius(int);
  void pack_diameter(int);
-  void pack_heatflow(int);
-  void pack_temperature(int);

  void pack_omegax(int);
  void pack_omegay(int);