From 1b7bc36505f66cfa2ccf257febd5b240b31310d8 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Sun, 10 Mar 2024 12:56:33 +0100
Subject: [PATCH 01/27] Fix variables compatibility with chunk arrays

When using variables with chunk computes that produce arrays (such as `compute chunk/atom`) the compute will not have set `size_array_rows` to the appropriate value before it has ever been called and will therefore incorrectly have thrown the error "Variable formula compute array is zero length".
---
 src/variable.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/src/variable.cpp b/src/variable.cpp
index 8124d9c4a1..2bde0e5adb 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -1657,10 +1657,6 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
 
             if (!compute->array_flag)
               print_var_error(FLERR,"Mismatched compute in variable formula",ivar);
-            if (compute->size_array_rows == 0)
-              print_var_error(FLERR,"Variable formula compute array is zero length",ivar);
-            if (index1 > compute->size_array_cols)
-              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
             if (!compute->is_initialized())
               print_var_error(FLERR,"Variable formula compute cannot be invoked before "
                               "initialization by a run",ivar);
@@ -1668,6 +1664,10 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
               compute->compute_array();
               compute->invoked_flag |= Compute::INVOKED_ARRAY;
             }
+            if (compute->size_array_rows == 0)
+              print_var_error(FLERR,"Variable formula compute array is zero length",ivar);
+            if (index1 > compute->size_array_cols)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
 
             auto newtree = new Tree();
             newtree->type = VECTORARRAY;

From 1211af65a16f06d6ba99d202b783509c362f6e9b Mon Sep 17 00:00:00 2001
From: Stan Moore <stamoor@sandia.gov>
Date: Fri, 15 Mar 2024 12:10:12 -0600
Subject: [PATCH 02/27] Fix Kokkos teamsize too large issue

---
 src/KOKKOS/pair_kokkos.h | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/pair_kokkos.h b/src/KOKKOS/pair_kokkos.h
index 54502f290c..15417d7620 100644
--- a/src/KOKKOS/pair_kokkos.h
+++ b/src/KOKKOS/pair_kokkos.h
@@ -950,6 +950,8 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
 
     static int vectorsize = 0;
     static int atoms_per_team = 0;
+    static int teamsize_max_for = 0;
+    static int teamsize_max_reduce = 0;
 
 #if defined(LMP_KOKKOS_GPU)
     static int lastcall = -1;
@@ -966,7 +968,6 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
 
       vectorsize = MIN(vectorsize,max_vectorsize);
 
-      int teamsize_max_for,teamsize_max_reduce;
       if (fpair->atom->ntypes > MAX_TYPES_STACKPARAMS) {
         PairComputeFunctor<PairStyle,NEIGHFLAG,false,ZEROFLAG,Specialisation > ff(fpair,list);
         GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
@@ -974,12 +975,12 @@ EV_FLOAT pair_compute_neighlist (PairStyle* fpair, std::enable_if_t<(NEIGHFLAG&P
         PairComputeFunctor<PairStyle,NEIGHFLAG,true,ZEROFLAG,Specialisation > ff(fpair,list);
         GetMaxTeamSize<typename PairStyle::device_type>(ff, inum, teamsize_max_for, teamsize_max_reduce);
       }
-
-      int teamsize_max = teamsize_max_for;
-      if (fpair->eflag || fpair->vflag)
-        teamsize_max = teamsize_max_reduce;
-      atoms_per_team = teamsize_max/vectorsize;
     }
+
+    int teamsize_max = teamsize_max_for;
+    if (fpair->eflag || fpair->vflag)
+      teamsize_max = teamsize_max_reduce;
+    atoms_per_team = teamsize_max/vectorsize;
 #else
     vectorsize = 1;
     atoms_per_team = 1;

From 2fe23b98f57c7248e1d326cfa443838609be415d Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 17 Mar 2024 00:04:06 -0400
Subject: [PATCH 03/27] modernize and enable clang-format

---
 src/fix_print.cpp | 103 ++++++++++++++++++++++++----------------------
 src/fix_print.h   |   2 +-
 2 files changed, 54 insertions(+), 51 deletions(-)

diff --git a/src/fix_print.cpp b/src/fix_print.cpp
index 023b9355cd..ccef03c3ae 100644
--- a/src/fix_print.cpp
+++ b/src/fix_print.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -14,6 +13,7 @@
 
 #include "fix_print.h"
 
+#include "comm.h"
 #include "error.h"
 #include "input.h"
 #include "memory.h"
@@ -29,24 +29,22 @@ using namespace FixConst;
 /* ---------------------------------------------------------------------- */
 
 FixPrint::FixPrint(LAMMPS *lmp, int narg, char **arg) :
-  Fix(lmp, narg, arg),
-  fp(nullptr), text(nullptr), copy(nullptr), work(nullptr), var_print(nullptr)
+    Fix(lmp, narg, arg), fp(nullptr), text(nullptr), copy(nullptr), work(nullptr),
+    var_print(nullptr)
 {
-  if (narg < 5) error->all(FLERR,"Illegal fix print command");
-  if (utils::strmatch(arg[3],"^v_")) {
-    var_print = utils::strdup(arg[3]+2);
+  if (narg < 5) utils::missing_cmd_args(FLERR, "fix print", error);
+  if (utils::strmatch(arg[3], "^v_")) {
+    var_print = utils::strdup(arg[3] + 2);
     nevery = 1;
   } else {
-    nevery = utils::inumeric(FLERR,arg[3],false,lmp);
-    if (nevery <= 0) error->all(FLERR,"Illegal fix print command");
+    nevery = utils::inumeric(FLERR, arg[3], false, lmp);
+    if (nevery <= 0) error->all(FLERR, "Illegal fix print nevery value {}; must be > 0", nevery);
   }
 
-  MPI_Comm_rank(world,&me);
-
   text = utils::strdup(arg[4]);
-  int n = strlen(text)+1;
-  copy = (char *) memory->smalloc(n*sizeof(char),"fix/print:copy");
-  work = (char *) memory->smalloc(n*sizeof(char),"fix/print:work");
+  int n = strlen(text) + 1;
+  copy = (char *) memory->smalloc(n * sizeof(char), "fix/print:copy");
+  work = (char *) memory->smalloc(n * sizeof(char), "fix/print:work");
   maxcopy = maxwork = n;
 
   // parse optional args
@@ -57,48 +55,54 @@ FixPrint::FixPrint(LAMMPS *lmp, int narg, char **arg) :
 
   int iarg = 5;
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"file") == 0 || strcmp(arg[iarg],"append") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix print command");
-      if (me == 0) {
-        if (strcmp(arg[iarg],"file") == 0) fp = fopen(arg[iarg+1],"w");
-        else fp = fopen(arg[iarg+1],"a");
+    if ((strcmp(arg[iarg], "file") == 0) || (strcmp(arg[iarg], "append") == 0)) {
+      if (iarg + 2 > narg)
+        utils::missing_cmd_args(FLERR, std::string("fix print ") + arg[iarg], error);
+      if (comm->me == 0) {
+        if (strcmp(arg[iarg], "file") == 0)
+          fp = fopen(arg[iarg + 1], "w");
+        else
+          fp = fopen(arg[iarg + 1], "a");
         if (fp == nullptr)
-          error->one(FLERR,"Cannot open fix print file {}: {}",
-                     arg[iarg+1], utils::getsyserror());
+          error->one(FLERR, "Cannot open fix print file {}: {}", arg[iarg + 1],
+                     utils::getsyserror());
       }
       iarg += 2;
-    } else if (strcmp(arg[iarg],"screen") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix print command");
-      screenflag = utils::logical(FLERR,arg[iarg+1],false,lmp);
+    } else if (strcmp(arg[iarg], "screen") == 0) {
+      if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix print screen", error);
+      screenflag = utils::logical(FLERR, arg[iarg + 1], false, lmp);
       iarg += 2;
-    } else if (strcmp(arg[iarg],"title") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix print command");
-      delete [] title;
-      title = utils::strdup(arg[iarg+1]);
+    } else if (strcmp(arg[iarg], "title") == 0) {
+      if (iarg + 2 > narg) utils::missing_cmd_args(FLERR, "fix print title", error);
+      delete[] title;
+      title = utils::strdup(arg[iarg + 1]);
       iarg += 2;
-    } else error->all(FLERR,"Illegal fix print command");
+    } else
+      error->all(FLERR, "Unknown fix print keyword: {}", arg[iarg]);
   }
 
   // print file comment line
 
-  if (fp && me == 0) {
-    if (title) fprintf(fp,"%s\n",title);
-    else fprintf(fp,"# Fix print output for fix %s\n",id);
+  if (fp && (comm->me == 0)) {
+    if (title)
+      fprintf(fp, "%s\n", title);
+    else
+      fprintf(fp, "# Fix print output for fix %s\n", id);
   }
 
-  delete [] title;
+  delete[] title;
 }
 
 /* ---------------------------------------------------------------------- */
 
 FixPrint::~FixPrint()
 {
-  delete [] text;
-  delete [] var_print;
+  delete[] text;
+  delete[] var_print;
   memory->sfree(copy);
   memory->sfree(work);
 
-  if (fp && me == 0) fclose(fp);
+  if (fp && (comm->me == 0)) fclose(fp);
 }
 
 /* ---------------------------------------------------------------------- */
@@ -117,16 +121,16 @@ void FixPrint::init()
   if (var_print) {
     ivar_print = input->variable->find(var_print);
     if (ivar_print < 0)
-      error->all(FLERR,"Variable name for fix print timestep does not exist");
+      error->all(FLERR, "Variable {} for fix print timestep does not exist", var_print);
     if (!input->variable->equalstyle(ivar_print))
-      error->all(FLERR,"Variable for fix print timestep is invalid style");
-    next_print = static_cast<bigint>
-      (input->variable->compute_equal(ivar_print));
+      error->all(FLERR, "Variable {} for fix print timestep is invalid style", var_print);
+    next_print = static_cast<bigint>(input->variable->compute_equal(ivar_print));
     if (next_print <= update->ntimestep)
-      error->all(FLERR,"Fix print timestep variable returned a bad timestep");
+      error->all(FLERR, "Fix print timestep variable {} returned a bad timestep: {}", var_print,
+                 next_print);
   } else {
     if (update->ntimestep % nevery)
-      next_print = (update->ntimestep/nevery)*nevery + nevery;
+      next_print = (update->ntimestep / nevery) * nevery + nevery;
     else
       next_print = update->ntimestep;
   }
@@ -158,24 +162,23 @@ void FixPrint::end_of_step()
 
   modify->clearstep_compute();
 
-  strncpy(copy,text,maxcopy);
-  input->substitute(copy,work,maxcopy,maxwork,0);
+  strncpy(copy, text, maxcopy);
+  input->substitute(copy, work, maxcopy, maxwork, 0);
 
   if (var_print) {
-    next_print = static_cast<bigint>
-      (input->variable->compute_equal(ivar_print));
+    next_print = static_cast<bigint>(input->variable->compute_equal(ivar_print));
     if (next_print <= update->ntimestep)
-      error->all(FLERR,"Fix print timestep variable returned a bad timestep");
+      error->all(FLERR, "Fix print timestep variable returned a bad timestep: {}", next_print);
   } else {
-    next_print = (update->ntimestep/nevery)*nevery + nevery;
+    next_print = (update->ntimestep / nevery) * nevery + nevery;
   }
 
   modify->addstep_compute(next_print);
 
-  if (me == 0) {
-    if (screenflag) utils::logmesg(lmp,std::string(copy) + "\n");
+  if (comm->me == 0) {
+    if (screenflag) utils::logmesg(lmp, std::string(copy) + "\n");
     if (fp) {
-      fmt::print(fp,"{}\n",copy);
+      fmt::print(fp, "{}\n", copy);
       fflush(fp);
     }
   }
diff --git a/src/fix_print.h b/src/fix_print.h
index 48eda897b5..9e699e22ba 100644
--- a/src/fix_print.h
+++ b/src/fix_print.h
@@ -34,7 +34,7 @@ class FixPrint : public Fix {
   void end_of_step() override;
 
  private:
-  int me, screenflag;
+  int screenflag;
   FILE *fp;
   char *text, *copy, *work;
   int maxcopy, maxwork;

From fce15bf66f52dc86c1f61dd60f3a95d716fe94d4 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 17 Mar 2024 01:17:01 -0400
Subject: [PATCH 04/27] add support for appending to files to fix ave/time

---
 doc/src/fix_ave_time.rst | 29 ++++++++++++++++++-----------
 src/fix_ave_time.cpp     |  8 +++++---
 2 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/doc/src/fix_ave_time.rst b/doc/src/fix_ave_time.rst
index aa82e676ea..ea6a6dc2a9 100644
--- a/doc/src/fix_ave_time.rst
+++ b/doc/src/fix_ave_time.rst
@@ -28,7 +28,7 @@ Syntax
        v_name[I] = value calculated by a vector-style variable with name, I can include wildcard (see below)
 
 * zero or more keyword/arg pairs may be appended
-* keyword = *mode* or *file* or *ave* or *start* or *off* or *overwrite* or *format* or *title1* or *title2* or *title3*
+* keyword = *mode* or *file* or *append* or *ave* or *start* or *off* or *overwrite* or *format* or *title1* or *title2* or *title3*
 
   .. parsed-literal::
 
@@ -45,6 +45,8 @@ Syntax
          M = value # from 1 to Nvalues
        *file* arg = filename
          filename = name of file to output time averages to
+       *append* arg = filename
+         filename = name of file to append time averages to
        *overwrite* arg = none = overwrite output file with only latest output
        *format* arg = string
          string = C-style format string
@@ -270,16 +272,21 @@ are effectively constant or are simply current values (e.g., they are
 being written to a file with other time-averaged values for purposes
 of creating well-formatted output).
 
-The *file* keyword allows a filename to be specified.  Every *Nfreq*
-steps, one quantity or vector of quantities is written to the file for
-each input value specified in the fix ave/time command.  For *mode* =
-scalar, this means a single line is written each time output is
-performed.  Thus the file ends up to be a series of lines, i.e. one
-column of numbers for each input value.  For *mode* = vector, an array
-of numbers is written each time output is performed.  The number of rows
-is the length of the input vectors, and the number of columns is the
-number of values.  Thus the file ends up to be a series of these array
-sections.
+.. versionadded:: TBD
+   new keyword *append*
+
+The *file* or *append* keywords allow a filename to be specified.  If
+*file* is used, then the filename is overwritten if it already exists.
+If *append* is used, then the filename is appended to if it already
+exists, or created if it does not exist.  Every *Nfreq* steps, one
+quantity or vector of quantities is written to the file for each input
+value specified in the fix ave/time command.  For *mode* = scalar, this
+means a single line is written each time output is performed.  Thus the
+file ends up to be a series of lines, i.e. one column of numbers for
+each input value.  For *mode* = vector, an array of numbers is written
+each time output is performed.  The number of rows is the length of the
+input vectors, and the number of columns is the number of values.  Thus
+the file ends up to be a series of these array sections.
 
 .. versionadded:: 4May2022
 
diff --git a/src/fix_ave_time.cpp b/src/fix_ave_time.cpp
index f6ba0ad0e6..908a21f748 100644
--- a/src/fix_ave_time.cpp
+++ b/src/fix_ave_time.cpp
@@ -1033,11 +1033,13 @@ void FixAveTime::options(int iarg, int narg, char **arg)
   // optional args
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"file") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+    if ((strcmp(arg[iarg],"file") == 0) || (strcmp(arg[iarg],"append") == 0)) {
+      if (iarg+2 > narg)
+        utils::missing_cmd_args(FLERR, std::string("fix ave/time ")+arg[iarg], error);
       yaml_flag = utils::strmatch(arg[iarg+1],"\\.[yY][aA]?[mM][lL]$");
       if (comm->me == 0) {
-        fp = fopen(arg[iarg+1],"w");
+        if (strcmp(arg[iarg],"file") == 0) fp = fopen(arg[iarg+1],"w");
+        else fp = fopen(arg[iarg+1],"a");
         if (fp == nullptr)
           error->one(FLERR,"Cannot open fix ave/time file {}: {}",
                      arg[iarg+1], utils::getsyserror());

From 866c059d2d8e7fd0dd39ab2a51860eab2fc9aeca Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 17 Mar 2024 01:17:11 -0400
Subject: [PATCH 05/27] improve error messages

---
 src/fix_ave_time.cpp | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/src/fix_ave_time.cpp b/src/fix_ave_time.cpp
index 908a21f748..5219a4de3d 100644
--- a/src/fix_ave_time.cpp
+++ b/src/fix_ave_time.cpp
@@ -1046,30 +1046,31 @@ void FixAveTime::options(int iarg, int narg, char **arg)
       }
       iarg += 2;
     } else if (strcmp(arg[iarg],"ave") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time ave", error);
       if (strcmp(arg[iarg+1],"one") == 0) ave = ONE;
       else if (strcmp(arg[iarg+1],"running") == 0) ave = RUNNING;
       else if (strcmp(arg[iarg+1],"window") == 0) ave = WINDOW;
-      else error->all(FLERR,"Illegal fix ave/time command");
+      else error->all(FLERR,"Unknown fix ave/time ave keyword {}", arg[iarg+1]);
       if (ave == WINDOW) {
-        if (iarg+3 > narg) error->all(FLERR,"Illegal fix ave/time command");
+        if (iarg+3 > narg) utils::missing_cmd_args(FLERR, "fix ave/time ave window", error);
         nwindow = utils::inumeric(FLERR,arg[iarg+2],false,lmp);
-        if (nwindow <= 0) error->all(FLERR,"Illegal fix ave/time command");
+        if (nwindow <= 0)
+          error->all(FLERR,"Illegal fix ave/time ave window argument {}; must be > 0", nwindow);
       }
       iarg += 2;
       if (ave == WINDOW) iarg++;
     } else if (strcmp(arg[iarg],"start") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time start", error);
       startstep = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
     } else if (strcmp(arg[iarg],"mode") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time mode", error);
       if (strcmp(arg[iarg+1],"scalar") == 0) mode = SCALAR;
       else if (strcmp(arg[iarg+1],"vector") == 0) mode = VECTOR;
-      else error->all(FLERR,"Illegal fix ave/time command");
+      else error->all(FLERR,"Unknown fix ave/time mode {}", arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"off") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time off", error);
       memory->grow(offlist,noff+1,"ave/time:offlist");
       offlist[noff++] = utils::inumeric(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
@@ -1077,27 +1078,27 @@ void FixAveTime::options(int iarg, int narg, char **arg)
       overwrite = 1;
       iarg += 1;
     } else if (strcmp(arg[iarg],"format") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time format", error);
       delete[] format_user;
       format_user = utils::strdup(arg[iarg+1]);
       format = format_user;
       iarg += 2;
     } else if (strcmp(arg[iarg],"title1") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time title1", error);
       delete[] title1;
       title1 = utils::strdup(arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"title2") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time title2", error);
       delete[] title2;
       title2 = utils::strdup(arg[iarg+1]);
       iarg += 2;
     } else if (strcmp(arg[iarg],"title3") == 0) {
-      if (iarg+2 > narg) error->all(FLERR,"Illegal fix ave/time command");
+      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/time title3", error);
       delete[] title3;
       title3 = utils::strdup(arg[iarg+1]);
       iarg += 2;
-    } else error->all(FLERR,"Unknown fix ave/time command option {}", arg[iarg]);
+    } else error->all(FLERR,"Unknown fix ave/time keyword {}", arg[iarg]);
   }
 }
 

From 1bbe87d9d2527909bc3f9aec81ebdb4bac0a7b07 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 17 Mar 2024 15:13:04 -0400
Subject: [PATCH 06/27] add 'append' keyword for appending to output file

---
 doc/src/fix_ave_histo.rst | 36 ++++++++++++++++++++++--------------
 src/fix_ave_histo.cpp     |  8 +++++---
 2 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/doc/src/fix_ave_histo.rst b/doc/src/fix_ave_histo.rst
index 9699e4238c..60aeb632fb 100644
--- a/doc/src/fix_ave_histo.rst
+++ b/doc/src/fix_ave_histo.rst
@@ -35,7 +35,7 @@ Syntax
        v_name[I] = value calculated by a vector-style variable with name, I can include wildcard (see below)
 
 * zero or more keyword/arg pairs may be appended
-* keyword = *mode* or *kind* or *file* or *ave* or *start* or *beyond* or *overwrite* or *title1* or *title2* or *title3*
+* keyword = *mode* or *kind* or *file* or *append* or *ave* or *start* or *beyond* or *overwrite* or *title1* or *title2* or *title3*
 
   .. parsed-literal::
 
@@ -45,6 +45,8 @@ Syntax
        *kind* arg = *global* or *peratom* or *local*
        *file* arg = filename
          filename = name of file to output histogram(s) to
+       *append* arg = filename
+         filename = name of file to append time averages to
        *ave* args = *one* or *running* or *window*
          one = output a new average value every Nfreq steps
          running = output cumulative average of all previous Nfreq steps
@@ -317,19 +319,25 @@ on.  The default is step 0.  Often input values can be 0.0 at time 0,
 so setting *start* to a larger value can avoid including a 0.0 in
 a running or windowed histogram.
 
-The *file* keyword allows a filename to be specified.  Every *Nfreq*
-steps, one histogram is written to the file.  This includes a leading
-line that contains the timestep, number of bins, the total count of
-values contributing to the histogram, the count of values that were
-not histogrammed (see the *beyond* keyword), the minimum value
-encountered, and the maximum value encountered.  The min/max values
-include values that were not histogrammed.  Following the leading
-line, one line per bin is written into the file.  Each line contains
-the bin #, the coordinate for the center of the bin (between *lo* and
-*hi*\ ), the count of values in the bin, and the normalized count.  The
-normalized count is the bin count divided by the total count (not
-including values not histogrammed), so that the normalized values sum
-to 1.0 across all bins.
+.. versionadded:: TBD
+   new keyword *append*
+
+The *file* or *append* keywords allow a filename to be specified.  If
+*file* is used, then the filename is overwritten if it already exists.
+If *append* is used, then the filename is appended to if it already
+exists, or created if it does not exist.  Every *Nfreq* steps, one
+histogram is written to the file.  This includes a leading line that
+contains the timestep, number of bins, the total count of values
+contributing to the histogram, the count of values that were not
+histogrammed (see the *beyond* keyword), the minimum value encountered,
+and the maximum value encountered.  The min/max values include values
+that were not histogrammed.  Following the leading line, one line per
+bin is written into the file.  Each line contains the bin #, the
+coordinate for the center of the bin (between *lo* and *hi*\ ), the
+count of values in the bin, and the normalized count.  The normalized
+count is the bin count divided by the total count (not including values
+not histogrammed), so that the normalized values sum to 1.0 across all
+bins.
 
 The *overwrite* keyword will continuously overwrite the output file
 with the latest output, so that it only contains one timestep worth of
diff --git a/src/fix_ave_histo.cpp b/src/fix_ave_histo.cpp
index a92efcdacd..b3ca9e1106 100644
--- a/src/fix_ave_histo.cpp
+++ b/src/fix_ave_histo.cpp
@@ -839,10 +839,12 @@ void FixAveHisto::options(int iarg, int narg, char **arg)
   auto mycmd = fmt::format("fix {}", style);
 
   while (iarg < narg) {
-    if (strcmp(arg[iarg],"file") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, mycmd + " file", error);
+    if ((strcmp(arg[iarg],"file") == 0) || (strcmp(arg[iarg],"append") == 0)) {
+      if (iarg+2 > narg)
+        utils::missing_cmd_args(FLERR, std::string("fix ave/histo ")+arg[iarg], error);
       if (comm->me == 0) {
-        fp = fopen(arg[iarg+1],"w");
+        if (strcmp(arg[iarg],"file") == 0) fp = fopen(arg[iarg+1],"w");
+        else fp = fopen(arg[iarg+1],"a");
         if (fp == nullptr)
           error->one(FLERR, "Cannot open fix ave/histo file {}: {}",
                      arg[iarg+1], utils::getsyserror());

From d97c7fffac66f26485b8c0c2fc87df5eb05dcebe Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 17 Mar 2024 15:13:13 -0400
Subject: [PATCH 07/27] spelling

---
 doc/src/fix_electrode.rst                   | 29 +++++++++++----------
 doc/src/processors.rst                      | 26 +++++++++---------
 doc/utils/sphinx-config/false_positives.txt |  1 +
 3 files changed, 29 insertions(+), 27 deletions(-)

diff --git a/doc/src/fix_electrode.rst b/doc/src/fix_electrode.rst
index 8a7a44454d..692b187841 100644
--- a/doc/src/fix_electrode.rst
+++ b/doc/src/fix_electrode.rst
@@ -255,23 +255,24 @@ and the fix will issue an error in that case.
 
 .. versionadded:: TBD
 
-The keyword *qtotal* causes *fix electrode/conp* and *fix electrode/thermo*
-to add an overall potential to all electrodes so that the total charge on
-the electrodes is a specified amount (which may be an equal-style variable).
-For example, if a user wanted to simulate a solution of excess cations
-such that the total electrolyte charge is +2, setting *qtotal -2* would cause
-the total electrode charge to be -2, so that the simulation box remains overall
-electroneutral. Since *fix electrode/conq* constrains the total charges of
-individual electrodes, and since *symm on* constrains the total charge of all
-electrodes to be zero, either option is incompatible with the *qtotal* keyword
-(even if *qtotal* is set to zero).
+The keyword *qtotal* causes *fix electrode/conp* and *fix
+electrode/thermo* to add an overall potential to all electrodes so that
+the total charge on the electrodes is a specified amount (which may be
+an equal-style variable).  For example, if a user wanted to simulate a
+solution of excess cations such that the total electrolyte charge is +2,
+setting *qtotal -2* would cause the total electrode charge to be -2, so
+that the simulation box remains overall electroneutral. Since *fix
+electrode/conq* constrains the total charges of individual electrodes,
+and since *symm on* constrains the total charge of all electrodes to be
+zero, either option is incompatible with the *qtotal* keyword (even if
+*qtotal* is set to zero).
 
 .. versionadded:: TBD
 
-The keyword *eta* takes the name of a custom double vector defined via fix
-property/atom.  The values will be used instead of the standard eta value.  The
-property/atom fix must be for vector of double values and use the *ghost on*
-option.
+The keyword *eta* takes the name of a custom double vector defined via
+fix property/atom.  The values will be used instead of the standard eta
+value.  The property/atom fix must be for vector of double values and
+use the *ghost on* option.
 
 Restart, fix_modify, output, run start/stop, minimize info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
diff --git a/doc/src/processors.rst b/doc/src/processors.rst
index a11febb1c2..f909790304 100644
--- a/doc/src/processors.rst
+++ b/doc/src/processors.rst
@@ -159,17 +159,17 @@ surface-to-volume ratio of each processor's subdomain.
    for most MPI implementations, but some MPIs provide options for this
    ordering, e.g. via environment variable settings.
 
-The *numa* style operates similar to the *twolevel* keyword except
-that it auto-detects which cores are running on which nodes.
-It will also subdivide the cores into numa domains. Currently, the
-number of numa domains is not autodetected and must be specified using
-the *numa_nodes* keyword; otherwise, the default value is used. The
-*numa* style uses a different algorithm than the *twolevel* keyword for
-doing the two-level factorization of the simulation box into a 3d
-processor grid to minimize off-node communication and communication
-across numa domains. It does its own MPI-based mapping of nodes and
-cores to the regular 3d grid.  Thus it may produce a different layout
-of the processors than the *twolevel* options.
+The *numa* style operates similar to the *twolevel* keyword except that
+it auto-detects which cores are running on which nodes.  It will also
+subdivide the cores into numa domains. Currently, the number of numa
+domains is not auto-detected and must be specified using the
+*numa_nodes* keyword; otherwise, the default value is used. The *numa*
+style uses a different algorithm than the *twolevel* keyword for doing
+the two-level factorization of the simulation box into a 3d processor
+grid to minimize off-node communication and communication across numa
+domains. It does its own MPI-based mapping of nodes and cores to the
+regular 3d grid.  Thus it may produce a different layout of the
+processors than the *twolevel* options.
 
 The *numa* style will give an error if the number of MPI processes is
 not divisible by the number of cores used per node, or any of the Px
@@ -182,7 +182,7 @@ or Py or Pz values is greater than 1.
    is because it auto-detects which processes are running on which nodes.
    However, it assumes that the lowest ranks are in the first numa
    domain, and so forth. MPI rank orderings that do not preserve this
-   property might result in more intranode communication between CPUs.
+   property might result in more intra-node communication between CPUs.
 
 The *custom* style uses the file *infile* to define both the 3d
 factorization and the mapping of processors to the grid.
@@ -213,7 +213,7 @@ any order, but no processor ID should appear more than once.
 
 ----------
 
-The *numa_nodes* keyword is used to specifiy the number of numa domains
+The *numa_nodes* keyword is used to specify the number of numa domains
 per node. It is currently only used by the *numa* style for two-level
 factorization to reduce the amount of MPI communications between CPUs.
 A good setting for this will typically be equal to the number of CPU
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index 030c80d30c..04ea69575a 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -947,6 +947,7 @@ elastance
 Electroneg
 electronegative
 electronegativity
+electroneutral
 electroneutrality
 Eleftheriou
 ElementN

From 505f7b3cb4b64c351aa1b03b0bde288e18993649 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 17 Mar 2024 15:34:32 -0400
Subject: [PATCH 08/27] add 'append' keyword for appending to output file

---
 doc/src/fix_ave_chunk.rst | 28 ++++++++++++++++++----------
 doc/src/fix_ave_histo.rst |  2 +-
 src/fix_ave_chunk.cpp     |  8 +++++---
 3 files changed, 24 insertions(+), 14 deletions(-)

diff --git a/doc/src/fix_ave_chunk.rst b/doc/src/fix_ave_chunk.rst
index adbfb43d72..57ce52f7c7 100644
--- a/doc/src/fix_ave_chunk.rst
+++ b/doc/src/fix_ave_chunk.rst
@@ -31,7 +31,7 @@ Syntax
        v_name = per-atom vector calculated by an atom-style variable with name
 
 * zero or more keyword/arg pairs may be appended
-* keyword = *norm* or *ave* or *bias* or *adof* or *cdof* or *file* or *overwrite* or *format* or *title1* or *title2* or *title3*
+* keyword = *norm* or *ave* or *bias* or *adof* or *cdof* or *file* or *append* or *overwrite* or *format* or *title1* or *title2* or *title3*
 
   .. parsed-literal::
 
@@ -51,6 +51,8 @@ Syntax
          dof_per_chunk = define this many degrees-of-freedom per chunk for temperature calculation
        *file* arg = filename
          filename = file to write results to
+       *append* arg = filename
+         filename = file to append results to
        *overwrite* arg = none = overwrite output file with only latest output
        *format* arg = string
          string = C-style format string
@@ -433,15 +435,21 @@ molecule.
 
 ----------
 
-The *file* keyword allows a filename to be specified.  Every
-:math:`N_\text{freq}` timesteps, a section of chunk info will be written to a
-text file in the following format.  A line with the timestep and number of
-chunks is written.  Then one line per chunk is written, containing the chunk
-ID :math:`(1-N_\text{chunk}),` an optional original ID value, optional
-coordinate values for chunks that represent spatial bins, the number of atoms
-in the chunk, and one or more calculated values.  More explanation of the
-optional values is given below.  The number of values in each line
-corresponds to the number of values specified in the fix ave/chunk
+.. versionadded:: TBD
+   new keyword *append*
+
+The *file* or *append* keywords allow a filename to be specified.  If
+*file* is used, then the filename is overwritten if it already exists.
+If *append* is used, then the filename is appended to if it already
+exists, or created if it does not exist.  Every :math:`N_\text{freq}`
+timesteps, a section of chunk info will be written to a text file in the
+following format.  A line with the timestep and number of chunks is
+written.  Then one line per chunk is written, containing the chunk ID
+:math:`(1-N_\text{chunk}),` an optional original ID value, optional
+coordinate values for chunks that represent spatial bins, the number of
+atoms in the chunk, and one or more calculated values.  More explanation
+of the optional values is given below.  The number of values in each
+line corresponds to the number of values specified in the fix ave/chunk
 command.  The number of atoms and the value(s) are summed or average
 quantities, as explained above.
 
diff --git a/doc/src/fix_ave_histo.rst b/doc/src/fix_ave_histo.rst
index 60aeb632fb..b9ecc31cec 100644
--- a/doc/src/fix_ave_histo.rst
+++ b/doc/src/fix_ave_histo.rst
@@ -46,7 +46,7 @@ Syntax
        *file* arg = filename
          filename = name of file to output histogram(s) to
        *append* arg = filename
-         filename = name of file to append time averages to
+         filename = name of file to append histogram(s) to
        *ave* args = *one* or *running* or *window*
          one = output a new average value every Nfreq steps
          running = output cumulative average of all previous Nfreq steps
diff --git a/src/fix_ave_chunk.cpp b/src/fix_ave_chunk.cpp
index a0d25cf2c7..9382b6d38e 100644
--- a/src/fix_ave_chunk.cpp
+++ b/src/fix_ave_chunk.cpp
@@ -193,10 +193,12 @@ FixAveChunk::FixAveChunk(LAMMPS *lmp, int narg, char **arg) :
       cdof = utils::numeric(FLERR,arg[iarg+1],false,lmp);
       iarg += 2;
 
-    } else if (strcmp(arg[iarg],"file") == 0) {
-      if (iarg+2 > narg) utils::missing_cmd_args(FLERR, "fix ave/chunk file", error);
+    } else if ((strcmp(arg[iarg],"file") == 0) || (strcmp(arg[iarg],"append") == 0)) {
+      if (iarg+2 > narg)
+        utils::missing_cmd_args(FLERR, std::string("fix ave/chunk ")+arg[iarg], error);
       if (comm->me == 0) {
-        fp = fopen(arg[iarg+1],"w");
+        if (strcmp(arg[iarg],"file") == 0) fp = fopen(arg[iarg+1],"w");
+        else fp = fopen(arg[iarg+1],"a");
         if (fp == nullptr)
           error->one(FLERR, "Cannot open fix ave/chunk file {}: {}",
                      arg[iarg+1], utils::getsyserror());

From 41ee1efa1374b693239e7c1870b3e39f5760fbfd Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 18 Mar 2024 10:27:38 -0400
Subject: [PATCH 09/27] fix bug in f2c string conversion detected by bound
 checking

---
 fortran/lammps.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fortran/lammps.f90 b/fortran/lammps.f90
index d0133f075c..c297bad2ef 100644
--- a/fortran/lammps.f90
+++ b/fortran/lammps.f90
@@ -3687,7 +3687,7 @@ CONTAINS
 
     n = LEN_TRIM(f_string)
     ptr = lammps_malloc(n+1)
-    CALL C_F_POINTER(ptr, c_string, [1])
+    CALL C_F_POINTER(ptr, c_string, [n+1])
     DO i=1, n
         c_string(i) = f_string(i:i)
     END DO

From 03bbc562ad4da3c3ac265631260c83207ce05a99 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 18 Mar 2024 10:46:43 -0400
Subject: [PATCH 10/27] improve error messages for invalid hybrid sub-styles

---
 src/angle_hybrid.cpp        | 2 +-
 src/bond_hybrid.cpp         | 2 +-
 src/dihedral_hybrid.cpp     | 3 ++-
 src/improper_hybrid.cpp     | 3 ++-
 src/pair_hybrid.cpp         | 6 ++++--
 src/pair_hybrid_overlay.cpp | 6 ++++--
 src/pair_hybrid_scaled.cpp  | 2 +-
 7 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/src/angle_hybrid.cpp b/src/angle_hybrid.cpp
index e79776b0d2..0c61970a1f 100644
--- a/src/angle_hybrid.cpp
+++ b/src/angle_hybrid.cpp
@@ -270,7 +270,7 @@ void AngleHybrid::coeff(int narg, char **arg)
     else if (strcmp(arg[1], "bb") == 0)
       error->all(FLERR, "BondBond coeff for hybrid angle has invalid format");
     else
-      error->all(FLERR, "Angle coeff for hybrid has invalid style");
+      error->all(FLERR, "Expected hybrid sub-style instead of {} in angle_coeff command", arg[1]);
   }
 
   // move 1st arg to 2nd arg
diff --git a/src/bond_hybrid.cpp b/src/bond_hybrid.cpp
index 401358dda0..6e5ae8d5e7 100644
--- a/src/bond_hybrid.cpp
+++ b/src/bond_hybrid.cpp
@@ -305,7 +305,7 @@ void BondHybrid::coeff(int narg, char **arg)
     if (strcmp(arg[1], "none") == 0)
       none = 1;
     else
-      error->all(FLERR, "Bond coeff for hybrid has invalid style");
+      error->all(FLERR, "Expected hybrid sub-style instead of {} in bond_coeff command", arg[1]);
   }
 
   // move 1st arg to 2nd arg
diff --git a/src/dihedral_hybrid.cpp b/src/dihedral_hybrid.cpp
index 9da4df1f68..4ee0ffdad9 100644
--- a/src/dihedral_hybrid.cpp
+++ b/src/dihedral_hybrid.cpp
@@ -277,7 +277,8 @@ void DihedralHybrid::coeff(int narg, char **arg)
     else if (strcmp(arg[1], "bb13") == 0)
       error->all(FLERR, "BondBond13 coeff for hybrid dihedral has invalid format");
     else
-      error->all(FLERR, "Dihedral coeff for hybrid has invalid style");
+      error->all(FLERR, "Expected hybrid sub-style instead of {} in dihedral_coeff command",
+                 arg[1]);
   }
 
   // move 1st arg to 2nd arg
diff --git a/src/improper_hybrid.cpp b/src/improper_hybrid.cpp
index 505488cce6..a847b7bc95 100644
--- a/src/improper_hybrid.cpp
+++ b/src/improper_hybrid.cpp
@@ -269,7 +269,8 @@ void ImproperHybrid::coeff(int narg, char **arg)
     else if (strcmp(arg[1], "aa") == 0)
       error->all(FLERR, "AngleAngle coeff for hybrid improper has invalid format");
     else
-      error->all(FLERR, "Improper coeff for hybrid has invalid style");
+      error->all(FLERR, "Expected hybrid sub-style instead of {} in improper_coeff command",
+                 arg[1]);
   }
 
   // move 1st arg to 2nd arg
diff --git a/src/pair_hybrid.cpp b/src/pair_hybrid.cpp
index f05a201e33..d257973617 100644
--- a/src/pair_hybrid.cpp
+++ b/src/pair_hybrid.cpp
@@ -521,8 +521,10 @@ void PairHybrid::coeff(int narg, char **arg)
 
   int none = 0;
   if (m == nstyles) {
-    if (strcmp(arg[2],"none") == 0) none = 1;
-    else error->all(FLERR,"Pair coeff for hybrid has invalid style: {}", arg[2]);
+    if (strcmp(arg[2],"none") == 0)
+      none = 1;
+    else
+      error->all(FLERR,"Expected hybrid sub-style instead of {} in pair_coeff command", arg[2]);
   }
 
   // move 1st/2nd args to 2nd/3rd args
diff --git a/src/pair_hybrid_overlay.cpp b/src/pair_hybrid_overlay.cpp
index 118403d345..4211d0e5cb 100644
--- a/src/pair_hybrid_overlay.cpp
+++ b/src/pair_hybrid_overlay.cpp
@@ -59,8 +59,10 @@ void PairHybridOverlay::coeff(int narg, char **arg)
 
   int none = 0;
   if (m == nstyles) {
-    if (strcmp(arg[2],"none") == 0) none = 1;
-    else error->all(FLERR,"Pair coeff for hybrid has invalid style: {}", arg[2]);
+    if (strcmp(arg[2],"none") == 0)
+      none = 1;
+    else
+      error->all(FLERR,"Expected hybrid sub-style instead of {} in pair_coeff command", arg[2]);
   }
 
   // move 1st/2nd args to 2nd/3rd args
diff --git a/src/pair_hybrid_scaled.cpp b/src/pair_hybrid_scaled.cpp
index 69ff037e4a..e897c784c7 100644
--- a/src/pair_hybrid_scaled.cpp
+++ b/src/pair_hybrid_scaled.cpp
@@ -516,7 +516,7 @@ void PairHybridScaled::coeff(int narg, char **arg)
     if (strcmp(arg[2], "none") == 0)
       none = 1;
     else
-      error->all(FLERR, "Pair coeff for hybrid has invalid style: {}", arg[2]);
+      error->all(FLERR, "Expected hybrid sub-style instead of {} in pair_coeff command", arg[2]);
   }
 
   // move 1st/2nd args to 2nd/3rd args

From 8d4a384f342043b9e76ab6395e99e136ba88f205 Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Mon, 18 Mar 2024 09:28:53 -0600
Subject: [PATCH 11/27] Improve cuFFT detection in CMake, similar to HIP

---
 cmake/Modules/Packages/KOKKOS.cmake | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake
index 9324ea95c4..e74893d0d0 100644
--- a/cmake/Modules/Packages/KOKKOS.cmake
+++ b/cmake/Modules/Packages/KOKKOS.cmake
@@ -139,12 +139,9 @@ if(PKG_KSPACE)
       message(WARNING "Using KISS FFT with the CUDA backend of Kokkos may be sub-optimal.")
       target_compile_definitions(lammps PRIVATE -DFFT_KOKKOS_KISS)
     elseif(FFT_KOKKOS STREQUAL "CUFFT")
-      find_library(CUFFT_LIBRARY cufft)
-      if (CUFFT_LIBRARY STREQUAL "CUFFT_LIBRARY-NOTFOUND")
-        message(FATAL_ERROR "Required cuFFT library not found. Check your environment or set CUFFT_LIBRARY to its location")
-      endif()
+      find_package(CUDAToolkit REQUIRED)
       target_compile_definitions(lammps PRIVATE -DFFT_KOKKOS_CUFFT)
-      target_link_libraries(lammps PRIVATE ${CUFFT_LIBRARY})
+      target_link_libraries(lammps PRIVATE CUDA::cufft)
     endif()
   elseif(Kokkos_ENABLE_HIP)
     if(NOT ((FFT_KOKKOS STREQUAL "KISS") OR (FFT_KOKKOS STREQUAL "HIPFFT")))

From 6a28e8d5f6c2357dfe9ab8e68a99def27c69dd65 Mon Sep 17 00:00:00 2001
From: Trung Nguyen <ndactrung@gmail.com>
Date: Mon, 18 Mar 2024 13:27:21 -0500
Subject: [PATCH 12/27] Fixed bugs with sph gpu pair styles

---
 lib/gpu/lal_sph_heatconduction.cu       | 22 ++++-----
 lib/gpu/lal_sph_lj.cu                   | 45 ++++++++---------
 lib/gpu/lal_sph_taitwater.cu            | 64 +++++++++++--------------
 src/GPU/pair_sph_heatconduction_gpu.cpp |  4 +-
 src/GPU/pair_sph_lj_gpu.cpp             | 28 +++++------
 src/GPU/pair_sph_taitwater_gpu.cpp      | 48 +++++++++++++------
 6 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/lib/gpu/lal_sph_heatconduction.cu b/lib/gpu/lal_sph_heatconduction.cu
index e2ba40db0c..8e4ec6ff19 100644
--- a/lib/gpu/lal_sph_heatconduction.cu
+++ b/lib/gpu/lal_sph_heatconduction.cu
@@ -29,23 +29,23 @@ _texture_2d( vel_tex,int4);
 
 #if (SHUFFLE_AVAIL == 0)
 
-#define store_dE(dEacc, ii, inum, tid, t_per_atom, offset, dE)              \
+#define store_dE(dEacc, ii, inum, tid, t_per_atom, offset, i, dE)           \
   if (t_per_atom>1) {                                                       \
     simdsync();                                                             \
     simd_reduce_add1(t_per_atom, red_acc, offset, tid, dEacc);              \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    dE[ii]=dEacc;                                                           \
+    dE[i]=dEacc;                                                            \
   }
 #else
-#define store_drhoE(dEacc, ii, inum, tid, t_per_atom, offset, dE)           \
+#define store_drhoE(dEacc, ii, inum, tid, t_per_atom, offset, i, dE)        \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       dEacc += shfl_down(dEacc, s, t_per_atom);                             \
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    dE[ii]=dEacc;                                                           \
+    dE[i]=dEacc;                                                           \
   }
 #endif
 
@@ -66,7 +66,7 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_,
                        const int inum, const int nbor_pitch,
                        const __global numtyp4 *restrict v_,
                        const int dimension, const int t_per_atom) {
-  int tid, ii, offset;
+  int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
 
   int n_stride;
@@ -77,7 +77,7 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_,
   acctyp dEacc = (acctyp)0;
 
   if (ii<inum) {
-    int i, numj, nbor, nbor_end;
+    int numj, nbor, nbor_end;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -140,7 +140,7 @@ __kernel void k_sph_heatconduction(const __global numtyp4 *restrict x_,
     } // for nbor
   } // if ii
 
-  store_drhoE(dEacc,ii,inum,tid,t_per_atom,offset,dE);
+  store_drhoE(dEacc,ii,inum,tid,t_per_atom,offset,i,dE);
 }
 
 __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_,
@@ -157,7 +157,7 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_,
                             const int inum, const int nbor_pitch,
                             const __global numtyp4 *restrict v_,
                             const int dimension, const int t_per_atom) {
-  int tid, ii, offset;
+  int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
 
   #ifndef ONETYPE
@@ -180,7 +180,7 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_,
   acctyp dEacc = (acctyp)0;
 
   if (ii<inum) {
-    int i, numj, nbor, nbor_end;
+    int numj, nbor, nbor_end;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -204,7 +204,7 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_,
       #endif
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype = jx.w;
+      int jtype=jx.w;
       #ifndef ONETYPE
       int mtype=itype+jx.w;
       const numtyp cutsq_p=coeff[mtype].z;
@@ -252,6 +252,6 @@ __kernel void k_sph_heatconduction_fast(const __global numtyp4 *restrict x_,
     } // for nbor
   } // if ii
 
-  store_drhoE(dEacc,ii,inum,tid,t_per_atom,offset,dE);
+  store_drhoE(dEacc,ii,inum,tid,t_per_atom,offset,i,dE);
 }
 
diff --git a/lib/gpu/lal_sph_lj.cu b/lib/gpu/lal_sph_lj.cu
index f376dfc533..ae8e5aecb5 100644
--- a/lib/gpu/lal_sph_lj.cu
+++ b/lib/gpu/lal_sph_lj.cu
@@ -29,17 +29,18 @@ _texture_2d( vel_tex,int4);
 
 #if (SHUFFLE_AVAIL == 0)
 
-#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE)      \
-  if (t_per_atom>1) {                                                        \
-    simdsync();                                                              \
-    simd_reduce_add2(t_per_atom, red_acc, offset, tid,                       \
-                     drhoEacc.x, drhoEacc.y);                                \
-  }                                                                          \
-  if (offset==0 && ii<inum) {                                                \
-    drhoE[ii]=drhoEacc;                                                      \
+#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, i, drhoE)  \
+  if (t_per_atom>1) {                                                       \
+    simdsync();                                                             \
+    simd_reduce_add2(t_per_atom, red_acc, offset, tid,                      \
+                     drhoEacc.x, drhoEacc.y);                               \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    drhoE[i]=drhoEacc.x;                                                    \
+    drhoE[i+inum]=drhoEacc.y;                                               \
   }
 #else
-#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE)     \
+#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, i, drhoE)  \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom);                   \
@@ -47,7 +48,8 @@ _texture_2d( vel_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    drhoE[ii]=drhoEacc;                                                     \
+    drhoE[i]=drhoEacc.x;                                                    \
+    drhoE[i+inum]=drhoEacc.y;                                               \
   }
 #endif
 
@@ -105,12 +107,12 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_,
                        const __global int * dev_packed,
                        __global acctyp3 *restrict ans,
                        __global acctyp *restrict engv,
-                       __global acctyp2 *restrict drhoE,
+                       __global acctyp *restrict drhoE,
                        const int eflag, const int vflag,
                        const int inum, const int nbor_pitch,
                        const __global numtyp4 *restrict v_,
                        const int dimension, const int t_per_atom) {
-  int tid, ii, offset;
+  int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
 
   int n_stride;
@@ -124,10 +126,10 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_,
     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
   }
   acctyp2 drhoEacc;
-  drhoEacc.x = drhoEacc.x = (acctyp)0;
+  drhoEacc.x = drhoEacc.y = (acctyp)0;
 
   if (ii<inum) {
-    int i, numj, nbor, nbor_end;
+    int numj, nbor, nbor_end;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -246,7 +248,7 @@ __kernel void k_sph_lj(const __global numtyp4 *restrict x_,
   } // if ii
   store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                 ans,engv);
-  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE);
+  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,i,drhoE);
 }
 
 __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_,
@@ -258,12 +260,12 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_,
                             const __global int * dev_packed,
                             __global acctyp3 *restrict ans,
                             __global acctyp *restrict engv,
-                            __global acctyp2 *restrict drhoE,
+                            __global acctyp *restrict drhoE,
                             const int eflag, const int vflag,
                             const int inum, const int nbor_pitch,
                             const __global numtyp4 *restrict v_,
                             const int dimension, const int t_per_atom) {
-  int tid, ii, offset;
+  int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
 
   #ifndef ONETYPE
@@ -289,10 +291,10 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_,
     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
   }
   acctyp2 drhoEacc;
-  drhoEacc.x = drhoEacc.x = (acctyp)0;
+  drhoEacc.x = drhoEacc.y = (acctyp)0;
 
   if (ii<inum) {
-    int i, numj, nbor, nbor_end;
+    int numj, nbor, nbor_end;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -325,7 +327,7 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_,
       #endif
 
       numtyp4 jx; fetch4(jx,j,pos_tex); //x_[j];
-      int jtype = jx.w;
+      int jtype=jx.w;
       #ifndef ONETYPE
       int mtype=itype+jx.w;
       const numtyp cutsq_p=coeff[mtype].z; // cutsq[itype][jtype];
@@ -415,12 +417,11 @@ __kernel void k_sph_lj_fast(const __global numtyp4 *restrict x_,
           virial[4] += delx*delz*force;
           virial[5] += dely*delz*force;
         }
-
       }
     } // for nbor
   } // if ii
 
   store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv);
-  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE);
+  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,i,drhoE);
 }
 
diff --git a/lib/gpu/lal_sph_taitwater.cu b/lib/gpu/lal_sph_taitwater.cu
index 9424d58996..969ce548de 100644
--- a/lib/gpu/lal_sph_taitwater.cu
+++ b/lib/gpu/lal_sph_taitwater.cu
@@ -29,17 +29,18 @@ _texture_2d( vel_tex,int4);
 
 #if (SHUFFLE_AVAIL == 0)
 
-#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE)      \
-  if (t_per_atom>1) {                                                        \
-    simdsync();                                                              \
-    simd_reduce_add2(t_per_atom, red_acc, offset, tid,                       \
-                     drhoEacc.x, drhoEacc.y);                                \
-  }                                                                          \
-  if (offset==0 && ii<inum) {                                                \
-    drhoE[ii]=drhoEacc;                                                      \
+#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, i, drhoE)  \
+  if (t_per_atom>1) {                                                       \
+    simdsync();                                                             \
+    simd_reduce_add2(t_per_atom, red_acc, offset, tid,                      \
+                     drhoEacc.x, drhoEacc.y);                               \
+  }                                                                         \
+  if (offset==0 && ii<inum) {                                               \
+    drhoE[i]=drhoEacc.x;                                                    \
+    drhoE[i+inum]=drhoEacc.y;                                               \
   }
 #else
-#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, drhoE)     \
+#define store_drhoE(drhoEacc, ii, inum, tid, t_per_atom, offset, i, drhoE)  \
   if (t_per_atom>1) {                                                       \
     for (unsigned int s=t_per_atom/2; s>0; s>>=1) {                         \
       drhoEacc.x += shfl_down(drhoEacc.x, s, t_per_atom);                   \
@@ -47,7 +48,8 @@ _texture_2d( vel_tex,int4);
     }                                                                       \
   }                                                                         \
   if (offset==0 && ii<inum) {                                               \
-    drhoE[ii]=drhoEacc;                                                     \
+    drhoE[i]=drhoEacc.x;                                                    \
+    drhoE[i+inum]=drhoEacc.y;                                               \
   }
 #endif
 
@@ -61,12 +63,12 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_,
                               const __global int * dev_packed,
                               __global acctyp3 *restrict ans,
                               __global acctyp *restrict engv,
-                              __global acctyp2 *restrict drhoE,
+                              __global acctyp *restrict drhoE,
                               const int eflag, const int vflag,
                               const int inum, const int nbor_pitch,
                               const __global numtyp4 *restrict v_,
                               const int dimension, const int t_per_atom) {
-  int tid, ii, offset;
+  int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
 
   int n_stride;
@@ -80,10 +82,10 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_,
     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
   }
   acctyp2 drhoEacc;
-  drhoEacc.x = drhoEacc.x = (acctyp)0;
+  drhoEacc.x = drhoEacc.y = (acctyp)0;
 
   if (ii<inum) {
-    int i, numj, nbor, nbor_end;
+    int numj, nbor, nbor_end;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -163,9 +165,8 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_,
         numtyp fvisc = (numtyp)0;
         if (delVdotDelR < (numtyp)0) {
           numtyp mu = h * delVdotDelR / (rsq + (numtyp)0.01 * h * h);
-          fvisc = -coeffx * (soundspeed_itype
-              + soundspeed_jtype) * mu / (rhoi + rhoj);
-        }
+          fvisc = -coeffx * (soundspeed_itype + soundspeed_jtype) * mu / (rhoi + rhoj);
+        } 
 
         // total pair force & thermal energy increment
         numtyp force = -mass_itype * mass_jtype * (fi + fj + fvisc) * wfd;
@@ -176,15 +177,11 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_,
         f.z+=delz*force;
 
         // and change in density, drho[i]
-        drhoEacc.x += mass_jtype* delVdotDelR * wfd;
+        drhoEacc.x += mass_jtype * delVdotDelR * wfd;
 
         // change in thermal energy, desph[i]
         drhoEacc.y += deltaE;
 
-        if (EVFLAG && eflag) {
-          numtyp e = (numtyp)0;
-          energy+=e;
-        }
         if (EVFLAG && vflag) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
@@ -198,7 +195,7 @@ __kernel void k_sph_taitwater(const __global numtyp4 *restrict x_,
   } // if ii
   store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag,
                 ans,engv);
-  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE);
+  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,i,drhoE);
 }
 
 __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_,
@@ -210,12 +207,12 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_,
                                    const __global int * dev_packed,
                                    __global acctyp3 *restrict ans,
                                    __global acctyp *restrict engv,
-                                   __global acctyp2 *restrict drhoE,
+                                   __global acctyp *restrict drhoE,
                                    const int eflag, const int vflag,
                                    const int inum, const int nbor_pitch,
                                    const __global numtyp4 *restrict v_,
                                    const int dimension, const int t_per_atom) {
-  int tid, ii, offset;
+  int tid, ii, offset, i;
   atom_info(t_per_atom,ii,tid,offset);
 
   #ifndef ONETYPE
@@ -245,10 +242,10 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_,
     for (int i=0; i<6; i++) virial[i]=(acctyp)0;
   }
   acctyp2 drhoEacc;
-  drhoEacc.x = drhoEacc.x = (acctyp)0;
+  drhoEacc.x = drhoEacc.y = (acctyp)0;
 
   if (ii<inum) {
-    int i, numj, nbor, nbor_end;
+    int numj, nbor, nbor_end;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
               n_stride,nbor_end,nbor);
 
@@ -337,8 +334,7 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_,
         numtyp fvisc = (numtyp)0;
         if (delVdotDelR < (numtyp)0) {
           numtyp mu = h * delVdotDelR / (rsq + (numtyp)0.01 * h * h);
-          fvisc = -coeffx * (soundspeed_itype
-              + soundspeed_jtype) * mu / (rhoi + rhoj);
+          fvisc = -coeffx * (soundspeed_itype + soundspeed_jtype) * mu / (rhoi + rhoj);
         }
 
         // total pair force & thermal energy increment
@@ -349,16 +345,12 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_,
         f.y+=dely*force;
         f.z+=delz*force;
 
-        // and change in density
+        // and change in density, drho[i]
         drhoEacc.x += mass_jtype * delVdotDelR * wfd;
 
-        // change in thermal energy
+        // change in thermal energy, desph[i]
         drhoEacc.y += deltaE;
 
-        if (EVFLAG && eflag) {
-          numtyp e = (numtyp)0;
-          energy+=e;
-        }
         if (EVFLAG && vflag) {
           virial[0] += delx*delx*force;
           virial[1] += dely*dely*force;
@@ -372,6 +364,6 @@ __kernel void k_sph_taitwater_fast(const __global numtyp4 *restrict x_,
   } // if ii
 
   store_answers(f,energy,virial,ii,inum,tid,t_per_atom,offset,eflag,vflag, ans,engv);
-  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,drhoE);
+  store_drhoE(drhoEacc,ii,inum,tid,t_per_atom,offset,i,drhoE);
 }
 
diff --git a/src/GPU/pair_sph_heatconduction_gpu.cpp b/src/GPU/pair_sph_heatconduction_gpu.cpp
index a81de53c91..ba68c5fcc5 100644
--- a/src/GPU/pair_sph_heatconduction_gpu.cpp
+++ b/src/GPU/pair_sph_heatconduction_gpu.cpp
@@ -113,7 +113,7 @@ void PairSPHHeatConductionGPU::compute(int eflag, int vflag)
         neighbor->ago, inum, nall, atom->x, atom->type,
         sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag,
         eflag_atom, vflag_atom, host_start, &ilist, &numneigh,
-        cpu_time, success, atom->v);
+        cpu_time, success, atom->vest);
   } else {
     inum = list->inum;
     ilist = list->ilist;
@@ -122,7 +122,7 @@ void PairSPHHeatConductionGPU::compute(int eflag, int vflag)
     sph_heatconduction_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
                        ilist, numneigh, firstneigh, eflag, vflag,
                        eflag_atom, vflag_atom, host_start, cpu_time, success,
-                       atom->tag, atom->v);
+                       atom->tag, atom->vest);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
diff --git a/src/GPU/pair_sph_lj_gpu.cpp b/src/GPU/pair_sph_lj_gpu.cpp
index 46d7b38073..d503a26335 100644
--- a/src/GPU/pair_sph_lj_gpu.cpp
+++ b/src/GPU/pair_sph_lj_gpu.cpp
@@ -114,7 +114,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag)
         neighbor->ago, inum, nall, atom->x, atom->type,
         sublo, subhi, atom->tag, atom->nspecial, atom->special, eflag, vflag,
         eflag_atom, vflag_atom, host_start, &ilist, &numneigh,
-        cpu_time, success, atom->v);
+        cpu_time, success, atom->vest);
   } else {
     inum = list->inum;
     ilist = list->ilist;
@@ -123,7 +123,7 @@ void PairSPHLJGPU::compute(int eflag, int vflag)
     sph_lj_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type,
                        ilist, numneigh, firstneigh, eflag, vflag,
                        eflag_atom, vflag_atom, host_start, cpu_time, success,
-                       atom->tag, atom->v);
+                       atom->tag, atom->vest);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
@@ -136,21 +136,21 @@ void PairSPHLJGPU::compute(int eflag, int vflag)
   int nlocal = atom->nlocal;
   if (acc_float) {
     auto drhoE_ptr = (float *)drhoE_pinned;
-    int idx = 0;
-    for (int i = 0; i < nlocal; i++) {
-      drho[i] = drhoE_ptr[idx];
-      desph[i] = drhoE_ptr[idx+1];
-      idx += 2;
-    }
+    for (int i = 0; i < nlocal; i++)
+      drho[i] += drhoE_ptr[i];
+
+    drhoE_ptr += nlocal;
+    for (int i = 0; i < nlocal; i++)
+      desph[i] += drhoE_ptr[i];
 
   } else {
     auto drhoE_ptr = (double *)drhoE_pinned;
-    int idx = 0;
-    for (int i = 0; i < nlocal; i++) {
-      drho[i] = drhoE_ptr[idx];
-      desph[i] = drhoE_ptr[idx+1];
-      idx += 2;
-    }
+    for (int i = 0; i < nlocal; i++)
+      drho[i] += drhoE_ptr[i];
+
+    drhoE_ptr += nlocal;
+    for (int i = 0; i < nlocal; i++)
+      desph[i] += drhoE_ptr[i];
   }
 
   if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)
diff --git a/src/GPU/pair_sph_taitwater_gpu.cpp b/src/GPU/pair_sph_taitwater_gpu.cpp
index 6f2762c144..23252cea8a 100644
--- a/src/GPU/pair_sph_taitwater_gpu.cpp
+++ b/src/GPU/pair_sph_taitwater_gpu.cpp
@@ -18,6 +18,7 @@
 #include "pair_sph_taitwater_gpu.h"
 
 #include "atom.h"
+#include "comm.h"
 #include "domain.h"
 #include "error.h"
 #include "force.h"
@@ -85,6 +86,25 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag)
 {
   ev_init(eflag, vflag);
 
+  // check consistency of pair coefficients
+
+  if (first) {
+    for (int i = 1; i <= atom->ntypes; i++) {
+      for (int j = 1; i <= atom->ntypes; i++) {
+        if (cutsq[i][j] > 1.e-32) {
+          if (!setflag[i][i] || !setflag[j][j]) {
+            if (comm->me == 0) {
+              printf(
+                  "SPH particle types %d and %d interact with cutoff=%g, but not all of their single particle properties are set.\n",
+                  i, j, sqrt(cutsq[i][j]));
+            }
+          }
+        }
+      }
+    }
+    first = 0;
+  }
+
   int nall = atom->nlocal + atom->nghost;
   int inum, host_start;
 
@@ -110,7 +130,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag)
     firstneigh = sph_taitwater_gpu_compute_n(
         neighbor->ago, inum, nall, atom->x, atom->type, sublo, subhi, atom->tag, atom->nspecial,
         atom->special, eflag, vflag, eflag_atom, vflag_atom, host_start, &ilist, &numneigh,
-        cpu_time, success, atom->v);
+        cpu_time, success, atom->vest);
   } else {
     inum = list->inum;
     ilist = list->ilist;
@@ -118,7 +138,7 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag)
     firstneigh = list->firstneigh;
     sph_taitwater_gpu_compute(neighbor->ago, inum, nall, atom->x, atom->type, ilist, numneigh, firstneigh,
                        eflag, vflag, eflag_atom, vflag_atom, host_start, cpu_time, success,
-                       atom->tag, atom->v);
+                       atom->tag, atom->vest);
   }
   if (!success) error->one(FLERR, "Insufficient memory on accelerator");
 
@@ -131,21 +151,21 @@ void PairSPHTaitwaterGPU::compute(int eflag, int vflag)
   int nlocal = atom->nlocal;
   if (acc_float) {
     auto drhoE_ptr = (float *)drhoE_pinned;
-    int idx = 0;
-    for (int i = 0; i < nlocal; i++) {
-      drho[i] = drhoE_ptr[idx];
-      desph[i] = drhoE_ptr[idx+1];
-      idx += 2;
-    }
+    for (int i = 0; i < nlocal; i++)
+      drho[i] += drhoE_ptr[i];
+
+    drhoE_ptr += nlocal;
+    for (int i = 0; i < nlocal; i++)
+      desph[i] += drhoE_ptr[i];
 
   } else {
     auto drhoE_ptr = (double *)drhoE_pinned;
-    int idx = 0;
-    for (int i = 0; i < nlocal; i++) {
-      drho[i] = drhoE_ptr[idx];
-      desph[i] = drhoE_ptr[idx+1];
-      idx += 2;
-    }
+    for (int i = 0; i < nlocal; i++)
+      drho[i] += drhoE_ptr[i];
+
+    drhoE_ptr += nlocal;
+    for (int i = 0; i < nlocal; i++)
+      desph[i] += drhoE_ptr[i];
   }
 
   if (atom->molecular != Atom::ATOMIC && neighbor->ago == 0)

From 8f589ed53654efaecc2bec3d566cd19e13be4b56 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 18 Mar 2024 21:51:19 -0400
Subject: [PATCH 13/27] simplify using modern API

---
 src/MC/fix_sgcmc.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/src/MC/fix_sgcmc.cpp b/src/MC/fix_sgcmc.cpp
index ae0e69d77e..e6e01610fe 100644
--- a/src/MC/fix_sgcmc.cpp
+++ b/src/MC/fix_sgcmc.cpp
@@ -253,11 +253,9 @@ void FixSemiGrandCanonicalMC::init()
       error->all(FLERR, "Can not run fix sgcmc with naive total energy calculation "
                  "and more than one MPI process.");
 
-    // Create a compute that will provide the total energy of the system.
+    // Get reference to a compute that will provide the total energy of the system.
     // This is needed by computeTotalEnergy().
-    char* id_pe = (char*)"thermo_pe";
-    int ipe = modify->find_compute(id_pe);
-    compute_pe = modify->compute[ipe];
+    compute_pe = modify->get_compute_by_id("thermo_pe");
   }
   interactionRadius = force->pair->cutforce;
   if (comm->me == 0) utils::logmesg(lmp, "  SGC - Interaction radius: {}\n", interactionRadius);

From e7075163f1b6f171cb9f9e75e24d49cf9ffd497a Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 18 Mar 2024 21:51:31 -0400
Subject: [PATCH 14/27] update coding style

---
 src/MC/fix_sgcmc.cpp | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/src/MC/fix_sgcmc.cpp b/src/MC/fix_sgcmc.cpp
index e6e01610fe..77ead0479f 100644
--- a/src/MC/fix_sgcmc.cpp
+++ b/src/MC/fix_sgcmc.cpp
@@ -371,8 +371,7 @@ void FixSemiGrandCanonicalMC::doMC()
           // Use a random number to choose the new species if there are three or more atom types.
           newSpecies = (int)(localRandom->uniform() * (atom->ntypes-1)) + 1;
           if (newSpecies >= oldSpecies) newSpecies++;
-        }
-        else {
+        } else {
           // If there are only two atom types, then the decision is clear.
           newSpecies = (oldSpecies == 1) ? 2 : 1;
         }
@@ -392,8 +391,7 @@ void FixSemiGrandCanonicalMC::doMC()
         if (serialMode && kappa != 0.0) {
           for (int i = 2; i <= atom->ntypes; i++)
             dm += (deltamu[i] + kappa / atom->natoms * (2.0 * speciesCounts[i] + deltaN[i])) * deltaN[i];
-        }
-        else {
+        } else {
           for (int i = 2; i <= atom->ntypes; i++)
             dm += deltamu[i] * deltaN[i];
         }
@@ -434,8 +432,7 @@ void FixSemiGrandCanonicalMC::doMC()
         // Update global species counters.
         for (int i = 1; i <= atom->ntypes; i++)
           speciesCounts[i] += deltaNGlobal[i];
-      }
-      else if (serialMode) {
+      } else if (serialMode) {
         // Update the local species counters.
         for (int i = 1; i <= atom->ntypes; i++)
           speciesCounts[i] += deltaN[i];
@@ -448,8 +445,7 @@ void FixSemiGrandCanonicalMC::doMC()
         else
           flipAtomGeneric(selectedAtom, oldSpecies, newSpecies);
         nAcceptedSwapsLocal++;
-      }
-      else {
+      } else {
         nRejectedSwapsLocal++;
       }
 

From ee580038009498d9d2abd9d8230025ce1344263a Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 18 Mar 2024 23:36:41 -0400
Subject: [PATCH 15/27] update electron radius velocities and radii in EFF NH
 fixes analog to fix nve/eff

---
 src/EFF/fix_nh_eff.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/EFF/fix_nh_eff.cpp b/src/EFF/fix_nh_eff.cpp
index a569932000..6e8e2c68fa 100644
--- a/src/EFF/fix_nh_eff.cpp
+++ b/src/EFF/fix_nh_eff.cpp
@@ -16,7 +16,6 @@
    Contributing author: Andres Jaramillo-Botero (Caltech)
 ------------------------------------------------------------------------- */
 
-
 #include "fix_nh_eff.h"
 
 #include "atom.h"
@@ -62,7 +61,7 @@ void FixNHEff::nve_v()
     if (mask[i] & groupbit) {
       if (abs(spin[i])==1) {
         dtfm = dtf / mass[type[i]];
-        ervel[i] = dtfm * erforce[i] / mefactor;
+        ervel[i] += dtfm * erforce[i] / mefactor;
       }
     }
   }
@@ -79,15 +78,26 @@ void FixNHEff::nve_x()
   FixNH::nve_x();
 
   double *eradius = atom->eradius;
+  double *erforce = atom->erforce;
   double *ervel = atom->ervel;
+  double *mass = atom->mass;
+  int *type = atom->type;
   int *spin = atom->spin;
   int *mask = atom->mask;
   int nlocal = atom->nlocal;
   if (igroup == atom->firstgroup) nlocal = atom->nfirst;
 
+  double mefactor = domain->dimension/4.0;
+  double dtfm;
+
   for (int i = 0; i < nlocal; i++)
-    if (mask[i] & groupbit)
-      if (abs(spin[i])==1) eradius[i] += dtv * ervel[i];
+    if (mask[i] & groupbit) {
+      dtfm = dtf / mass[type[i]];
+      if (abs(spin[i])==1) {
+        ervel[i] += dtfm * erforce[i] / mefactor;
+        eradius[i] += dtv * ervel[i];
+      }
+    }
 }
 
 /* ----------------------------------------------------------------------

From f8da51828a11eefe52f1ca8a2f09890d160aa897 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 19 Mar 2024 12:03:56 -0400
Subject: [PATCH 16/27] fix memory leaks in lammps_gather*concat() functions of
 the library interface

---
 src/library.cpp | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/src/library.cpp b/src/library.cpp
index fcf0f6a631..1f76c783da 100644
--- a/src/library.cpp
+++ b/src/library.cpp
@@ -2857,10 +2857,6 @@ void lammps_gather_atoms_concat(void *handle, const char *name, int type,
       if ((count == 1) || imgunpack) vector = (int *) vptr;
       else array = (int **) vptr;
 
-      int *copy;
-      lmp->memory->create(copy,count*natoms,"lib/gather:copy");
-      for (i = 0; i < count*natoms; i++) copy[i] = 0;
-
       int nlocal = lmp->atom->nlocal;
 
       if (count == 1) {
@@ -2872,7 +2868,10 @@ void lammps_gather_atoms_concat(void *handle, const char *name, int type,
                        MPI_INT,lmp->world);
 
       } else if (imgunpack) {
-        lmp->memory->create(copy,count*nlocal,"lib/gather:copy");
+        int *copy;
+        lmp->memory->create(copy,count*natoms,"lib/gather:copy");
+        for (i = 0; i < count*natoms; i++) copy[i] = 0;
+
         offset = 0;
         for (i = 0; i < nlocal; i++) {
           const int image = vector[i];
@@ -4328,10 +4327,6 @@ void lammps_gather_concat(void *handle, const char *name, int type, int count,
       if ((count == 1) || imgunpack) vector = (int *) vptr;
       else array = (int **) vptr;
 
-      int *copy;
-      lmp->memory->create(copy,count*natoms,"lib/gather:copy");
-      for (i = 0; i < count*natoms; i++) copy[i] = 0;
-
       int nlocal = lmp->atom->nlocal;
 
       if (count == 1) {
@@ -4343,7 +4338,10 @@ void lammps_gather_concat(void *handle, const char *name, int type, int count,
                        MPI_INT,lmp->world);
 
       } else if (imgunpack) {
-        lmp->memory->create(copy,count*nlocal,"lib/gather:copy");
+        int *copy;
+        lmp->memory->create(copy,count*natoms,"lib/gather:copy");
+        for (i = 0; i < count*natoms; i++) copy[i] = 0;
+
         offset = 0;
         for (i = 0; i < nlocal; i++) {
           const int image = vector[i];

From d0e2a846b2ca6f172d0286c4b6cf6bf84cbbfc25 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 19 Mar 2024 12:04:06 -0400
Subject: [PATCH 17/27] cosmetic

---
 src/library.cpp | 44 ++++++++++----------------------------------
 1 file changed, 10 insertions(+), 34 deletions(-)

diff --git a/src/library.cpp b/src/library.cpp
index 1f76c783da..a2eb1473a9 100644
--- a/src/library.cpp
+++ b/src/library.cpp
@@ -2864,8 +2864,7 @@ void lammps_gather_atoms_concat(void *handle, const char *name, int type,
         displs[0] = 0;
         for (i = 1; i < nprocs; i++)
           displs[i] = displs[i-1] + recvcounts[i-1];
-        MPI_Allgatherv(vector,nlocal,MPI_INT,data,recvcounts,displs,
-                       MPI_INT,lmp->world);
+        MPI_Allgatherv(vector,nlocal,MPI_INT,data,recvcounts,displs,MPI_INT,lmp->world);
 
       } else if (imgunpack) {
         int *copy;
@@ -2884,8 +2883,7 @@ void lammps_gather_atoms_concat(void *handle, const char *name, int type,
         displs[0] = 0;
         for (i = 1; i < nprocs; i++)
           displs[i] = displs[i-1] + recvcounts[i-1];
-        MPI_Allgatherv(copy,count*nlocal,MPI_INT,
-                       data,recvcounts,displs,MPI_INT,lmp->world);
+        MPI_Allgatherv(copy,count*nlocal,MPI_INT,data,recvcounts,displs,MPI_INT,lmp->world);
         lmp->memory->destroy(copy);
 
       } else {
@@ -2894,8 +2892,7 @@ void lammps_gather_atoms_concat(void *handle, const char *name, int type,
         displs[0] = 0;
         for (i = 1; i < nprocs; i++)
           displs[i] = displs[i-1] + recvcounts[i-1];
-        MPI_Allgatherv(&array[0][0],count*nlocal,MPI_INT,
-                       data,recvcounts,displs,MPI_INT,lmp->world);
+        MPI_Allgatherv(&array[0][0],count*nlocal,MPI_INT,data,recvcounts,displs,MPI_INT,lmp->world);
       }
 
     } else {
@@ -2911,8 +2908,7 @@ void lammps_gather_atoms_concat(void *handle, const char *name, int type,
         displs[0] = 0;
         for (i = 1; i < nprocs; i++)
           displs[i] = displs[i-1] + recvcounts[i-1];
-        MPI_Allgatherv(vector,nlocal,MPI_DOUBLE,data,recvcounts,displs,
-                       MPI_DOUBLE,lmp->world);
+        MPI_Allgatherv(vector,nlocal,MPI_DOUBLE,data,recvcounts,displs,MPI_DOUBLE,lmp->world);
 
       } else {
         int n = count*nlocal;
@@ -3170,10 +3166,6 @@ void lammps_scatter_atoms(void *handle, const char *name, int type, int count,
       return;
     }
 
-    // copy = Natom length vector of per-atom values
-    // use atom ID to insert each atom's values into copy
-    // MPI_Allreduce with MPI_SUM to merge into data, ordered by atom ID
-
     if (type == 0) {
       int *vector = nullptr;
       int **array = nullptr;
@@ -3321,10 +3313,6 @@ void lammps_scatter_atoms_subset(void *handle, const char *name, int type,
       return;
     }
 
-    // copy = Natom length vector of per-atom values
-    // use atom ID to insert each atom's values into copy
-    // MPI_Allreduce with MPI_SUM to merge into data, ordered by atom ID
-
     if (type == 0) {
       int *vector = nullptr;
       int **array = nullptr;
@@ -4354,8 +4342,7 @@ void lammps_gather_concat(void *handle, const char *name, int type, int count,
         displs[0] = 0;
         for (i = 1; i < nprocs; i++)
           displs[i] = displs[i-1] + recvcounts[i-1];
-        MPI_Allgatherv(copy,count*nlocal,MPI_INT,
-                       data,recvcounts,displs,MPI_INT,lmp->world);
+        MPI_Allgatherv(copy,count*nlocal,MPI_INT,data,recvcounts,displs,MPI_INT,lmp->world);
         lmp->memory->destroy(copy);
 
       } else {
@@ -4364,8 +4351,7 @@ void lammps_gather_concat(void *handle, const char *name, int type, int count,
         displs[0] = 0;
         for (i = 1; i < nprocs; i++)
           displs[i] = displs[i-1] + recvcounts[i-1];
-        MPI_Allgatherv(&array[0][0],count*nlocal,MPI_INT,
-                       data,recvcounts,displs,MPI_INT,lmp->world);
+        MPI_Allgatherv(&array[0][0],count*nlocal,MPI_INT,data,recvcounts,displs,MPI_INT,lmp->world);
       }
 
     } else {
@@ -4874,10 +4860,6 @@ void lammps_scatter(void *handle, const char *name, int type, int count,
       return;
     }
 
-    // copy = Natom length vector of per-atom values
-    // use atom ID to insert each atom's values into copy
-    // MPI_Allreduce with MPI_SUM to merge into data, ordered by atom ID
-
     if (type == 0) {
       int *vector = nullptr;
       int **array = nullptr;
@@ -5128,10 +5110,6 @@ void lammps_scatter_subset(void *handle, const char *name,int type, int count,
       return;
     }
 
-    // copy = Natom length vector of per-atom values
-    // use atom ID to insert each atom's values into copy
-    // MPI_Allreduce with MPI_SUM to merge into data, ordered by atom ID
-
     if (type == 0) {
       int *vector = nullptr;
       int **array = nullptr;
@@ -5488,7 +5466,8 @@ int lammps_neighlist_num_elements(void *handle, int idx) {
  * \param[out] numneigh   number of neighbors of atom iatom or 0
  * \param[out] neighbors  pointer to array of neighbor atom local indices or NULL */
 
-void lammps_neighlist_element_neighbors(void *handle, int idx, int element, int *iatom, int *numneigh, int **neighbors) {
+void lammps_neighlist_element_neighbors(void *handle, int idx, int element, int *iatom,
+                                        int *numneigh, int **neighbors) {
   auto   lmp = (LAMMPS *) handle;
   Neighbor * neighbor = lmp->neighbor;
   *iatom = -1;
@@ -5775,9 +5754,7 @@ otherwise 0.
  * \param  setting   string with the name of the specific setting
  * \return 1 if available, 0 if not.
  */
-int lammps_config_accelerator(const char *package,
-                              const char *category,
-                              const char *setting)
+int lammps_config_accelerator(const char *package, const char *category, const char *setting)
 {
   return Info::has_accelerator_feature(package,category,setting) ? 1 : 0;
 }
@@ -5899,8 +5876,7 @@ int lammps_style_count(void *handle, const char *category) {
  * \param buf_size size of the provided string buffer
  * \return 1 if successful, otherwise 0
  */
-int lammps_style_name(void *handle, const char *category, int idx,
-                      char *buffer, int buf_size) {
+int lammps_style_name(void *handle, const char *category, int idx, char *buffer, int buf_size) {
   auto lmp = (LAMMPS *) handle;
   Info info(lmp);
   auto styles = info.get_available_styles(category);

From 2dd956043963e1632a4cc6ad5f704f40887dbcd1 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 19 Mar 2024 22:39:05 -0400
Subject: [PATCH 18/27] add notes to python versions of lammps_extract_fix()
 that for global data one can only retrieve scalars

---
 python/lammps/core.py          | 26 +++++++++++++++++---------
 python/lammps/numpy_wrapper.py |  7 +++++++
 src/library.cpp                |  3 ++-
 3 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/python/lammps/core.py b/python/lammps/core.py
index 28b384d6ba..3498041454 100644
--- a/python/lammps/core.py
+++ b/python/lammps/core.py
@@ -1078,15 +1078,23 @@ class lammps(object):
   def extract_fix(self,fid,fstyle,ftype,nrow=0,ncol=0):
     """Retrieve data from a LAMMPS fix
 
-    This is a wrapper around the :cpp:func:`lammps_extract_fix`
-    function of the C-library interface.
-    This function returns ``None`` if either the fix id is not
-    recognized, or an invalid combination of :ref:`fstyle <py_style_constants>`
-    and :ref:`ftype <py_type_constants>` constants is used. The
-    names and functionality of the constants are the same as for
-    the corresponding C-library function.  For requests to return
-    a scalar or a size, the value is returned, also when accessing
-    global vectors or arrays, otherwise a pointer.
+    This is a wrapper around the :cpp:func:`lammps_extract_fix` function
+    of the C-library interface.  This function returns ``None`` if
+    either the fix id is not recognized, or an invalid combination of
+    :ref:`fstyle <py_style_constants>` and :ref:`ftype
+    <py_type_constants>` constants is used. The names and functionality
+    of the constants are the same as for the corresponding C-library
+    function.  For requests to return a scalar or a size, the value is
+    returned, also when accessing global vectors or arrays, otherwise a
+    pointer.
+
+    .. note::
+
+       When requesting global data, the fix data can only be accessed
+       one item at a time without access to the whole vector or array.
+       Thus this function will always return a scalar.  To access vector
+       or array elements the "nrow" and "ncol" arguments need to be set
+       accordingly (they default to 0).
 
     :param fid: fix ID
     :type fid:  string
diff --git a/python/lammps/numpy_wrapper.py b/python/lammps/numpy_wrapper.py
index a29853d16a..91042c43c8 100644
--- a/python/lammps/numpy_wrapper.py
+++ b/python/lammps/numpy_wrapper.py
@@ -203,6 +203,13 @@ class numpy_wrapper:
     It behaves the same as the original method, but returns NumPy arrays
     instead of ``ctypes`` pointers.
 
+    .. note::
+
+       When requesting global data, the fix data can only be accessed one
+       item at a time without access to the whole vector or array.  Thus this
+       function will always return a scalar.  To access vector or array elements
+       the "nrow" and "ncol" arguments need to be set accordingly (they default to 0).
+
     :param fid: fix ID
     :type fid:  string
     :param fstyle: style of the data retrieve (global, atom, or local), see :ref:`py_style_constants`
diff --git a/src/library.cpp b/src/library.cpp
index a2eb1473a9..f81f52305e 100644
--- a/src/library.cpp
+++ b/src/library.cpp
@@ -2137,7 +2137,8 @@ available.
 
    .. code-block:: c
 
-      double *dptr = (double *) lammps_extract_fix(handle,name,0,1,0,0);
+      double *dptr = (double *) lammps_extract_fix(handle, name,
+                                LMP_STYLE_GLOBAL, LMP_TYPE_VECTOR, 0, 0);
       double value = *dptr;
       lammps_free((void *)dptr);
 

From d123cd54408c1cb19343cfa3c38b67aedfd47ea9 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 20 Mar 2024 04:57:03 -0400
Subject: [PATCH 19/27] fix typo

---
 doc/src/Fortran.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/src/Fortran.rst b/doc/src/Fortran.rst
index 64fca57a02..e9976b9032 100644
--- a/doc/src/Fortran.rst
+++ b/doc/src/Fortran.rst
@@ -1255,8 +1255,8 @@ Procedures Bound to the :f:type:`lammps` Derived Type
    three elements of the global vector calculated by fix recenter into the
    variables *dx*, *dy*, and *dz*, respectively.
 
-   If asked for per-atom or local data, :f:func:`extract_compute` returns a
-   pointer to actual LAMMPS data. The pointer so returned will have the
+   If asked for per-atom or local data, :f:func:`extract_fix` returns a
+   pointer to actual LAMMPS data.  The pointer returned will have the
    appropriate size to match the internal data, and will be
    type/kind/rank-checked at the time of the assignment. For example,
 

From 2eb1ce98d9e9eb069113f3a83d2ee379f05e1404 Mon Sep 17 00:00:00 2001
From: Yifan Li <yifanl0716@gmail.com>
Date: Wed, 20 Mar 2024 14:06:44 -0400
Subject: [PATCH 20/27] add compute_t_prim for method=PIMD

---
 src/REPLICA/fix_pimd_langevin.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/REPLICA/fix_pimd_langevin.cpp b/src/REPLICA/fix_pimd_langevin.cpp
index c6886fbed7..47a710cabf 100644
--- a/src/REPLICA/fix_pimd_langevin.cpp
+++ b/src/REPLICA/fix_pimd_langevin.cpp
@@ -696,6 +696,7 @@ void FixPIMDLangevin::post_force(int /*flag*/)
     inter_replica_comm(x);
     spring_force();
     compute_spring_energy();
+    compute_t_prim();
     if (mapflag) {
       for (int i = 0; i < nlocal; i++) { domain->unmap_inv(x[i], image[i]); }
     }

From a8d07744c0f28fabb9ba345c5b9450b04ae4587b Mon Sep 17 00:00:00 2001
From: Yifan Li <yifanl0716@gmail.com>
Date: Thu, 21 Mar 2024 10:55:48 -0400
Subject: [PATCH 21/27] split compute_cvir into 2 functions

---
 src/REPLICA/fix_pimd_langevin.cpp | 21 +++++++++++++++++----
 src/REPLICA/fix_pimd_langevin.h   |  1 +
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/REPLICA/fix_pimd_langevin.cpp b/src/REPLICA/fix_pimd_langevin.cpp
index 47a710cabf..996308520e 100644
--- a/src/REPLICA/fix_pimd_langevin.cpp
+++ b/src/REPLICA/fix_pimd_langevin.cpp
@@ -685,6 +685,7 @@ void FixPIMDLangevin::post_force(int /*flag*/)
     }
 
     compute_vir();
+    compute_xf_vir();
     compute_cvir();
     compute_t_vir();
   }
@@ -1384,19 +1385,31 @@ void FixPIMDLangevin::remove_com_motion()
 
 /* ---------------------------------------------------------------------- */
 
-void FixPIMDLangevin::compute_cvir()
+void FixPIMDLangevin::compute_xf_vir()
 {
   int nlocal = atom->nlocal;
   double xf = 0.0;
-  double xcf = 0.0;
-  vir_ = centroid_vir = 0.0;
+  vir_ = 0.0;
   for (int i = 0; i < nlocal; i++) {
     for (int j = 0; j < 3; j++) {
       xf += x_unwrap[i][j] * atom->f[i][j];
-      xcf += (x_unwrap[i][j] - xc[i][j]) * atom->f[i][j];
     }
   }
   MPI_Allreduce(&xf, &vir_, 1, MPI_DOUBLE, MPI_SUM, universe->uworld);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void FixPIMDLangevin::compute_cvir()
+{
+  int nlocal = atom->nlocal;
+  double xcf = 0.0;
+  centroid_vir = 0.0;
+  for (int i = 0; i < nlocal; i++) {
+    for (int j = 0; j < 3; j++) {
+      xcf += (x_unwrap[i][j] - xc[i][j]) * atom->f[i][j];
+    }
+  }
   MPI_Allreduce(&xcf, &centroid_vir, 1, MPI_DOUBLE, MPI_SUM, universe->uworld);
   if (pstyle == ANISO) {
     for (int i = 0; i < 6; i++) c_vir_tensor[i] = 0.0;
diff --git a/src/REPLICA/fix_pimd_langevin.h b/src/REPLICA/fix_pimd_langevin.h
index 0f21b908b0..869281243f 100644
--- a/src/REPLICA/fix_pimd_langevin.h
+++ b/src/REPLICA/fix_pimd_langevin.h
@@ -176,6 +176,7 @@ class FixPIMDLangevin : public Fix {
   void compute_p_prim();
   void compute_p_cv();    // centroid-virial pressure estimator
   void compute_vir();
+  void compute_xf_vir();
   void compute_cvir();
   void compute_totenthalpy();
 

From 5f6d3ad154493cd37fead5d746bad86665d7e3d2 Mon Sep 17 00:00:00 2001
From: Yifan Li <yifanl0716@gmail.com>
Date: Thu, 21 Mar 2024 10:58:25 -0400
Subject: [PATCH 22/27] allow t_vir and t_cv computation for method=PIMD

---
 src/REPLICA/fix_pimd_langevin.cpp | 40 +++++++++++++++----------------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/src/REPLICA/fix_pimd_langevin.cpp b/src/REPLICA/fix_pimd_langevin.cpp
index 996308520e..13231a9e65 100644
--- a/src/REPLICA/fix_pimd_langevin.cpp
+++ b/src/REPLICA/fix_pimd_langevin.cpp
@@ -667,28 +667,26 @@ void FixPIMDLangevin::post_force(int /*flag*/)
   imageint *image = atom->image;
   tagint *tag = atom->tag;
 
-  if (method == NMPIMD) {
-    if (atom->nmax > maxunwrap) reallocate_x_unwrap();
-    if (atom->nmax > maxxc) reallocate_xc();
-    for (int i = 0; i < nlocal; i++) {
-      x_unwrap[i][0] = x[i][0];
-      x_unwrap[i][1] = x[i][1];
-      x_unwrap[i][2] = x[i][2];
-    }
-    if (mapflag) {
-      for (int i = 0; i < nlocal; i++) { domain->unmap(x_unwrap[i], image[i]); }
-    }
-    for (int i = 0; i < nlocal; i++) {
-      xc[i][0] = xcall[3 * (tag[i] - 1) + 0];
-      xc[i][1] = xcall[3 * (tag[i] - 1) + 1];
-      xc[i][2] = xcall[3 * (tag[i] - 1) + 2];
-    }
-
-    compute_vir();
-    compute_xf_vir();
-    compute_cvir();
-    compute_t_vir();
+  if (atom->nmax > maxunwrap) reallocate_x_unwrap();
+  if (atom->nmax > maxxc) reallocate_xc();
+  for (int i = 0; i < nlocal; i++) {
+    x_unwrap[i][0] = x[i][0];
+    x_unwrap[i][1] = x[i][1];
+    x_unwrap[i][2] = x[i][2];
   }
+  if (mapflag) {
+    for (int i = 0; i < nlocal; i++) { domain->unmap(x_unwrap[i], image[i]); }
+  }
+  for (int i = 0; i < nlocal; i++) {
+    xc[i][0] = xcall[3 * (tag[i] - 1) + 0];
+    xc[i][1] = xcall[3 * (tag[i] - 1) + 1];
+    xc[i][2] = xcall[3 * (tag[i] - 1) + 2];
+  }
+
+  compute_vir();
+  compute_xf_vir();
+  compute_cvir();
+  compute_t_vir();
 
   if (method == PIMD) {
     if (mapflag) {

From 8e3aa79a9ef1dd31d5bef012b46417171a485384 Mon Sep 17 00:00:00 2001
From: Yifan Li <yifanl0716@gmail.com>
Date: Thu, 21 Mar 2024 11:32:30 -0400
Subject: [PATCH 23/27] correct remove_com_motion for method=PIMD

---
 src/REPLICA/fix_pimd_langevin.cpp | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/REPLICA/fix_pimd_langevin.cpp b/src/REPLICA/fix_pimd_langevin.cpp
index 13231a9e65..511dd6de41 100644
--- a/src/REPLICA/fix_pimd_langevin.cpp
+++ b/src/REPLICA/fix_pimd_langevin.cpp
@@ -1364,7 +1364,24 @@ void FixPIMDLangevin::inter_replica_comm(double **ptr)
 
 void FixPIMDLangevin::remove_com_motion()
 {
-  if (universe->iworld == 0) {
+  if (method == NMPIMD) {
+    if (universe->iworld == 0) {
+      double **v = atom->v;
+      int *mask = atom->mask;
+      int nlocal = atom->nlocal;
+      if (dynamic) masstotal = group->mass(igroup);
+      double vcm[3];
+      group->vcm(igroup, masstotal, vcm);
+      for (int i = 0; i < nlocal; i++) {
+        if (mask[i] & groupbit) {
+          v[i][0] -= vcm[0];
+          v[i][1] -= vcm[1];
+          v[i][2] -= vcm[2];
+        }
+      }
+    }
+  }
+  else if (method == PIMD) {
     double **v = atom->v;
     int *mask = atom->mask;
     int nlocal = atom->nlocal;
@@ -1379,6 +1396,9 @@ void FixPIMDLangevin::remove_com_motion()
       }
     }
   }
+  else {
+    error->all(FLERR, "Unknown method for fix pimd/langevin. Only nmpimd and pimd are supported!");
+  }
 }
 
 /* ---------------------------------------------------------------------- */

From b7def392fb069ce4a6c58658d2f314e683703426 Mon Sep 17 00:00:00 2001
From: Yifan Li <yifanl0716@gmail.com>
Date: Thu, 21 Mar 2024 11:55:13 -0400
Subject: [PATCH 24/27] correct p_cv computation for method=PIMD

---
 src/REPLICA/fix_pimd_langevin.cpp | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/REPLICA/fix_pimd_langevin.cpp b/src/REPLICA/fix_pimd_langevin.cpp
index 511dd6de41..ac8fef8b73 100644
--- a/src/REPLICA/fix_pimd_langevin.cpp
+++ b/src/REPLICA/fix_pimd_langevin.cpp
@@ -1585,11 +1585,21 @@ void FixPIMDLangevin::compute_p_prim()
 void FixPIMDLangevin::compute_p_cv()
 {
   double inv_volume = 1.0 / (domain->xprd * domain->yprd * domain->zprd);
-  if (universe->iworld == 0) {
-    p_cv = THIRD * inv_volume * ((2.0 * ke_bead - centroid_vir) * force->nktv2p + vir) / np;
-  }
   p_md = THIRD * inv_volume * (totke + vir);
-  MPI_Bcast(&p_cv, 1, MPI_DOUBLE, 0, universe->uworld);
+  if (method == NMPIMD) {
+    if (universe->iworld == 0) {
+      p_cv = THIRD * inv_volume * ((2.0 * ke_bead - centroid_vir) * force->nktv2p + vir) / np;
+    }
+    MPI_Bcast(&p_cv, 1, MPI_DOUBLE, 0, universe->uworld);
+  }
+  else if (method == PIMD) {
+    p_cv = THIRD * inv_volume * ((2.0 * totke / np - centroid_vir) * force->nktv2p + vir) / np;
+  }
+  else {
+    error->universe_all(
+        FLERR,
+        "Unknown method parameter for fix pimd/langevin. Only nmpimd and pimd are supported!");
+  }
 }
 
 /* ---------------------------------------------------------------------- */

From 73eb12a2042b9ad44d93d9f510950c9e2e0d78a0 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Thu, 21 Mar 2024 12:17:06 -0400
Subject: [PATCH 25/27] apply clang-format

---
 src/REPLICA/fix_pimd_langevin.cpp | 38 +++++++++++++------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/src/REPLICA/fix_pimd_langevin.cpp b/src/REPLICA/fix_pimd_langevin.cpp
index ac8fef8b73..c24984f152 100644
--- a/src/REPLICA/fix_pimd_langevin.cpp
+++ b/src/REPLICA/fix_pimd_langevin.cpp
@@ -47,10 +47,10 @@
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
-using MathConst::MY_PI;
 using MathConst::MY_2PI;
-using MathConst::THIRD;
+using MathConst::MY_PI;
 using MathConst::MY_SQRT2;
+using MathConst::THIRD;
 using MathSpecial::powint;
 
 enum { PIMD, NMPIMD };
@@ -475,11 +475,13 @@ void FixPIMDLangevin::init()
 
   c_pe = modify->get_compute_by_id(id_pe);
   if (!c_pe)
-    error->universe_all(FLERR, fmt::format("Could not find fix {} potential energy compute ID {}", style, id_pe));
+    error->universe_all(
+        FLERR, fmt::format("Could not find fix {} potential energy compute ID {}", style, id_pe));
 
   c_press = modify->get_compute_by_id(id_press);
   if (!c_press)
-    error->universe_all(FLERR, fmt::format("Could not find fix {} pressure compute ID {}", style, id_press));
+    error->universe_all(
+        FLERR, fmt::format("Could not find fix {} pressure compute ID {}", style, id_press));
 
   t_prim = t_vir = t_cv = p_prim = p_vir = p_cv = p_md = 0.0;
 }
@@ -741,7 +743,7 @@ void FixPIMDLangevin::collect_xc()
       }
     }
 
-    const double sqrtnp = sqrt((double)np);
+    const double sqrtnp = sqrt((double) np);
     for (int i = 0; i < nlocal; i++) {
       xcall[3 * (tag[i] - 1) + 0] = x[i][0] / sqrtnp;
       xcall[3 * (tag[i] - 1) + 1] = x[i][1] / sqrtnp;
@@ -1048,8 +1050,8 @@ void FixPIMDLangevin::langevin_init()
         c2_k[i] = sqrt(1.0 - c1_k[i] * c1_k[i]);
       }
       for (int i = 0; i < np; i++) {
-        out += fmt::format("      {:d}     {:.8e} {:.8e} {:.8e} {:.8e}\n", i,
-                           _omega_k[i], tau_k[i], c1_k[i], c2_k[i]);
+        out += fmt::format("      {:d}     {:.8e} {:.8e} {:.8e} {:.8e}\n", i, _omega_k[i], tau_k[i],
+                           c1_k[i], c2_k[i]);
       }
     } else if (method == PIMD) {
       for (int i = 0; i < np; i++) {
@@ -1111,7 +1113,7 @@ void FixPIMDLangevin::nmpimd_init()
   }
 
   // Set up eigenvectors for degenerated modes
-  const double sqrtnp = sqrt((double)np);
+  const double sqrtnp = sqrt((double) np);
   for (int j = 0; j < np; j++) {
     for (int i = 1; i < int(np / 2) + 1; i++) {
       M_x2xp[i][j] = MY_SQRT2 * cos(MY_2PI * double(i) * double(j) / double(np)) / sqrtnp;
@@ -1380,8 +1382,7 @@ void FixPIMDLangevin::remove_com_motion()
         }
       }
     }
-  }
-  else if (method == PIMD) {
+  } else if (method == PIMD) {
     double **v = atom->v;
     int *mask = atom->mask;
     int nlocal = atom->nlocal;
@@ -1395,8 +1396,7 @@ void FixPIMDLangevin::remove_com_motion()
         v[i][2] -= vcm[2];
       }
     }
-  }
-  else {
+  } else {
     error->all(FLERR, "Unknown method for fix pimd/langevin. Only nmpimd and pimd are supported!");
   }
 }
@@ -1409,9 +1409,7 @@ void FixPIMDLangevin::compute_xf_vir()
   double xf = 0.0;
   vir_ = 0.0;
   for (int i = 0; i < nlocal; i++) {
-    for (int j = 0; j < 3; j++) {
-      xf += x_unwrap[i][j] * atom->f[i][j];
-    }
+    for (int j = 0; j < 3; j++) { xf += x_unwrap[i][j] * atom->f[i][j]; }
   }
   MPI_Allreduce(&xf, &vir_, 1, MPI_DOUBLE, MPI_SUM, universe->uworld);
 }
@@ -1424,9 +1422,7 @@ void FixPIMDLangevin::compute_cvir()
   double xcf = 0.0;
   centroid_vir = 0.0;
   for (int i = 0; i < nlocal; i++) {
-    for (int j = 0; j < 3; j++) {
-      xcf += (x_unwrap[i][j] - xc[i][j]) * atom->f[i][j];
-    }
+    for (int j = 0; j < 3; j++) { xcf += (x_unwrap[i][j] - xc[i][j]) * atom->f[i][j]; }
   }
   MPI_Allreduce(&xcf, &centroid_vir, 1, MPI_DOUBLE, MPI_SUM, universe->uworld);
   if (pstyle == ANISO) {
@@ -1591,11 +1587,9 @@ void FixPIMDLangevin::compute_p_cv()
       p_cv = THIRD * inv_volume * ((2.0 * ke_bead - centroid_vir) * force->nktv2p + vir) / np;
     }
     MPI_Bcast(&p_cv, 1, MPI_DOUBLE, 0, universe->uworld);
-  }
-  else if (method == PIMD) {
+  } else if (method == PIMD) {
     p_cv = THIRD * inv_volume * ((2.0 * totke / np - centroid_vir) * force->nktv2p + vir) / np;
-  }
-  else {
+  } else {
     error->universe_all(
         FLERR,
         "Unknown method parameter for fix pimd/langevin. Only nmpimd and pimd are supported!");

From 0b5722adc0a48d50245815fa77dc0b718055df6d Mon Sep 17 00:00:00 2001
From: Federico Williamson - WGPC5 <federicowilliamson@hotmail.co.uk>
Date: Thu, 21 Mar 2024 14:33:01 -0300
Subject: [PATCH 26/27] Allow compute spin for groups other than `all`

---
 src/SPIN/compute_spin.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/SPIN/compute_spin.cpp b/src/SPIN/compute_spin.cpp
index 1c92d284f0..fc5e223e75 100644
--- a/src/SPIN/compute_spin.cpp
+++ b/src/SPIN/compute_spin.cpp
@@ -215,9 +215,8 @@ void ComputeSpin::compute_vector()
         tempnum += tx*tx+ty*ty+tz*tz;
         tempdenom += sp[i][0]*fm[i][0]+fm[i][1]*sp[i][1]+sp[i][2]*fm[i][2];
         countsp++;
-      }
+      } else error->all(FLERR,"Compute compute/spin requires atom/spin style");
     }
-    else error->all(FLERR,"Compute compute/spin requires atom/spin style");
   }
 
   MPI_Allreduce(mag,magtot,4,MPI_DOUBLE,MPI_SUM,world);

From 7137290682e4fc7253c4a0a1b4bfb41a3449fabd Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Fri, 29 Mar 2024 21:52:57 +0100
Subject: [PATCH 27/27] Follow requested changes to patch

---
 src/variable.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/variable.cpp b/src/variable.cpp
index 2bde0e5adb..c1cf978028 100644
--- a/src/variable.cpp
+++ b/src/variable.cpp
@@ -1660,14 +1660,16 @@ double Variable::evaluate(char *str, Tree **tree, int ivar)
             if (!compute->is_initialized())
               print_var_error(FLERR,"Variable formula compute cannot be invoked before "
                               "initialization by a run",ivar);
+            if (index1 > compute->size_array_cols)
+              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
             if (!(compute->invoked_flag & Compute::INVOKED_ARRAY)) {
               compute->compute_array();
               compute->invoked_flag |= Compute::INVOKED_ARRAY;
             }
+            // wait until after compute invocation to check size_array_rows
+            // b/c may be zero until after initial invocation
             if (compute->size_array_rows == 0)
               print_var_error(FLERR,"Variable formula compute array is zero length",ivar);
-            if (index1 > compute->size_array_cols)
-              print_var_error(FLERR,"Variable formula compute array is accessed out-of-range",ivar,0);
 
             auto newtree = new Tree();
             newtree->type = VECTORARRAY;