From 10a3e857963165350c4d72d5a665678207926425 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Wed, 22 Apr 2020 11:53:08 -0600
Subject: [PATCH 01/64] C1 JT 042220 - added biquadratic pair/spin exchange

---
 .../llg_exchange.py                           |   2 +-
 src/SPIN/pair_spin_exchange.cpp               |  24 +-
 src/SPIN/pair_spin_exchange_biquadratic.cpp   | 594 ++++++++++++++++++
 src/SPIN/pair_spin_exchange_biquadratic.h     |  85 +++
 4 files changed, 695 insertions(+), 10 deletions(-)
 create mode 100644 src/SPIN/pair_spin_exchange_biquadratic.cpp
 create mode 100644 src/SPIN/pair_spin_exchange_biquadratic.h

diff --git a/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py b/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py
index 49eecb5b44..dd1c543bb3 100755
--- a/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py
+++ b/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py
@@ -65,6 +65,6 @@ for t in range (0,N):
   # calc. average magnetization
   Sm = (S1+S2)*0.5
   # calc. energy
-  en = -2.0*J0*(np.dot(S1,S2))
+  en = -J0*(np.dot(S1,S2))
   # print res. in ps for comparison with LAMMPS
   print(t*dt/1000.0,Sm[0],Sm[1],Sm[2],en)
diff --git a/src/SPIN/pair_spin_exchange.cpp b/src/SPIN/pair_spin_exchange.cpp
index 5c5d5cb1a4..b23f4fa0cb 100644
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@@ -231,9 +231,15 @@ void PairSpinExchange::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_exchange(i,j,rsq,fmi,spj);
-        if (lattice_flag) {
+        
+        if (lattice_flag)
           compute_exchange_mech(i,j,rsq,eij,fi,spi,spj);
-        }
+        
+        if (eflag) {
+          evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+          evdwl *= 0.5*hbar;
+          emag[i] += evdwl;
+        } else evdwl = 0.0;
       }
 
       f[i][0] += fi[0];
@@ -243,11 +249,11 @@ void PairSpinExchange::compute(int eflag, int vflag)
       fm[i][1] += fmi[1];
       fm[i][2] += fmi[2];
 
-      if (eflag) {
-        evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-        evdwl *= 0.5*hbar;
-        emag[i] += evdwl;
-      } else evdwl = 0.0;
+      // if (eflag) {
+      //   evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+      //   evdwl *= 0.5*hbar;
+      //   emag[i] += evdwl;
+      // } else evdwl = 0.0;
 
       if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
           evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
@@ -364,8 +370,8 @@ void PairSpinExchange::compute_exchange(int i, int j, double rsq, double fmi[3],
    compute the mechanical force due to the exchange interaction between atom i and atom j
 ------------------------------------------------------------------------- */
 
-void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq, double eij[3],
-    double fi[3],  double spi[3], double spj[3])
+void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq, 
+    double eij[3], double fi[3],  double spi[3], double spj[3])
 {
   int *type = atom->type;
   int itype, jtype;
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
new file mode 100644
index 0000000000..a7f64690af
--- /dev/null
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -0,0 +1,594 @@
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+                         Aidan Thompson (SNL)
+
+   Please cite the related publication:
+   Tranchida, J., Plimpton, S. J., Thibaudeau, P., & Thompson, A. P. (2018).
+   Massively parallel symplectic algorithm for coupled magnetic spin dynamics
+   and molecular dynamics. Journal of Computational Physics.
+------------------------------------------------------------------------- */
+
+#include "pair_spin_exchange_biquadratic.h"
+#include <mpi.h>
+#include <cmath>
+#include <cstring>
+#include "atom.h"
+#include "comm.h"
+#include "error.h"
+#include "fix.h"
+#include "force.h"
+#include "neigh_list.h"
+#include "memory.h"
+#include "modify.h"
+#include "update.h"
+#include "utils.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+PairSpinExchangeBiquadratic::~PairSpinExchangeBiquadratic()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cut_spin_exchange);
+    memory->destroy(J1_mag);
+    memory->destroy(J1_mech);
+    memory->destroy(J2);
+    memory->destroy(J3);
+    memory->destroy(K1_mag);
+    memory->destroy(K1_mech);
+    memory->destroy(K2);
+    memory->destroy(K3);
+    memory->destroy(cutsq); // to be implemented
+    memory->destroy(emag);
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::settings(int narg, char **arg)
+{
+  PairSpin::settings(narg,arg);
+
+  cut_spin_exchange_global = force->numeric(FLERR,arg[0]);
+
+  // reset cutoffs that have been explicitly set
+
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++)
+      for (j = i+1; j <= atom->ntypes; j++)
+        if (setflag[i][j]) {
+          cut_spin_exchange[i][j] = cut_spin_exchange_global;
+        }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type spin pairs
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
+{
+  if (!allocated) allocate();
+
+  // check if args correct
+
+  if (strcmp(arg[2],"biquadratic") != 0)
+    error->all(FLERR,"Incorrect args in pair_style command");
+  if (narg != 10)
+    error->all(FLERR,"Incorrect args in pair_style command");
+
+  int ilo,ihi,jlo,jhi;
+  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
+  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+
+  // get exchange arguments from input command
+
+  const double rc = force->numeric(FLERR,arg[3]);
+  const double j1 = force->numeric(FLERR,arg[4]);
+  const double j2 = force->numeric(FLERR,arg[5]);
+  const double j3 = force->numeric(FLERR,arg[6]);
+  const double k1 = force->numeric(FLERR,arg[7]);
+  const double k2 = force->numeric(FLERR,arg[8]);
+  const double k3 = force->numeric(FLERR,arg[9]);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      cut_spin_exchange[i][j] = rc;
+      J1_mag[i][j] = j1/hbar;
+      J1_mech[i][j] = j1;
+      J2[i][j] = j2;
+      J3[i][j] = j3;
+      K1_mag[i][j] = k1/hbar;
+      K1_mech[i][j] = k1;
+      K2[i][j] = k2;
+      K3[i][j] = k3;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args in pair_style command");
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairSpinExchangeBiquadratic::init_one(int i, int j)
+{
+
+   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+
+  J1_mag[j][i] = J1_mag[i][j];
+  J1_mech[j][i] = J1_mech[i][j];
+  J2[j][i] = J2[i][j];
+  J3[j][i] = J3[i][j];
+  K1_mag[j][i] = K1_mag[i][j];
+  K1_mech[j][i] = K1_mech[i][j];
+  K2[j][i] = K2[i][j];
+  K3[j][i] = K3[i][j];
+  cut_spin_exchange[j][i] = cut_spin_exchange[i][j];
+
+  return cut_spin_exchange_global;
+}
+
+/* ----------------------------------------------------------------------
+   extract the larger cutoff
+------------------------------------------------------------------------- */
+
+void *PairSpinExchangeBiquadratic::extract(const char *str, int &dim)
+{
+  dim = 0;
+  if (strcmp(str,"cut") == 0) return (void *) &cut_spin_exchange_global;
+  return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::compute(int eflag, int vflag)
+{
+  int i,j,ii,jj,inum,jnum,itype,jtype;
+  double evdwl, ecoul;
+  double xi[3], eij[3];
+  double delx,dely,delz;
+  double spi[3], spj[3];
+  double fi[3], fmi[3];
+  double local_cut2;
+  double rsq, inorm;
+  int *ilist,*jlist,*numneigh,**firstneigh;
+
+  evdwl = ecoul = 0.0;
+  ev_init(eflag,vflag);
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double **fm = atom->fm;
+  double **sp = atom->sp;
+  int *type = atom->type;
+  int nlocal = atom->nlocal;
+  int newton_pair = force->newton_pair;
+
+  inum = list->inum;
+  ilist = list->ilist;
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // checking size of emag
+
+  if (nlocal_max < nlocal) {    // grow emag lists if necessary
+    nlocal_max = nlocal;
+    memory->grow(emag,nlocal_max,"pair/spin:emag");
+  }
+
+  // computation of the exchange interaction
+  // loop over atoms and their neighbors
+
+  for (ii = 0; ii < inum; ii++) {
+    i = ilist[ii];
+    itype = type[i];
+
+    jlist = firstneigh[i];
+    jnum = numneigh[i];
+    xi[0] = x[i][0];
+    xi[1] = x[i][1];
+    xi[2] = x[i][2];
+    spi[0] = sp[i][0];
+    spi[1] = sp[i][1];
+    spi[2] = sp[i][2];
+    emag[i] = 0.0;
+
+    // loop on neighbors
+
+    for (jj = 0; jj < jnum; jj++) {
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+
+      spj[0] = sp[j][0];
+      spj[1] = sp[j][1];
+      spj[2] = sp[j][2];
+
+      evdwl = 0.0;
+      fi[0] = fi[1] = fi[2] = 0.0;
+      fmi[0] = fmi[1] = fmi[2] = 0.0;
+
+      delx = xi[0] - x[j][0];
+      dely = xi[1] - x[j][1];
+      delz = xi[2] - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+      inorm = 1.0/sqrt(rsq);
+      eij[0] = -inorm*delx;
+      eij[1] = -inorm*dely;
+      eij[2] = -inorm*delz;
+
+      local_cut2 = cut_spin_exchange[itype][jtype]*cut_spin_exchange[itype][jtype];
+
+      // compute exchange interaction
+
+      if (rsq <= local_cut2) {
+        compute_exchange(i,j,rsq,fmi,spi,spj);
+        if (lattice_flag)
+          compute_exchange_mech(i,j,rsq,eij,fi,spi,spj);
+      
+        if (eflag) {
+          evdwl -= compute_energy(i,j,rsq,spi,spj);
+          emag[i] += evdwl;
+        } else evdwl = 0.0;
+      }
+
+      f[i][0] += fi[0];
+      f[i][1] += fi[1];
+      f[i][2] += fi[2];
+      fm[i][0] += fmi[0];
+      fm[i][1] += fmi[1];
+      fm[i][2] += fmi[2];
+
+      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+          evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
+    }
+  }
+
+  if (vflag_fdotr) virial_fdotr_compute();
+
+}
+
+/* ----------------------------------------------------------------------
+   update the pair interactions fmi acting on the spin ii
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::compute_single_pair(int ii, double fmi[3])
+{
+  int *type = atom->type;
+  double **x = atom->x;
+  double **sp = atom->sp;
+  double local_cut2;
+  double xi[3];
+  double delx,dely,delz;
+  double spi[3],spj[3];
+
+  int j,jnum,itype,jtype,ntypes;
+  int k,locflag;
+  int *jlist,*numneigh,**firstneigh;
+
+  double rsq;
+
+  numneigh = list->numneigh;
+  firstneigh = list->firstneigh;
+
+  // check if interaction applies to type of ii
+
+  itype = type[ii];
+  ntypes = atom->ntypes;
+  locflag = 0;
+  k = 1;
+  while (k <= ntypes) {
+    if (k <= itype) {
+      if (setflag[k][itype] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else if (k > itype) {
+      if (setflag[itype][k] == 1) {
+        locflag =1;
+        break;
+      }
+      k++;
+    } else error->all(FLERR,"Wrong type number");
+  }
+
+  // if interaction applies to type ii,
+  // locflag = 1 and compute pair interaction
+
+  if (locflag == 1) {
+
+    xi[0] = x[ii][0];
+    xi[1] = x[ii][1];
+    xi[2] = x[ii][2];
+    spi[0] = sp[ii][0];
+    spi[1] = sp[ii][1];
+    spi[2] = sp[ii][2];
+
+    jlist = firstneigh[ii];
+    jnum = numneigh[ii];
+
+    for (int jj = 0; jj < jnum; jj++) {
+
+      j = jlist[jj];
+      j &= NEIGHMASK;
+      jtype = type[j];
+      local_cut2 = cut_spin_exchange[itype][jtype]*cut_spin_exchange[itype][jtype];
+
+      spj[0] = sp[j][0];
+      spj[1] = sp[j][1];
+      spj[2] = sp[j][2];
+
+      delx = xi[0] - x[j][0];
+      dely = xi[1] - x[j][1];
+      delz = xi[2] - x[j][2];
+      rsq = delx*delx + dely*dely + delz*delz;
+
+      if (rsq <= local_cut2) {
+        compute_exchange(ii,j,rsq,fmi,spi,spj);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   compute exchange interaction between spins i and j
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq, 
+    double fmi[3], double spi[3], double spj[3])
+{
+  int *type = atom->type;
+  int itype,jtype;
+  double Jex,Kex,ra,sdots;
+  double rj,rk,r2j,r2k,ir3j,ir3k;
+  itype = type[i];
+  jtype = type[j];
+
+  ra = sqrt(rsq);
+  rj = ra/J3[itype][jtype];
+  r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
+  ir3j = 1.0/(rj*rj*rj);
+  rk = ra/K3[itype][jtype];
+  r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
+  ir3k = 1.0/(rk*rk*rk);
+  
+  // modified Yukawa
+  Jex = (1.0-J2[itype][jtype]*r2j);
+  Jex *= J1_mag[itype][jtype]*ir3j;
+  Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+  
+  Kex = (1.0-K2[itype][jtype]*r2k);
+  Kex *= K1_mag[itype][jtype]*ir3k;
+  Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
+ 
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+
+  fmi[0] += Jex*spj[0] + 2.0*Kex*spj[0]*sdots;
+  fmi[1] += Jex*spj[1] + 2.0*Kex*spj[1]*sdots;
+  fmi[2] += Jex*spj[2] + 2.0*Kex*spj[2]*sdots;
+}
+
+/* ----------------------------------------------------------------------
+   compute the mechanical force due to the exchange interaction between atom i and atom j
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, double rsq, 
+    double eij[3], double fi[3],  double spi[3], double spj[3])
+{
+  int *type = atom->type;
+  int itype,jtype;
+  double Jex,Jex_mech,Kex,Kex_mech,ra,sdots;
+  double rj,rk,r2j,r2k,ir3j,ir3k;
+  itype = type[i];
+  jtype = type[j];
+
+  ra = sqrt(rsq);
+  rj = ra/J3[itype][jtype];
+  r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
+  ir3j = 1.0/(rj*rj*rj);
+  rk = ra/K3[itype][jtype];
+  r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
+  ir3k = 1.0/(rk*rk*rk);
+  
+  // modified Yukawa
+  Jex_mech = J2[itype][jtype]*2.0*ra/(J3[itype][jtype]*J3[itype][jtype]);
+  Jex_mech += (3.0/ra+1.0/J3[itype][jtype])*(1.0-J2[itype][jtype]*r2j);
+  Jex_mech *= -J1_mech[itype][jtype]*ir3j;
+  Jex_mech *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+
+  Kex_mech = K2[itype][jtype]*2.0*ra/(K3[itype][jtype]*K3[itype][jtype]);
+  Kex_mech += (3.0/ra+1.0/K3[itype][jtype])*(1.0-K2[itype][jtype]*r2k);
+  Kex_mech *= -K1_mech[itype][jtype]*ir3k;
+  Kex_mech *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
+  
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+
+  fi[0] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[0];
+  fi[1] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[1];
+  fi[2] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[2];
+}
+
+/* ----------------------------------------------------------------------
+   compute energy of spin pair i and j
+------------------------------------------------------------------------- */
+
+double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq, 
+    double spi[3], double spj[3])
+{
+  int *type = atom->type;
+  int itype,jtype;
+  double Jex,Kex,ra,sdots;
+  double rj,rk,r2j,r2k,ir3j,ir3k;
+  double energy = 0.0;
+  itype = type[i];
+  jtype = type[j];
+
+  ra = sqrt(rsq);
+  rj = ra/J3[itype][jtype];
+  r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
+  ir3j = 1.0/(rj*rj*rj);
+  rk = ra/K3[itype][jtype];
+  r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
+  ir3k = 1.0/(rk*rk*rk);
+  
+  // modified Yukawa
+  Jex = (1.0-J2[itype][jtype]*r2j);
+  Jex *= J1_mech[itype][jtype]*ir3j;
+  Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+  
+  Kex = (1.0-K2[itype][jtype]*r2k);
+  Kex *= K1_mech[itype][jtype]*ir3k;
+  Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
+
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
+
+  energy = 0.5*(Jex*sdots + Kex*sdots*sdots);
+  return energy;
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::allocate()
+{
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cut_spin_exchange,n+1,n+1,"pair/spin/exchange:cut_spin_exchange");
+  memory->create(J1_mag,n+1,n+1,"pair/spin/exchange:J1_mag");
+  memory->create(J1_mech,n+1,n+1,"pair/spin/exchange:J1_mech");
+  memory->create(J2,n+1,n+1,"pair/spin/exchange:J2");
+  memory->create(J3,n+1,n+1,"pair/spin/exchange:J3");
+  memory->create(K1_mag,n+1,n+1,"pair/spin/exchange:J1_mag");
+  memory->create(K1_mech,n+1,n+1,"pair/spin/exchange:J1_mech");
+  memory->create(K2,n+1,n+1,"pair/spin/exchange:J2");
+  memory->create(K3,n+1,n+1,"pair/spin/exchange:J3");
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::write_restart(FILE *fp)
+{
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+        fwrite(&J1_mag[i][j],sizeof(double),1,fp);
+        fwrite(&J1_mech[i][j],sizeof(double),1,fp);
+        fwrite(&J2[i][j],sizeof(double),1,fp);
+        fwrite(&J3[i][j],sizeof(double),1,fp);
+        fwrite(&K1_mag[i][j],sizeof(double),1,fp);
+        fwrite(&K1_mech[i][j],sizeof(double),1,fp);
+        fwrite(&K2[i][j],sizeof(double),1,fp);
+        fwrite(&K3[i][j],sizeof(double),1,fp);
+        fwrite(&cut_spin_exchange[i][j],sizeof(double),1,fp);
+      }
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::read_restart(FILE *fp)
+{
+  read_restart_settings(fp);
+
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++) {
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) utils::sfread(FLERR,&setflag[i][j],sizeof(int),1,fp,NULL,error);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+        if (me == 0) {
+          utils::sfread(FLERR,&J1_mag[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&J1_mech[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&J2[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&J3[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&K1_mag[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&K1_mech[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&K2[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&K3[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&cut_spin_exchange[i][j],sizeof(double),1,fp,NULL,error);
+        }
+        MPI_Bcast(&J1_mag[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&J1_mech[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&J2[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&J3[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&K1_mag[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&K1_mech[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&K2[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&K3[i][j],1,MPI_DOUBLE,0,world);
+        MPI_Bcast(&cut_spin_exchange[i][j],1,MPI_DOUBLE,0,world);
+      }
+    }
+  }
+}
+
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::write_restart_settings(FILE *fp)
+{
+  fwrite(&cut_spin_exchange_global,sizeof(double),1,fp);
+  fwrite(&offset_flag,sizeof(int),1,fp);
+  fwrite(&mix_flag,sizeof(int),1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairSpinExchangeBiquadratic::read_restart_settings(FILE *fp)
+{
+  if (comm->me == 0) {
+    utils::sfread(FLERR,&cut_spin_exchange_global,sizeof(double),1,fp,NULL,error);
+    utils::sfread(FLERR,&offset_flag,sizeof(int),1,fp,NULL,error);
+    utils::sfread(FLERR,&mix_flag,sizeof(int),1,fp,NULL,error);
+  }
+  MPI_Bcast(&cut_spin_exchange_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
+  MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
+}
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.h b/src/SPIN/pair_spin_exchange_biquadratic.h
new file mode 100644
index 0000000000..6fb9a7a94c
--- /dev/null
+++ b/src/SPIN/pair_spin_exchange_biquadratic.h
@@ -0,0 +1,85 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(spin/exchange/biquadratic,PairSpinExchangeBiquadratic)
+
+#else
+
+#ifndef LMP_PAIR_SPIN_EXCHANGE_BIQUADRATIC_H
+#define LMP_PAIR_SPIN_EXCHANGE_BIQUADRATIC_H
+
+#include "pair_spin.h"
+
+namespace LAMMPS_NS {
+
+class PairSpinExchangeBiquadratic : public PairSpin {
+ public:
+  PairSpinExchangeBiquadratic(LAMMPS *lmp) : PairSpin(lmp) {}
+  virtual ~PairSpinExchangeBiquadratic();
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  void *extract(const char *, int &);
+
+  void compute(int, int);
+  void compute_single_pair(int, double *);
+
+  void compute_exchange(int, int, double, double *, double *, double *);
+  void compute_exchange_mech(int, int, double, double *, double *, double *, double *);
+  double compute_energy(int , int , double , double *, double *);
+
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+
+  double cut_spin_exchange_global;      // global exchange cutoff distance
+
+ protected:
+  double **J1_mag;                      // H exchange coeffs in eV
+  double **J1_mech;                     // mech exchange coeffs in
+  double **J2, **J3;                    // J1 in eV, J2 in Ang-1, J3 in Ang
+  double **K1_mag;                      // Bi exchange coeffs in eV
+  double **K1_mech;                     // mech exchange coeffs in
+  double **K2, **K3;                    // K1 in eV, K2 Ang-1, K3 in Ang
+  double **cut_spin_exchange;           // cutoff distance exchange
+
+  void allocate();
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Incorrect args in pair_spin command
+
+Self-explanatory.
+
+E: Spin simulations require metal unit style
+
+Self-explanatory.
+
+E: Incorrect args for pair coefficients
+
+Self-explanatory.  Check the input script or data file.
+
+E: Pair spin requires atom attribute spin
+
+The atom style defined does not have these attributes.
+
+*/

From e941670f2c7ae02a22ce1617a01fa967dbeaff56 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 13 Jul 2020 14:43:14 -0600
Subject: [PATCH 02/64] Commit modif biquad

---
 doc/src/fix_precession_spin.rst    |  2 +-
 src/SPIN/pair_spin_dipole_cut.cpp  |  5 +++--
 src/SPIN/pair_spin_dipole_long.cpp | 12 ++++++++----
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/doc/src/fix_precession_spin.rst b/doc/src/fix_precession_spin.rst
index 783963af72..043c5cb200 100644
--- a/doc/src/fix_precession_spin.rst
+++ b/doc/src/fix_precession_spin.rst
@@ -62,7 +62,7 @@ with:
 
 The field value in Tesla is multiplied by the gyromagnetic
 ratio, :math:`g \cdot \mu_B/\hbar`, converting it into a precession frequency in
-rad.THz (in metal units and with :math:`\mu_B = 5.788 eV/T`).
+rad.THz (in metal units and with :math:`\mu_B = 5.788\cdot 10^{-5}` eV/T).
 
 As a comparison, the figure below displays the simulation of a
 single spin (of norm :math:`\mu_i = 1.0`) submitted to an external
diff --git a/src/SPIN/pair_spin_dipole_cut.cpp b/src/SPIN/pair_spin_dipole_cut.cpp
index cdae3c0bab..e18c24bcc0 100644
--- a/src/SPIN/pair_spin_dipole_cut.cpp
+++ b/src/SPIN/pair_spin_dipole_cut.cpp
@@ -48,9 +48,10 @@ PairSpinDipoleCut::PairSpinDipoleCut(LAMMPS *lmp) : PairSpin(lmp)
 
   hbar = force->hplanck/MY_2PI;                       // eV/(rad.THz)
   mub = 9.274e-4;                             // in A.Ang^2
-  mu_0 = 785.15;                              // in eV/Ang/A^2
+  // mu_0 = 785.15;                              // in eV/Ang/A^2
+  mu_0 = 784.15;                              // in eV/Ang/A^2
   mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);   // in eV.Ang^3
-  //mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);   // in eV
+  // mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);   // in eV
   mub2mu0hbinv = mub2mu0 / hbar;              // in rad.THz
 }
 
diff --git a/src/SPIN/pair_spin_dipole_long.cpp b/src/SPIN/pair_spin_dipole_long.cpp
index aeb916cfae..5ac3b276d2 100644
--- a/src/SPIN/pair_spin_dipole_long.cpp
+++ b/src/SPIN/pair_spin_dipole_long.cpp
@@ -52,7 +52,7 @@ PairSpinDipoleLong::PairSpinDipoleLong(LAMMPS *lmp) : PairSpin(lmp)
 
   hbar = force->hplanck/MY_2PI;                 // eV/(rad.THz)
   mub = 9.274e-4;                               // in A.Ang^2
-  mu_0 = 785.15;                                // in eV/Ang/A^2
+  mu_0 = 784.15;                                // in eV/Ang/A^2
   mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);     // in eV.Ang^3
   //mub2mu0 = mub * mub * mu_0 / (4.0*MY_PI);   // in eV
   mub2mu0hbinv = mub2mu0 / hbar;                // in rad.THz
@@ -136,10 +136,11 @@ void PairSpinDipoleLong::init_style()
 
   // insure use of KSpace long-range solver, set g_ewald
 
-  if (force->kspace == NULL)
-    error->all(FLERR,"Pair style requires a KSpace style");
+  // if (force->kspace == NULL)
+  //   error->all(FLERR,"Pair style requires a KSpace style");
 
-  g_ewald = force->kspace->g_ewald;
+  // g_ewald = force->kspace->g_ewald;
+  g_ewald = 1.0;
 }
 
 /* ----------------------------------------------------------------------
@@ -220,6 +221,9 @@ void PairSpinDipoleLong::compute(int eflag, int vflag)
     memory->grow(emag,nlocal_max,"pair/spin:emag");
   }
 
+
+  printf("test gewald %g \n",g_ewald);
+
   pre1 = 2.0 * g_ewald / MY_PIS;
   pre2 = 4.0 * pow(g_ewald,3.0) / MY_PIS;
   pre3 = 8.0 * pow(g_ewald,5.0) / MY_PIS;

From 7054c82b679031845592e28b400c4b1a5d2c890f Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 24 Aug 2020 09:23:31 -0600
Subject: [PATCH 03/64] added BS function to pair/spin/biquadratic

---
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 102 ++++++++++++++------
 1 file changed, 70 insertions(+), 32 deletions(-)

diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index a7f64690af..20cea77396 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -375,15 +375,24 @@ void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
   rk = ra/K3[itype][jtype];
   r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
   ir3k = 1.0/(rk*rk*rk);
+ 
+  // BS model
+  Jex = 4.0*J1_mag[itype][jtype]*r2j;
+  Jex *= (1.0-J2[itype][jtype]*r2j);
+  Jex *= exp(-r2j);
+
+  Kex = 4.0*K1_mag[itype][jtype]*r2k;
+  Kex *= (1.0-K2[itype][jtype]*r2k);
+  Kex *= exp(-r2k);
   
   // modified Yukawa
-  Jex = (1.0-J2[itype][jtype]*r2j);
-  Jex *= J1_mag[itype][jtype]*ir3j;
-  Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
-  
-  Kex = (1.0-K2[itype][jtype]*r2k);
-  Kex *= K1_mag[itype][jtype]*ir3k;
-  Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
+  // Jex = (1.0-J2[itype][jtype]*r2j);
+  // Jex *= J1_mag[itype][jtype]*ir3j;
+  // Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+  // 
+  // Kex = (1.0-K2[itype][jtype]*r2k);
+  // Kex *= K1_mag[itype][jtype]*ir3k;
+  // Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
  
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
@@ -402,28 +411,48 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, double rsq
   int *type = atom->type;
   int itype,jtype;
   double Jex,Jex_mech,Kex,Kex_mech,ra,sdots;
-  double rj,rk,r2j,r2k,ir3j,ir3k;
+  // double rj,rk,r2j,r2k,ir3j,ir3k;
+  double rja,rka,rjr,rkr,iJ3,iK3;
   itype = type[i];
   jtype = type[j];
 
-  ra = sqrt(rsq);
-  rj = ra/J3[itype][jtype];
-  r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
-  ir3j = 1.0/(rj*rj*rj);
-  rk = ra/K3[itype][jtype];
-  r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
-  ir3k = 1.0/(rk*rk*rk);
+  // ra = sqrt(rsq);
+  // rj = ra/J3[itype][jtype];
+  // r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
+  // ir3j = 1.0/(rj*rj*rj);
+  // rk = ra/K3[itype][jtype];
+  // r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
+  // ir3k = 1.0/(rk*rk*rk);
   
-  // modified Yukawa
-  Jex_mech = J2[itype][jtype]*2.0*ra/(J3[itype][jtype]*J3[itype][jtype]);
-  Jex_mech += (3.0/ra+1.0/J3[itype][jtype])*(1.0-J2[itype][jtype]*r2j);
-  Jex_mech *= -J1_mech[itype][jtype]*ir3j;
-  Jex_mech *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+  Jex = J1_mech[itype][jtype];
+  iJ3 = 1.0/(J3[itype][jtype]*J3[itype][jtype]);
+  Kex = K1_mech[itype][jtype];
+  iK3 = 1.0/(K3[itype][jtype]*K3[itype][jtype]);
+  
+  rja = rsq*iJ3;
+  rjr = sqrt(rsq)*iJ3;
+  rka = rsq*iK3;
+  rkr = sqrt(rsq)*iK3;
+ 
+  // BS model
+  Jex_mech = 1.0-rja-J2[itype][jtype]*rja*(2.0-rja);
+  Jex_mech *= 8.0*Jex*rjr*exp(-rja);
+  Jex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  
+  Kex_mech = 1.0-rka-K2[itype][jtype]*rka*(2.0-rka);
+  Kex_mech *= 8.0*Kex*rkr*exp(-rka);
+  Kex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  Kex_mech = K2[itype][jtype]*2.0*ra/(K3[itype][jtype]*K3[itype][jtype]);
-  Kex_mech += (3.0/ra+1.0/K3[itype][jtype])*(1.0-K2[itype][jtype]*r2k);
-  Kex_mech *= -K1_mech[itype][jtype]*ir3k;
-  Kex_mech *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
+  // modified Yukawa
+  // Jex_mech = J2[itype][jtype]*2.0*ra/(J3[itype][jtype]*J3[itype][jtype]);
+  // Jex_mech += (3.0/ra+1.0/J3[itype][jtype])*(1.0-J2[itype][jtype]*r2j);
+  // Jex_mech *= -J1_mech[itype][jtype]*ir3j;
+  // Jex_mech *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+
+  // Kex_mech = K2[itype][jtype]*2.0*ra/(K3[itype][jtype]*K3[itype][jtype]);
+  // Kex_mech += (3.0/ra+1.0/K3[itype][jtype])*(1.0-K2[itype][jtype]*r2k);
+  // Kex_mech *= -K1_mech[itype][jtype]*ir3k;
+  // Kex_mech *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
   
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
@@ -454,15 +483,24 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   rk = ra/K3[itype][jtype];
   r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
   ir3k = 1.0/(rk*rk*rk);
-  
+ 
+  // BS model 
+  Jex = 4.0*J1_mech[itype][jtype]*r2j;
+  Jex *= (1.0-J2[itype][jtype]*r2j);
+  Jex *= exp(-r2j);
+
+  Kex = 4.0*K1_mech[itype][jtype]*r2k;
+  Kex *= (1.0-K2[itype][jtype]*r2k);
+  Kex *= exp(-r2k);
+
   // modified Yukawa
-  Jex = (1.0-J2[itype][jtype]*r2j);
-  Jex *= J1_mech[itype][jtype]*ir3j;
-  Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
-  
-  Kex = (1.0-K2[itype][jtype]*r2k);
-  Kex *= K1_mech[itype][jtype]*ir3k;
-  Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
+  // Jex = (1.0-J2[itype][jtype]*r2j);
+  // Jex *= J1_mech[itype][jtype]*ir3j;
+  // Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
+  // 
+  // Kex = (1.0-K2[itype][jtype]*r2k);
+  // Kex *= K1_mech[itype][jtype]*ir3k;
+  // Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
 
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
 

From 901fe9d3aa494f1ec92867e4fbc1a26e18222c99 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Tue, 15 Sep 2020 18:22:11 -0600
Subject: [PATCH 04/64] modification of pair spin exchange/biquadratic, to
 offset ground state spin pressure

---
 src/SPIN/compute_spin.cpp                   | 37 +++++++++++++++++++--
 src/SPIN/pair_spin_exchange_biquadratic.cpp |  9 +++--
 2 files changed, 42 insertions(+), 4 deletions(-)

diff --git a/src/SPIN/compute_spin.cpp b/src/SPIN/compute_spin.cpp
index 94eff27f53..ca3c40e11a 100644
--- a/src/SPIN/compute_spin.cpp
+++ b/src/SPIN/compute_spin.cpp
@@ -46,6 +46,7 @@ ComputeSpin::ComputeSpin(LAMMPS *lmp, int narg, char **arg) :
   if ((narg != 3) && (narg != 4)) error->all(FLERR,"Illegal compute compute/spin command");
 
   vector_flag = 1;
+  // size_vector = 7;
   size_vector = 6;
   extvector = 0;
 
@@ -148,15 +149,19 @@ void ComputeSpin::compute_vector()
   int i;
   int countsp, countsptot;
   double mag[4], magtot[4];
+  double m2, m2tot;
+  double m4, m4tot;
   double magenergy, magenergytot;
   double tempnum, tempnumtot;
   double tempdenom, tempdenomtot;
-  double spintemperature;
+  double spintemperature,binder;
 
   invoked_vector = update->ntimestep;
 
   countsp = countsptot = 0.0;
   mag[0] = mag[1] = mag[2] = mag[3] = 0.0;
+  // m2 = m2tot = 0.0;
+  // m4 = m4tot = 0.0;
   magtot[0] = magtot[1] = magtot[2] = magtot[3] = 0.0;
   magenergy = magenergytot = 0.0;
   tempnum = tempnumtot = 0.0;
@@ -176,10 +181,25 @@ void ComputeSpin::compute_vector()
   for (i = 0; i < nlocal; i++) {
     if (mask[i] & groupbit) {
       if (atom->sp_flag) {
+        
+        // compute first moment
+
         mag[0] += sp[i][0];
         mag[1] += sp[i][1];
         mag[2] += sp[i][2];
 
+        // compute second moment
+        
+        // m2 += sp[i][0]*sp[i][0];
+        // m2 += sp[i][1]*sp[i][1];
+        // m2 += sp[i][2]*sp[i][2];
+
+        // compute fourth moment
+        
+        // m4 += sp[i][0]*sp[i][0]*sp[i][0]*sp[i][0];
+        // m4 += sp[i][1]*sp[i][1]*sp[i][1]*sp[i][1];
+        // m4 += sp[i][2]*sp[i][2]*sp[i][2]*sp[i][2];
+
         // update magnetic precession energies
 
         if (precession_spin_flag) {
@@ -206,26 +226,39 @@ void ComputeSpin::compute_vector()
   }
 
   MPI_Allreduce(mag,magtot,4,MPI_DOUBLE,MPI_SUM,world);
+  // MPI_Allreduce(&m2,&m2tot,1,MPI_DOUBLE,MPI_SUM,world);
+  // MPI_Allreduce(&m4,&m4tot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&magenergy,&magenergytot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&tempnum,&tempnumtot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&tempdenom,&tempdenomtot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&countsp,&countsptot,1,MPI_INT,MPI_SUM,world);
 
+  // compute average magnetization
+
   double scale = 1.0/countsptot;
   magtot[0] *= scale;
   magtot[1] *= scale;
   magtot[2] *= scale;
   magtot[3] = sqrt((magtot[0]*magtot[0])+(magtot[1]*magtot[1])+(magtot[2]*magtot[2]));
+  
+  // compute spin temperature
+  
   spintemperature = hbar*tempnumtot;
   spintemperature /= (2.0*kb*tempdenomtot);
 
+  // compute Binder cumulant
+
+  // m2tot *= scale;
+  // m4tot *= scale;
+  // binder = 1.0 - m4tot/(3.0*m2tot*m2tot);
+
   vector[0] = magtot[0];
   vector[1] = magtot[1];
   vector[2] = magtot[2];
   vector[3] = magtot[3];
   vector[4] = magenergytot;
   vector[5] = spintemperature;
-
+  // vector[6] = binder;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 20cea77396..812ccf40ab 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -454,11 +454,15 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, double rsq
   // Kex_mech *= -K1_mech[itype][jtype]*ir3k;
   // Kex_mech *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
   
-  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  // sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2] - 1.0);
 
   fi[0] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[0];
   fi[1] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[1];
   fi[2] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[2];
+  // fi[0] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[0];
+  // fi[1] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[1];
+  // fi[2] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[2];
 }
 
 /* ----------------------------------------------------------------------
@@ -502,7 +506,8 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   // Kex *= K1_mech[itype][jtype]*ir3k;
   // Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
 
-  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
+  // sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2] - 1.0);  
 
   energy = 0.5*(Jex*sdots + Kex*sdots*sdots);
   return energy;

From 7d5109454f02bde06f625065e18f4506701446ac Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Tue, 15 Sep 2020 20:16:48 -0600
Subject: [PATCH 05/64] correcting small issue with offset of biquadratic
 exchange

---
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 812ccf40ab..61b3df70ce 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -455,11 +455,11 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, double rsq
   // Kex_mech *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
   
   // sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
-  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2] - 1.0);
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  fi[0] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[0];
-  fi[1] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[1];
-  fi[2] -= (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[2];
+  fi[0] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
+  fi[1] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[1];
+  fi[2] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[2];
   // fi[0] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[0];
   // fi[1] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[1];
   // fi[2] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[2];
@@ -507,9 +507,9 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   // Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
 
   // sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
-  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2] - 1.0);  
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
 
-  energy = 0.5*(Jex*sdots + Kex*sdots*sdots);
+  energy = 0.5*(Jex*(sdots-1.0) + Kex*(sdots*sdots-1.0));
   return energy;
 }
 

From 9aba7b00505e3d33771d308b4253f310cad9297e Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 28 Sep 2020 15:42:26 -0600
Subject: [PATCH 06/64] adding a kokkos/spin atom style

---
 src/KOKKOS/kokkos_type.h                    |  60 +++++++++++
 src/SPIN/compute_spin.cpp                   |  24 -----
 src/SPIN/pair_spin_exchange.cpp             |  41 ++++----
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 107 ++++++--------------
 src/SPIN/pair_spin_neel.cpp                 |   8 +-
 5 files changed, 117 insertions(+), 123 deletions(-)

diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index c8fccaf409..a3ebe4f030 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -714,6 +714,39 @@ typedef tdual_virial_array::t_dev_um t_virial_array_um;
 typedef tdual_virial_array::t_dev_const_um t_virial_array_const_um;
 typedef tdual_virial_array::t_dev_const_randomread t_virial_array_randomread;
 
+// Spin Types
+
+//2d X_FLOAT array n*4
+#ifdef LMP_KOKKOS_NO_LEGACY
+typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_sp_array;
+#else
+typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutRight, LMPDeviceType> tdual_sp_array;
+#endif
+typedef tdual_sp_array::t_dev t_sp_array;
+typedef tdual_sp_array::t_dev_const t_sp_array_const;
+typedef tdual_sp_array::t_dev_um t_sp_array_um;
+typedef tdual_sp_array::t_dev_const_um t_sp_array_const_um;
+typedef tdual_sp_array::t_dev_const_randomread t_sp_array_randomread;
+
+//2d F_FLOAT array n*3
+
+typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_array;
+typedef tdual_fm_array::t_dev t_fm_array;
+typedef tdual_fm_array::t_dev_const t_fm_array_const;
+typedef tdual_fm_array::t_dev_um t_fm_array_um;
+typedef tdual_fm_array::t_dev_const_um t_fm_array_const_um;
+typedef tdual_fm_array::t_dev_const_randomread t_fm_array_randomread;
+
+//2d F_FLOAT array n*3
+
+typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_long_array;
+typedef tdual_fm_long_array::t_dev t_fm_long_array;
+typedef tdual_fm_long_array::t_dev_const t_fm_long_array_const;
+typedef tdual_fm_long_array::t_dev_um t_fm_long_array_um;
+typedef tdual_fm_long_array::t_dev_const_um t_fm_long_array_const_um;
+typedef tdual_fm_long_array::t_dev_const_randomread t_fm_long_array_randomread;
+
+
 //Energy Types
 //1d E_FLOAT array n
 
@@ -950,6 +983,33 @@ typedef tdual_virial_array::t_host_um t_virial_array_um;
 typedef tdual_virial_array::t_host_const_um t_virial_array_const_um;
 typedef tdual_virial_array::t_host_const_randomread t_virial_array_randomread;
 
+// Spin types
+
+//2d X_FLOAT array n*4 (same per-atom extent as the device-side sp typedef)
+typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutRight, LMPDeviceType> tdual_sp_array;
+typedef tdual_sp_array::t_host t_sp_array;
+typedef tdual_sp_array::t_host_const t_sp_array_const;
+typedef tdual_sp_array::t_host_um t_sp_array_um;
+typedef tdual_sp_array::t_host_const_um t_sp_array_const_um;
+typedef tdual_sp_array::t_host_const_randomread t_sp_array_randomread;
+
+//2d F_FLOAT array n*3
+typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_array;
+//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array;
+typedef tdual_fm_array::t_host t_fm_array;
+typedef tdual_fm_array::t_host_const t_fm_array_const;
+typedef tdual_fm_array::t_host_um t_fm_array_um;
+typedef tdual_fm_array::t_host_const_um t_fm_array_const_um;
+typedef tdual_fm_array::t_host_const_randomread t_fm_array_randomread;
+
+//2d F_FLOAT array n*3
+typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_long_array;
+//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array;
+typedef tdual_fm_long_array::t_host t_fm_long_array;
+typedef tdual_fm_long_array::t_host_const t_fm_long_array_const;
+typedef tdual_fm_long_array::t_host_um t_fm_long_array_um;
+typedef tdual_fm_long_array::t_host_const_um t_fm_long_array_const_um;
+typedef tdual_fm_long_array::t_host_const_randomread t_fm_long_array_randomread;
 
 
 //Energy Types
diff --git a/src/SPIN/compute_spin.cpp b/src/SPIN/compute_spin.cpp
index ca3c40e11a..5edfb04645 100644
--- a/src/SPIN/compute_spin.cpp
+++ b/src/SPIN/compute_spin.cpp
@@ -46,7 +46,6 @@ ComputeSpin::ComputeSpin(LAMMPS *lmp, int narg, char **arg) :
   if ((narg != 3) && (narg != 4)) error->all(FLERR,"Illegal compute compute/spin command");
 
   vector_flag = 1;
-  // size_vector = 7;
   size_vector = 6;
   extvector = 0;
 
@@ -160,8 +159,6 @@ void ComputeSpin::compute_vector()
 
   countsp = countsptot = 0.0;
   mag[0] = mag[1] = mag[2] = mag[3] = 0.0;
-  // m2 = m2tot = 0.0;
-  // m4 = m4tot = 0.0;
   magtot[0] = magtot[1] = magtot[2] = magtot[3] = 0.0;
   magenergy = magenergytot = 0.0;
   tempnum = tempnumtot = 0.0;
@@ -188,18 +185,6 @@ void ComputeSpin::compute_vector()
         mag[1] += sp[i][1];
         mag[2] += sp[i][2];
 
-        // compute second moment
-        
-        // m2 += sp[i][0]*sp[i][0];
-        // m2 += sp[i][1]*sp[i][1];
-        // m2 += sp[i][2]*sp[i][2];
-
-        // compute fourth moment
-        
-        // m4 += sp[i][0]*sp[i][0]*sp[i][0]*sp[i][0];
-        // m4 += sp[i][1]*sp[i][1]*sp[i][1]*sp[i][1];
-        // m4 += sp[i][2]*sp[i][2]*sp[i][2]*sp[i][2];
-
         // update magnetic precession energies
 
         if (precession_spin_flag) {
@@ -226,8 +211,6 @@ void ComputeSpin::compute_vector()
   }
 
   MPI_Allreduce(mag,magtot,4,MPI_DOUBLE,MPI_SUM,world);
-  // MPI_Allreduce(&m2,&m2tot,1,MPI_DOUBLE,MPI_SUM,world);
-  // MPI_Allreduce(&m4,&m4tot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&magenergy,&magenergytot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&tempnum,&tempnumtot,1,MPI_DOUBLE,MPI_SUM,world);
   MPI_Allreduce(&tempdenom,&tempdenomtot,1,MPI_DOUBLE,MPI_SUM,world);
@@ -246,19 +229,12 @@ void ComputeSpin::compute_vector()
   spintemperature = hbar*tempnumtot;
   spintemperature /= (2.0*kb*tempdenomtot);
 
-  // compute Binder cumulant
-
-  // m2tot *= scale;
-  // m4tot *= scale;
-  // binder = 1.0 - m4tot/(3.0*m2tot*m2tot);
-
   vector[0] = magtot[0];
   vector[1] = magtot[1];
   vector[2] = magtot[2];
   vector[3] = magtot[3];
   vector[4] = magenergytot;
   vector[5] = spintemperature;
-  // vector[6] = binder;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/SPIN/pair_spin_exchange.cpp b/src/SPIN/pair_spin_exchange.cpp
index b23f4fa0cb..611230c73e 100644
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@@ -240,28 +240,26 @@ void PairSpinExchange::compute(int eflag, int vflag)
           evdwl *= 0.5*hbar;
           emag[i] += evdwl;
         } else evdwl = 0.0;
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
+        }
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
+
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
       }
-
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      // if (eflag) {
-      //   evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-      //   evdwl *= 0.5*hbar;
-      //   emag[i] += evdwl;
-      // } else evdwl = 0.0;
-
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
-
 }
 
 /* ----------------------------------------------------------------------
@@ -389,9 +387,12 @@ void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq,
   Jex_mech *= 8.0*Jex*rr*exp(-ra);
   Jex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  fi[0] -= Jex_mech*eij[0];
-  fi[1] -= Jex_mech*eij[1];
-  fi[2] -= Jex_mech*eij[2];
+  fi[0] -= 0.5*Jex_mech*eij[0];
+  fi[1] -= 0.5*Jex_mech*eij[1];
+  fi[2] -= 0.5*Jex_mech*eij[2];
+  // fi[0] -= Jex_mech*eij[0];
+  // fi[1] -= Jex_mech*eij[1];
+  // fi[2] -= Jex_mech*eij[2];
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 61b3df70ce..cf351e6539 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -246,6 +246,7 @@ void PairSpinExchangeBiquadratic::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_exchange(i,j,rsq,fmi,spi,spj);
+        
         if (lattice_flag)
           compute_exchange_mech(i,j,rsq,eij,fi,spi,spj);
       
@@ -253,22 +254,26 @@ void PairSpinExchangeBiquadratic::compute(int eflag, int vflag)
           evdwl -= compute_energy(i,j,rsq,spi,spj);
           emag[i] += evdwl;
         } else evdwl = 0.0;
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
+        }
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
+
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
       }
-
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
-
 }
 
 /* ----------------------------------------------------------------------
@@ -363,20 +368,13 @@ void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
 {
   int *type = atom->type;
   int itype,jtype;
-  double Jex,Kex,ra,sdots;
-  double rj,rk,r2j,r2k,ir3j,ir3k;
+  double Jex,Kex,r2j,r2k,sdots;
   itype = type[i];
   jtype = type[j];
 
-  ra = sqrt(rsq);
-  rj = ra/J3[itype][jtype];
   r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
-  ir3j = 1.0/(rj*rj*rj);
-  rk = ra/K3[itype][jtype];
-  r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
-  ir3k = 1.0/(rk*rk*rk);
+  r2k = rsq/K3[itype][jtype]/K3[itype][jtype];  // biquadratic term scales with K3, as in compute_energy()
  
-  // BS model
   Jex = 4.0*J1_mag[itype][jtype]*r2j;
   Jex *= (1.0-J2[itype][jtype]*r2j);
   Jex *= exp(-r2j);
@@ -385,45 +383,27 @@ void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
   Kex *= (1.0-K2[itype][jtype]*r2k);
   Kex *= exp(-r2k);
   
-  // modified Yukawa
-  // Jex = (1.0-J2[itype][jtype]*r2j);
-  // Jex *= J1_mag[itype][jtype]*ir3j;
-  // Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
-  // 
-  // Kex = (1.0-K2[itype][jtype]*r2k);
-  // Kex *= K1_mag[itype][jtype]*ir3k;
-  // Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
- 
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  fmi[0] += Jex*spj[0] + 2.0*Kex*spj[0]*sdots;
-  fmi[1] += Jex*spj[1] + 2.0*Kex*spj[1]*sdots;
-  fmi[2] += Jex*spj[2] + 2.0*Kex*spj[2]*sdots;
+  fmi[0] += (Jex*spj[0] + 2.0*Kex*spj[0]*sdots);
+  fmi[1] += (Jex*spj[1] + 2.0*Kex*spj[1]*sdots);
+  fmi[2] += (Jex*spj[2] + 2.0*Kex*spj[2]*sdots);
 }
 
 /* ----------------------------------------------------------------------
    compute the mechanical force due to the exchange interaction between atom i and atom j
 ------------------------------------------------------------------------- */
 
-void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, double rsq, 
-    double eij[3], double fi[3],  double spi[3], double spj[3])
+void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, 
+    double rsq, double eij[3], double fi[3],  double spi[3], double spj[3])
 {
   int *type = atom->type;
   int itype,jtype;
   double Jex,Jex_mech,Kex,Kex_mech,ra,sdots;
-  // double rj,rk,r2j,r2k,ir3j,ir3k;
   double rja,rka,rjr,rkr,iJ3,iK3;
   itype = type[i];
   jtype = type[j];
 
-  // ra = sqrt(rsq);
-  // rj = ra/J3[itype][jtype];
-  // r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
-  // ir3j = 1.0/(rj*rj*rj);
-  // rk = ra/K3[itype][jtype];
-  // r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
-  // ir3k = 1.0/(rk*rk*rk);
-  
   Jex = J1_mech[itype][jtype];
   iJ3 = 1.0/(J3[itype][jtype]*J3[itype][jtype]);
   Kex = K1_mech[itype][jtype];
@@ -434,35 +414,22 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, double rsq
   rka = rsq*iK3;
   rkr = sqrt(rsq)*iK3;
  
-  // BS model
   Jex_mech = 1.0-rja-J2[itype][jtype]*rja*(2.0-rja);
   Jex_mech *= 8.0*Jex*rjr*exp(-rja);
-  Jex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  // Jex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
   
   Kex_mech = 1.0-rka-K2[itype][jtype]*rka*(2.0-rka);
   Kex_mech *= 8.0*Kex*rkr*exp(-rka);
-  Kex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  // Kex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  // modified Yukawa
-  // Jex_mech = J2[itype][jtype]*2.0*ra/(J3[itype][jtype]*J3[itype][jtype]);
-  // Jex_mech += (3.0/ra+1.0/J3[itype][jtype])*(1.0-J2[itype][jtype]*r2j);
-  // Jex_mech *= -J1_mech[itype][jtype]*ir3j;
-  // Jex_mech *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
-
-  // Kex_mech = K2[itype][jtype]*2.0*ra/(K3[itype][jtype]*K3[itype][jtype]);
-  // Kex_mech += (3.0/ra+1.0/K3[itype][jtype])*(1.0-K2[itype][jtype]*r2k);
-  // Kex_mech *= -K1_mech[itype][jtype]*ir3k;
-  // Kex_mech *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
-  
-  // sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  fi[0] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
-  fi[1] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[1];
-  fi[2] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[2];
-  // fi[0] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[0];
-  // fi[1] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[1];
-  // fi[2] += (Jex_mech*sdots + Kex_mech*sdots*sdots)*eij[2];
+  fi[0] -= 0.5*(Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
+  fi[1] -= 0.5*(Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[1];
+  fi[2] -= 0.5*(Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[2];
+  // fi[0] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
+  // fi[1] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[1];
+  // fi[2] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[2];
 }
 
 /* ----------------------------------------------------------------------
@@ -488,7 +455,6 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
   ir3k = 1.0/(rk*rk*rk);
  
-  // BS model 
   Jex = 4.0*J1_mech[itype][jtype]*r2j;
   Jex *= (1.0-J2[itype][jtype]*r2j);
   Jex *= exp(-r2j);
@@ -497,19 +463,10 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   Kex *= (1.0-K2[itype][jtype]*r2k);
   Kex *= exp(-r2k);
 
-  // modified Yukawa
-  // Jex = (1.0-J2[itype][jtype]*r2j);
-  // Jex *= J1_mech[itype][jtype]*ir3j;
-  // Jex *= exp((J3[itype][jtype]-ra)/J3[itype][jtype]);
-  // 
-  // Kex = (1.0-K2[itype][jtype]*r2k);
-  // Kex *= K1_mech[itype][jtype]*ir3k;
-  // Kex *= exp((K3[itype][jtype]-ra)/K3[itype][jtype]);
-
-  // sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
 
   energy = 0.5*(Jex*(sdots-1.0) + Kex*(sdots*sdots-1.0));
+  // energy = 0.5*(Jex*(sdots) + Kex*(sdots*sdots-1.0));
   return energy;
 }
 
diff --git a/src/SPIN/pair_spin_neel.cpp b/src/SPIN/pair_spin_neel.cpp
index 4fd8ecc215..fc7cb6ab9a 100644
--- a/src/SPIN/pair_spin_neel.cpp
+++ b/src/SPIN/pair_spin_neel.cpp
@@ -262,8 +262,8 @@ void PairSpinNeel::compute(int eflag, int vflag)
       fm[i][2] += fmi[2];
 
       if (eflag) {
-        evdwl = compute_neel_energy(i,j,rsq,eij,spi,spj);
-        evdwl *= 0.5*hbar;
+        evdwl -= compute_neel_energy(i,j,rsq,eij,spi,spj);
+        // evdwl *= 0.5*hbar;
         emag[i] += evdwl;
       } else evdwl = 0.0;
 
@@ -588,12 +588,12 @@ double PairSpinNeel::compute_neel_energy(int i, int j, double rsq, double eij[3]
   // compute Neel's functions
 
   ra = rsq/g3[itype][jtype]/g3[itype][jtype];
-  gr = 4.0*g1[itype][jtype]*ra;
+  gr = 4.0*g1_mech[itype][jtype]*ra;
   gr *= (1.0-g2[itype][jtype]*ra);
   gr *= exp(-ra);
 
   ra = rsq/q3[itype][jtype]/q3[itype][jtype];
-  qr = 4.0*q1[itype][jtype]*ra;
+  qr = 4.0*q1_mech[itype][jtype]*ra;
   qr *= (1.0-q2[itype][jtype]*ra);
   qr *= exp(-ra);
 

From f0729551ae3798edccd44521cbf015e3d5d19fb7 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 28 Sep 2020 16:54:10 -0600
Subject: [PATCH 07/64] adding for good new kokkos spin style

---
 src/KOKKOS/atom_vec_spin_kokkos.cpp | 1297 +++++++++++++++++++++++++++
 src/KOKKOS/atom_vec_spin_kokkos.h   |  132 +++
 2 files changed, 1429 insertions(+)
 create mode 100644 src/KOKKOS/atom_vec_spin_kokkos.cpp
 create mode 100644 src/KOKKOS/atom_vec_spin_kokkos.h

diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp
new file mode 100644
index 0000000000..8a7dd3317c
--- /dev/null
+++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp
@@ -0,0 +1,1297 @@
+/* ----------------------------------------------------------------------
+
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+
+------------------------------------------------------------------------- */
+
+/* ------------------------------------------------------------------------
+   Contributing authors: Julien Tranchida (SNL)
+                         Aidan Thompson (SNL)
+
+   Please cite the related publication:
+   Tranchida, J., Plimpton, S. J., Thibaudeau, P., & Thompson, A. P. (2018).
+   Massively parallel symplectic algorithm for coupled magnetic spin dynamics
+   and molecular dynamics. Journal of Computational Physics.
+------------------------------------------------------------------------- */
+
+#include "atom_vec_spin_kokkos.h"
+#include <cmath>
+#include <cstring>
+#include "atom_kokkos.h"
+#include "comm_kokkos.h"
+#include "domain.h"
+#include "error.h"
+#include "fix.h"
+#include "memory_kokkos.h"
+#include "modify.h"
+#include "utils.h"
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+AtomVecSpinKokkos::AtomVecSpinKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
+{
+  molecular = 0;
+  mass_type = 1;
+  forceclearflag = 1;
+
+  comm_x_only = comm_f_only = 0;
+  size_forward = 7;     // equals the 7 values per atom written by AtomVecSpinKokkos_PackComm (x[3] + sp[4])
+  size_reverse = 9;     // NOTE(review): presumably f[3] + fm[3] + fm_long[3] - confirm against the reverse-comm pack
+  size_border = 10;
+  size_velocity = 3;
+  size_data_atom = 9;
+  size_data_vel = 4;
+  xcol_data = 4;
+
+  atom->sp_flag = 1;    // ComputeSpin tests this flag before reading atom->sp
+  
+  k_count = DAT::tdual_int_1d("atom::k_count",1);
+  atomKK = (AtomKokkos *) atom;   // same objects as atom/comm, viewed through their Kokkos subclasses
+  commKK = (CommKokkos *) comm;
+}
+
+/* ----------------------------------------------------------------------
+   grow atom arrays (tag, type, mask, image, x, v, f, sp, fm, fm_long)
+   n = 0 grows arrays by a chunk of MAX(DELTA, 1% of the current nmax)
+   n > 0 allocates arrays to size n
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::grow(int n)
+{
+  int step = MAX(DELTA,nmax*0.01);
+  if (n == 0) nmax += step;
+  else nmax = n;
+  atomKK->nmax = nmax;
+  if (nmax < 0 || nmax > MAXSMALLINT)   // also catches a negative n passed by the caller
+    error->one(FLERR,"Per-processor system is too big");
+
+  atomKK->sync(Device,ALL_MASK);
+  atomKK->modified(Device,ALL_MASK);
+
+  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
+  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
+  memoryKK->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
+  memoryKK->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
+   
+  // allocate mechanical quantities: positions, velocities, forces
+
+  memoryKK->grow_kokkos(atomKK->k_x,atomKK->x,nmax,"atom:x");
+  memoryKK->grow_kokkos(atomKK->k_v,atomKK->v,nmax,"atom:v");
+  memoryKK->grow_kokkos(atomKK->k_f,atomKK->f,nmax,"atom:f");
+  
+  // allocate magnetic quantities: sp, fm and fm_long
+
+  memoryKK->grow_kokkos(atomKK->k_sp,atomKK->sp,nmax,"atom:sp");
+  memoryKK->grow_kokkos(atomKK->k_fm,atomKK->fm,nmax,"atom:fm");
+  memoryKK->grow_kokkos(atomKK->k_fm_long,atomKK->fm_long,nmax,"atom:fm_long");
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);   // fixes listed in extra_grow resize their own arrays to nmax
+}
+
+/* ----------------------------------------------------------------------
+   reset local array ptrs: for each per-atom array, re-read the raw pointer and the d_/h_ views from atomKK
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::grow_reset()
+{
+  tag = atomKK->tag;
+  d_tag = atomKK->k_tag.d_view;
+  h_tag = atomKK->k_tag.h_view;
+
+  type = atomKK->type;
+  d_type = atomKK->k_type.d_view;
+  h_type = atomKK->k_type.h_view;
+  mask = atomKK->mask;
+  d_mask = atomKK->k_mask.d_view;
+  h_mask = atomKK->k_mask.h_view;
+  image = atomKK->image;
+  d_image = atomKK->k_image.d_view;
+  h_image = atomKK->k_image.h_view;
+  
+  x = atomKK->x;
+  d_x = atomKK->k_x.d_view;
+  h_x = atomKK->k_x.h_view;
+  v = atomKK->v;
+  d_v = atomKK->k_v.d_view;
+  h_v = atomKK->k_v.h_view;
+  f = atomKK->f;
+  d_f = atomKK->k_f.d_view;
+  h_f = atomKK->k_f.h_view;
+  
+  sp = atomKK->sp; 
+  d_sp = atomKK->k_sp.d_view;
+  h_sp = atomKK->k_sp.h_view;
+  fm = atom->fm;   // read through atom, not atomKK; the constructor sets atomKK from the same atom pointer
+  d_fm = atomKK->k_fm.d_view;
+  h_fm = atomKK->k_fm.h_view;
+  fm_long = atom->fm_long;   // likewise read through atom
+  d_fm_long = atomKK->k_fm_long.d_view;
+  h_fm_long = atomKK->k_fm_long.h_view;
+}
+
+/* ----------------------------------------------------------------------
+   copy atom I info to atom J: tag, type, mask, image, x, v and the four sp entries (f, fm, fm_long are not copied)
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::copy(int i, int j, int delflag)
+{
+  h_tag[j] = h_tag[i];
+  h_type[j] = h_type[i];
+  mask[j] = mask[i];
+  h_image[j] = h_image[i];
+  h_x(j,0) = h_x(i,0);
+  h_x(j,1) = h_x(i,1);
+  h_x(j,2) = h_x(i,2);
+  h_v(j,0) = h_v(i,0);
+  h_v(j,1) = h_v(i,1);
+  h_v(j,2) = h_v(i,2);
+
+  h_sp(j,0) = h_sp(i,0);
+  h_sp(j,1) = h_sp(i,1);
+  h_sp(j,2) = h_sp(i,2);
+  h_sp(j,3) = h_sp(i,3);
+
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);   // fixes listed in extra_grow copy their own per-atom data
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG,int TRICLINIC>
+struct AtomVecSpinKokkos_PackComm {   // functor: for each listed atom, writes x[0..2] and sp[0..3] into one row of _buf
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  typename ArrayTypes<DeviceType>::t_sp_array_randomread _sp;
+  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
+  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
+  X_FLOAT _pbc[6];
+  
+  AtomVecSpinKokkos_PackComm(
+      const typename DAT::tdual_x_array &x,
+      const typename DAT::tdual_sp_array &sp,
+      const typename DAT::tdual_xfloat_2d &buf,
+      const typename DAT::tdual_int_2d &list,
+      const int & iswap,
+      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
+      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
+      _x(x.view<DeviceType>()),_sp(sp.view<DeviceType>()),
+      _list(list.view<DeviceType>()),_iswap(iswap),
+      _xprd(xprd),_yprd(yprd),_zprd(zprd),
+      _xy(xy),_xz(xz),_yz(yz) {
+        // every packed atom occupies 7 buffer entries (x[0..2] then sp[0..3]), so the row count is capacity/7
+        const size_t elements = 7;
+        const size_t maxsend = (buf.view<DeviceType>().extent(0)*buf.view<DeviceType>().extent(1))/elements;
+        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
+        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
+        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+        const int j = _list(_iswap,i);   // atom index taken from row _iswap of the send list
+      if (PBC_FLAG == 0) {               // no periodic shift: coordinates are copied unchanged
+          _buf(i,0) = _x(j,0);
+          _buf(i,1) = _x(j,1);
+          _buf(i,2) = _x(j,2);
+          _buf(i,3) = _sp(j,0);
+          _buf(i,4) = _sp(j,1);
+          _buf(i,5) = _sp(j,2);
+          _buf(i,6) = _sp(j,3);
+      } else {
+        if (TRICLINIC == 0) {            // shift each coordinate by _pbc[k] times the matching box length
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+          _buf(i,3) = _sp(j,0);
+          _buf(i,4) = _sp(j,1);
+          _buf(i,5) = _sp(j,2);
+          _buf(i,6) = _sp(j,3);
+        } else {                         // triclinic: the shift also includes the _xy, _xz and _yz terms
+          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
+          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
+          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
+          _buf(i,3) = _sp(j,0);
+          _buf(i,4) = _sp(j,1);
+          _buf(i,5) = _sp(j,2);
+          _buf(i,6) = _sp(j,3);
+        }
+      }
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType,int PBC_FLAG>
+struct AtomVecSpinKokkos_PackBorder {
+  // Functor: row i of _buf receives 10 values for the atom named by send
+  // list entry (_iswap,i): x,y,z (offset by _dx,_dy,_dz when PBC_FLAG != 0),
+  // tag, type and mask passed through d_ubuf, then the 4 sp columns.
+  typedef DeviceType device_type;
+
+  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
+  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
+  const int _iswap;
+  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
+  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  const typename ArrayTypes<DeviceType>::t_int_1d _type;
+  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  const typename ArrayTypes<DeviceType>::t_sp_array_randomread _sp;
+  X_FLOAT _dx,_dy,_dz;
+
+  AtomVecSpinKokkos_PackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
+      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
+      const int & iswap,
+      const typename ArrayTypes<DeviceType>::t_x_array &x,
+      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      const typename ArrayTypes<DeviceType>::t_int_1d &type,
+      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      const typename ArrayTypes<DeviceType>::t_sp_array &sp,
+      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
+  _buf(buf),_list(list),_iswap(iswap),
+    _x(x),_tag(tag),_type(type),_mask(mask),_sp(sp),
+    _dx(dx),_dy(dy),_dz(dz) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+    const int j = _list(_iswap,i);
+
+    // only the position differs between the two PBC_FLAG variants
+    if (PBC_FLAG == 0) {
+      _buf(i,0) = _x(j,0);
+      _buf(i,1) = _x(j,1);
+      _buf(i,2) = _x(j,2);
+    } else {
+      _buf(i,0) = _x(j,0) + _dx;
+      _buf(i,1) = _x(j,1) + _dy;
+      _buf(i,2) = _x(j,2) + _dz;
+    }
+
+    // integer fields are stored in the buffer through d_ubuf's .d member
+    _buf(i,3) = d_ubuf(_tag(j)).d;
+    _buf(i,4) = d_ubuf(_type(j)).d;
+    _buf(i,5) = d_ubuf(_mask(j)).d;
+    _buf(i,6) = _sp(j,0);
+    _buf(i,7) = _sp(j,1);
+    _buf(i,8) = _sp(j,2);
+    _buf(i,9) = _sp(j,3);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
+                               int pbc_flag, int *pbc, ExecutionSpace space)
+{
+  // Packs the first n atoms of send list iswap with AtomVecSpinKokkos_PackBorder
+  // on the views of the requested space; returns n*size_border.
+  X_FLOAT dx = 0, dy = 0, dz = 0;
+  const bool shifted = (pbc_flag != 0);
+  // offset: pbc flags times the box lengths, or the bare flags if triclinic
+  if (shifted) {
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+  }
+
+  if (space == Host) {
+    if (shifted) {
+      AtomVecSpinKokkos_PackBorder<LMPHostType,1> f(buf.view<LMPHostType>(),
+        k_sendlist.view<LMPHostType>(),iswap,h_x,h_tag,h_type,h_mask,h_sp,dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    } else {
+      AtomVecSpinKokkos_PackBorder<LMPHostType,0> f(buf.view<LMPHostType>(),
+        k_sendlist.view<LMPHostType>(),iswap,h_x,h_tag,h_type,h_mask,h_sp,dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    }
+  } else {
+    if (shifted) {
+      AtomVecSpinKokkos_PackBorder<LMPDeviceType,1> f(buf.view<LMPDeviceType>(),
+        k_sendlist.view<LMPDeviceType>(),iswap,d_x,d_tag,d_type,d_mask,d_sp,dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    } else {
+      AtomVecSpinKokkos_PackBorder<LMPDeviceType,0> f(buf.view<LMPDeviceType>(),
+        k_sendlist.view<LMPDeviceType>(),iswap,d_x,d_tag,d_type,d_mask,d_sp,dx,dy,dz);
+      Kokkos::parallel_for(n,f);
+    }
+  }
+  return n*size_border;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::pack_border(int n, int *list, double *buf,
+                               int pbc_flag, int *pbc)
+{
+  // Host-side border pack. Each atom list[k] contributes 10 doubles:
+  // x,y,z (shifted when pbc_flag is set), tag, type, mask (each stored
+  // through ubuf) and the 4 sp columns. Fixes listed in atom->extra_border
+  // then append their own data. Returns the number of doubles written.
+  int m = 0;
+
+  if (pbc_flag == 0) {
+    for (int k = 0; k < n; k++) {
+      const int j = list[k];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      buf[m++] = h_sp(j,0);
+      buf[m++] = h_sp(j,1);
+      buf[m++] = h_sp(j,2);
+      buf[m++] = h_sp(j,3);
+    }
+  } else {
+    // orthogonal box: pbc flags times the box lengths; otherwise the bare
+    // pbc flags are used as the offset
+    const bool ortho = (domain->triclinic == 0);
+    const double dx = ortho ? pbc[0]*domain->xprd : pbc[0];
+    const double dy = ortho ? pbc[1]*domain->yprd : pbc[1];
+    const double dz = ortho ? pbc[2]*domain->zprd : pbc[2];
+
+    for (int k = 0; k < n; k++) {
+      const int j = list[k];
+      buf[m++] = h_x(j,0) + dx;
+      buf[m++] = h_x(j,1) + dy;
+      buf[m++] = h_x(j,2) + dz;
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      buf[m++] = h_sp(j,0);
+      buf[m++] = h_sp(j,1);
+      buf[m++] = h_sp(j,2);
+      buf[m++] = h_sp(j,3);
+    }
+  }
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::pack_border_vel(int n, int *list, double *buf,
+                                   int pbc_flag, int *pbc)
+{
+  // Host-side border pack including velocities: 13 doubles per atom list[i]
+  // in the order x,y,z, tag, type, mask, sp(0..3), vx,vy,vz, followed by
+  // the data of the fixes listed in atom->extra_border.
+  // Returns the number of doubles written to buf.
+  int i,j,m;
+  double dx,dy,dz,dvx,dvy,dvz;
+
+  m = 0;
+  if (pbc_flag == 0) {
+    for (i = 0; i < n; i++) {
+      j = list[i];
+      buf[m++] = h_x(j,0);
+      buf[m++] = h_x(j,1);
+      buf[m++] = h_x(j,2);
+      buf[m++] = ubuf(h_tag(j)).d;
+      buf[m++] = ubuf(h_type(j)).d;
+      buf[m++] = ubuf(h_mask(j)).d;
+      for (int k = 0; k < 4; k++) buf[m++] = h_sp(j,k);
+      buf[m++] = h_v(j,0);
+      buf[m++] = h_v(j,1);
+      buf[m++] = h_v(j,2);
+    }
+  } else {
+    // position offset applied to every packed atom
+    if (domain->triclinic == 0) {
+      dx = pbc[0]*domain->xprd;
+      dy = pbc[1]*domain->yprd;
+      dz = pbc[2]*domain->zprd;
+    } else {
+      dx = pbc[0];
+      dy = pbc[1];
+      dz = pbc[2];
+    }
+    if (!deform_vremap) {
+      // no velocity remap: velocities are packed unchanged
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag(j)).d;
+        buf[m++] = ubuf(h_type(j)).d;
+        buf[m++] = ubuf(h_mask(j)).d;
+        for (int k = 0; k < 4; k++) buf[m++] = h_sp(j,k);
+        buf[m++] = h_v(j,0);
+        buf[m++] = h_v(j,1);
+        buf[m++] = h_v(j,2);
+      }
+    } else {
+      // velocity offset built from the pbc flags and h_rate
+      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
+      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
+      dvz = pbc[2]*h_rate[2];
+      for (i = 0; i < n; i++) {
+        j = list[i];
+        buf[m++] = h_x(j,0) + dx;
+        buf[m++] = h_x(j,1) + dy;
+        buf[m++] = h_x(j,2) + dz;
+        buf[m++] = ubuf(h_tag(j)).d;
+        buf[m++] = ubuf(h_type(j)).d;
+        buf[m++] = ubuf(h_mask(j)).d;
+        for (int k = 0; k < 4; k++) buf[m++] = h_sp(j,k);
+        // group membership must be tested on the packed atom j; the loop
+        // counter i is only a position in the send list
+        if (h_mask(j) & deform_groupbit) {
+          buf[m++] = h_v(j,0) + dvx;
+          buf[m++] = h_v(j,1) + dvy;
+          buf[m++] = h_v(j,2) + dvz;
+        } else {
+          buf[m++] = h_v(j,0);
+          buf[m++] = h_v(j,1);
+          buf[m++] = h_v(j,2);
+        }
+      }
+    }
+  }
+
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
+
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::pack_border_hybrid(int n, int *list, double *buf)
+{
+  // Writes the 4 sp columns of each atom list[0..n-1] to buf, 4 doubles
+  // per atom, and returns the number of doubles written.
+  int count = 0;
+
+  for (int k = 0; k < n; k++) {
+    const int atomidx = list[k];
+    for (int c = 0; c < 4; c++)
+      buf[count++] = h_sp(atomidx,c);
+  }
+
+  return count;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSpinKokkos_UnpackBorder {
+  // Functor: row i of _buf is unpacked into atom i+_first as x,y,z, tag,
+  // type, mask (read back through d_ubuf) and the 4 sp columns.
+  typedef DeviceType device_type;
+
+  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
+  typename ArrayTypes<DeviceType>::t_x_array _x;
+  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
+  typename ArrayTypes<DeviceType>::t_int_1d _type;
+  typename ArrayTypes<DeviceType>::t_int_1d _mask;
+  typename ArrayTypes<DeviceType>::t_sp_array _sp;
+  int _first;
+
+  AtomVecSpinKokkos_UnpackBorder(
+      const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
+      typename ArrayTypes<DeviceType>::t_x_array &x,
+      typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
+      typename ArrayTypes<DeviceType>::t_int_1d &type,
+      typename ArrayTypes<DeviceType>::t_int_1d &mask,
+      typename ArrayTypes<DeviceType>::t_sp_array &sp,
+      const int& first):
+    _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),_sp(sp),_first(first) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i) const {
+      _x(i+_first,0) = _buf(i,0);
+      _x(i+_first,1) = _buf(i,1);
+      _x(i+_first,2) = _buf(i,2);
+      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
+      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
+      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
+      _sp(i+_first,0) = _buf(i,6);  // sp is 2-d: buffer slots 6..9 -> columns 0..3
+      _sp(i+_first,1) = _buf(i,7);
+      _sp(i+_first,2) = _buf(i,8);
+      _sp(i+_first,3) = _buf(i,9);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::unpack_border_kokkos(const int &n, const int &first,
+                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
+  // grow first when atoms first..first+n-1 would reach nmax
+  if (first+n >= nmax) grow(first+n+100);
+  if (space != Host) {
+    AtomVecSpinKokkos_UnpackBorder<LMPDeviceType>
+      f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_sp,first);
+    Kokkos::parallel_for(n,f);
+  } else {
+    AtomVecSpinKokkos_UnpackBorder<LMPHostType>
+      f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_sp,first);
+    Kokkos::parallel_for(n,f);
+  }
+  // mark the unpacked fields as modified in the space that wrote them
+  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|SP_MASK);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::unpack_border(int n, int first, double *buf)
+{
+  // Host-side border unpack for atoms first..first+n-1, reading the layout
+  // written by pack_border. Each atom takes 10 doubles from buf: x,y,z,
+  // then tag, type, mask (read back through ubuf), then the 4 sp columns.
+  // The rest of buf is handed to the fixes listed in atom->extra_border.
+  int m = 0;
+  const int last = first + n;
+
+  for (int i = first; i < last; i++) {
+    if (i == nmax) grow(0);
+    // issued on every pass, after a possible grow(); order kept as is
+    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|SP_MASK);
+
+    for (int c = 0; c < 3; c++) h_x(i,c) = buf[m++];
+    // integer fields were stored through ubuf; .i reads them back
+    h_tag(i) =  (tagint)  ubuf(buf[m++]).i;
+    h_type(i) = (int) ubuf(buf[m++]).i;
+    h_mask(i) = (int) ubuf(buf[m++]).i;
+    for (int c = 0; c < 4; c++) h_sp(i,c) = buf[m++];
+  }
+
+  // each fix continues reading where the previous consumer stopped
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->
+        unpack_border(n,first,&buf[m]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::unpack_border_vel(int n, int first, double *buf)
+{
+  // Host-side border unpack with velocities for atoms first..first+n-1.
+  // Each atom takes 13 doubles from buf: x,y,z, tag, type, mask (through
+  // ubuf), the 4 sp columns, then vx,vy,vz. The rest of buf is handed to
+  // the fixes listed in atom->extra_border.
+  int m = 0;
+  const int last = first + n;
+
+  for (int i = first; i < last; i++) {
+    if (i == nmax) grow(0);
+    // issued on every pass, after a possible grow(); order kept as is
+    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|SP_MASK);
+
+    for (int c = 0; c < 3; c++) h_x(i,c) = buf[m++];
+    // integer fields were stored through ubuf; .i reads them back
+    h_tag(i) =  (tagint)  ubuf(buf[m++]).i;
+    h_type(i) = (int) ubuf(buf[m++]).i;
+    h_mask(i) = (int) ubuf(buf[m++]).i;
+    for (int c = 0; c < 4; c++) h_sp(i,c) = buf[m++];
+    for (int c = 0; c < 3; c++) h_v(i,c) = buf[m++];
+  }
+
+  // each fix continues reading where the previous consumer stopped
+  if (atom->nextra_border)
+    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
+      m += modify->fix[atom->extra_border[iextra]]->
+        unpack_border(n,first,&buf[m]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::unpack_border_hybrid(int n, int first, double *buf)
+{
+  // Reads 4 doubles per atom from buf into the 4 sp columns of atoms
+  // first..first+n-1; returns the number of doubles consumed.
+  int m = 0;
+  for (int i = first; i < first + n; i++) {
+    h_sp(i,0) = buf[m++];
+    h_sp(i,1) = buf[m++];
+    h_sp(i,2) = buf[m++];
+    h_sp(i,3) = buf[m++];
+  }
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSpinKokkos_PackExchangeFunctor {
+  // Functor for atom exchange: entry mysend packs atom _sendlist(mysend)
+  // into row mysend of _buf (15 values). When _copylist(mysend) is
+  // non-negative, the packed atom's slot is then overwritten with the data
+  // of atom _copylist(mysend).
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  // read-only views used as the source of every copy
+  typename AT::t_x_array_randomread _x;
+  typename AT::t_v_array_randomread _v;
+  typename AT::t_tagint_1d_randomread _tag;
+  typename AT::t_int_1d_randomread _type;
+  typename AT::t_int_1d_randomread _mask;
+  typename AT::t_imageint_1d_randomread _image;
+  typename AT::t_sp_array_randomread _sp;
+  // writable views built from the same atom arrays
+  typename AT::t_x_array _xw;
+  typename AT::t_v_array _vw;
+  typename AT::t_tagint_1d _tagw;
+  typename AT::t_int_1d _typew;
+  typename AT::t_int_1d _maskw;
+  typename AT::t_imageint_1d _imagew;
+  typename AT::t_sp_array _spw;
+
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d_const _sendlist;
+  typename AT::t_int_1d_const _copylist;
+  int _nlocal,_dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecSpinKokkos_PackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d sendlist,
+      typename AT::tdual_int_1d copylist,int nlocal, int dim,
+                X_FLOAT lo, X_FLOAT hi):
+    _x(atom->k_x.view<DeviceType>()),
+    _v(atom->k_v.view<DeviceType>()),
+    _tag(atom->k_tag.view<DeviceType>()),
+    _type(atom->k_type.view<DeviceType>()),
+    _mask(atom->k_mask.view<DeviceType>()),
+    _image(atom->k_image.view<DeviceType>()),
+    _sp(atom->k_sp.view<DeviceType>()),
+    _xw(atom->k_x.view<DeviceType>()),
+    _vw(atom->k_v.view<DeviceType>()),
+    _tagw(atom->k_tag.view<DeviceType>()),
+    _typew(atom->k_type.view<DeviceType>()),
+    _maskw(atom->k_mask.view<DeviceType>()),
+    _imagew(atom->k_image.view<DeviceType>()),
+    _spw(atom->k_sp.view<DeviceType>()),
+    _sendlist(sendlist.template view<DeviceType>()),
+    _copylist(copylist.template view<DeviceType>()),
+    _nlocal(nlocal),_dim(dim),
+    _lo(lo),_hi(hi){
+    // buf is addressed as rows of 15 values, one row per exchanged atom
+    const size_t elements = 15;
+    const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
+                             buf.template view<DeviceType>().extent(1))/elements;
+
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &mysend) const {
+    // row layout: [0] the value 15, [1..3] x, [4..6] v, [7] tag, [8] type,
+    // [9] mask, [10] image, [11..14] sp
+    const int leaving = _sendlist(mysend);
+    _buf(mysend,0) = 15;
+    for (int c = 0; c < 3; c++) {
+      _buf(mysend,1+c) = _x(leaving,c);
+      _buf(mysend,4+c) = _v(leaving,c);
+    }
+    _buf(mysend,7) = d_ubuf(_tag[leaving]).d;
+    _buf(mysend,8) = d_ubuf(_type[leaving]).d;
+    _buf(mysend,9) = d_ubuf(_mask[leaving]).d;
+    _buf(mysend,10) = d_ubuf(_image[leaving]).d;
+    for (int c = 0; c < 4; c++) _buf(mysend,11+c) = _sp(leaving,c);
+
+    // a non-negative copylist entry names the atom copied into slot leaving
+    const int filler = _copylist(mysend);
+    if (filler < 0) return;
+
+    for (int c = 0; c < 3; c++) {
+      _xw(leaving,c) = _x(filler,c);
+      _vw(leaving,c) = _v(filler,c);
+    }
+    _tagw(leaving) = _tag(filler);
+    _typew(leaving) = _type(filler);
+    _maskw(leaving) = _mask(filler);
+    _imagew(leaving) = _image(filler);
+    for (int c = 0; c < 4; c++) _spw(leaving,c) = _sp(filler,c);
+  }
+};
+  
+/* ---------------------------------------------------------------------- */
+  
+int AtomVecSpinKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf,
+                                              DAT::tdual_int_1d k_sendlist,
+                                              DAT::tdual_int_1d k_copylist,
+                                              ExecutionSpace space,int dim,
+                                              X_FLOAT lo,X_FLOAT hi )
+{
+  // each exchanged atom takes 15 values; when nsend atoms exceed what k_buf
+  // holds, resize its row count while keeping the current row width
+  if (nsend > (int) (k_buf.view<LMPHostType>().extent(0)*k_buf.view<LMPHostType>().extent(1))/15) {
+    int newsize = nsend*15/k_buf.view<LMPHostType>().extent(1)+1;
+    k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
+  }
+
+  if (space != Host) {
+    AtomVecSpinKokkos_PackExchangeFunctor<LMPDeviceType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+  } else {
+    AtomVecSpinKokkos_PackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
+    Kokkos::parallel_for(nsend,f);
+  }
+  return nsend*15;
+}
+
+/* ---------------------------------------------------------------------- */
+  
+int AtomVecSpinKokkos::pack_exchange(int i, double *buf)
+{
+  // Host-side exchange pack of atom i: buf[0] holds the total length,
+  // followed by x, v, tag, type, mask, image, the 4 sp columns and whatever
+  // the fixes in atom->extra_grow append. Returns that total length.
+  int m = 1;
+
+  for (int c = 0; c < 3; c++) buf[m++] = h_x(i,c);
+  for (int c = 0; c < 3; c++) buf[m++] = h_v(i,c);
+  // integer fields are stored through ubuf
+  buf[m++] = ubuf(h_tag(i)).d;
+  buf[m++] = ubuf(h_type(i)).d;
+  buf[m++] = ubuf(h_mask(i)).d;
+  buf[m++] = ubuf(h_image(i)).d;
+  for (int c = 0; c < 4; c++) buf[m++] = h_sp(i,c);
+
+  // every fix writes at the current end of the record
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
+
+  // slot 0 is written last, once the length is known
+  buf[0] = m;
+  return m;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+struct AtomVecSpinKokkos_UnpackExchangeFunctor {
+  // Functor for atom exchange: row myrecv of _buf is accepted when its
+  // coordinate along _dim lies in [_lo,_hi); an accepted row is stored at
+  // the index returned by Kokkos::atomic_fetch_add on _nlocal(0).
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+  typename AT::t_x_array _x;
+  typename AT::t_v_array _v;
+  typename AT::t_tagint_1d _tag;
+  typename AT::t_int_1d _type;
+  typename AT::t_int_1d _mask;
+  typename AT::t_imageint_1d _image;
+  typename AT::t_sp_array _sp;
+  typename AT::t_xfloat_2d_um _buf;
+  typename AT::t_int_1d _nlocal;
+  int _dim;
+  X_FLOAT _lo,_hi;
+
+  AtomVecSpinKokkos_UnpackExchangeFunctor(
+      const AtomKokkos* atom,
+      const typename AT::tdual_xfloat_2d buf,
+      typename AT::tdual_int_1d nlocal,
+      int dim, X_FLOAT lo, X_FLOAT hi):
+    _x(atom->k_x.view<DeviceType>()),
+    _v(atom->k_v.view<DeviceType>()),
+    _tag(atom->k_tag.view<DeviceType>()),
+    _type(atom->k_type.view<DeviceType>()),
+    _mask(atom->k_mask.view<DeviceType>()),
+    _image(atom->k_image.view<DeviceType>()),
+    _sp(atom->k_sp.view<DeviceType>()),
+    _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
+    _lo(lo),_hi(hi){
+    const size_t elements = 15;
+    const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/elements;
+
+    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int &myrecv) const {
+    // slots 1..3 of a row hold x,y,z, so _dim+1 selects the tested coordinate
+    const X_FLOAT coord = _buf(myrecv,_dim+1);
+    if (!(coord >= _lo && coord < _hi)) return;
+
+    // claim the next free atom index; the counter is shared by all rows
+    const int slot = Kokkos::atomic_fetch_add(&_nlocal(0),1);
+    for (int c = 0; c < 3; c++) {
+      _x(slot,c) = _buf(myrecv,1+c);
+      _v(slot,c) = _buf(myrecv,4+c);
+    }
+    _tag[slot] = (tagint) d_ubuf(_buf(myrecv,7)).i;
+    _type[slot] = (int) d_ubuf(_buf(myrecv,8)).i;
+    _mask[slot] = (int) d_ubuf(_buf(myrecv,9)).i;
+    _image[slot] = (imageint) d_ubuf(_buf(myrecv,10)).i;
+    for (int c = 0; c < 4; c++) _sp(slot,c) = _buf(myrecv,11+c);
+  }
+};
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,
+                                                int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,
+                                                ExecutionSpace space) {
+  // k_count(0) starts at nlocal and is incremented by the functor for each
+  // accepted atom; its final host value is returned
+  k_count.h_view(0) = nlocal;
+  if (space == Host) {
+    AtomVecSpinKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/15,f);
+  } else {
+    // push the counter to the device, run there, then pull it back
+    k_count.modify<LMPHostType>();
+    k_count.sync<LMPDeviceType>();
+    AtomVecSpinKokkos_UnpackExchangeFunctor<LMPDeviceType>
+      f(atomKK,k_buf,k_count,dim,lo,hi);
+    Kokkos::parallel_for(nrecv/15,f);
+    k_count.modify<LMPDeviceType>();
+    k_count.sync<LMPHostType>();
+  }
+  return k_count.h_view(0);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::unpack_exchange(double *buf)
+{
+  // Host-side exchange unpack: stores one atom at index atom->nlocal from
+  // buf[1..] (x, v, tag, type, mask, image, 4 sp columns), lets the fixes in
+  // atom->extra_grow read their part, then increments atom->nlocal.
+  // Returns the buffer index just past the values read.
+  const int inew = atom->nlocal;
+  if (inew == nmax) grow(0);
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK | SP_MASK);
+
+  // reading starts at slot 1; buf[0] is not read here
+  int m = 1;
+  for (int c = 0; c < 3; c++) h_x(inew,c) = buf[m++];
+  for (int c = 0; c < 3; c++) h_v(inew,c) = buf[m++];
+  // integer fields were stored through ubuf; .i reads them back
+  h_tag(inew) = (tagint) ubuf(buf[m++]).i;
+  h_type(inew) = (int) ubuf(buf[m++]).i;
+  h_mask(inew) = (int) ubuf(buf[m++]).i;
+  h_image(inew) = (imageint) ubuf(buf[m++]).i;
+  for (int c = 0; c < 4; c++) h_sp(inew,c) = buf[m++];
+
+  // each fix continues reading where the previous consumer stopped
+  if (atom->nextra_grow)
+    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
+      m += modify->fix[atom->extra_grow[iextra]]->
+        unpack_exchange(inew,&buf[m]);
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   size of restart data for all atoms owned by this proc
+   15 values per atom plus whatever each fix in atom->extra_restart
+   reports through size_restart(i)
+------------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::size_restart()
+{
+  const int nlocal = atom->nlocal;
+  // 15 = the length slot plus the 14 values pack_restart writes per atom
+  int total = 15 * nlocal;
+
+  if (atom->nextra_restart)
+    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+      for (int i = 0; i < nlocal; i++)
+        total += modify->fix[atom->extra_restart[iextra]]->size_restart(i);
+
+  return total;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom I's data for restart file including extra quantities
+   xyz must be 1st 3 values, so that read_restart can test on them
+   buf[0] receives the total number of values packed for this atom,
+   which is also the return value
+------------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::pack_restart(int i, double *buf)
+{
+  // sync the host side of every field that is packed below
+  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+            MASK_MASK | IMAGE_MASK | SP_MASK);
+
+  // slot 0 is reserved for the total length, written at the end
+  int m = 1;
+
+  for (int c = 0; c < 3; c++) buf[m++] = h_x(i,c);
+  // integer fields are stored through ubuf
+  buf[m++] = ubuf(h_tag(i)).d;
+  buf[m++] = ubuf(h_type(i)).d;
+  buf[m++] = ubuf(h_mask(i)).d;
+  buf[m++] = ubuf(h_image(i)).d;
+  for (int c = 0; c < 3; c++) buf[m++] = h_v(i,c);
+
+  // spin: the 4 sp columns
+  for (int c = 0; c < 4; c++) buf[m++] = h_sp(i,c);
+
+  // data of the fixes listed in atom->extra_restart
+  if (atom->nextra_restart)
+    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
+      m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
+
+  buf[0] = m;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   unpack data for one atom from restart file including extra quantities
+   the atom is stored at index atom->nlocal, which is then incremented
+   returns the buffer index just past the values read
+------------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::unpack_restart(double *buf)
+{
+  const int inew = atom->nlocal;
+  if (inew == nmax) {
+    grow(0);
+    // atom->extra is grown alongside, to nmax rows
+    if (atom->nextra_store)
+      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
+  }
+
+  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
+           MASK_MASK | IMAGE_MASK | SP_MASK);
+
+  // same field order as pack_restart: x, tag, type, mask, image, v, sp
+  int m = 1;
+
+  for (int c = 0; c < 3; c++) h_x(inew,c) = buf[m++];
+  // integer fields were stored through ubuf; .i reads them back
+  h_tag(inew) = (tagint) ubuf(buf[m++]).i;
+  h_type(inew) = (int) ubuf(buf[m++]).i;
+  h_mask(inew) = (int) ubuf(buf[m++]).i;
+  h_image(inew) = (imageint) ubuf(buf[m++]).i;
+  for (int c = 0; c < 3; c++) h_v(inew,c) = buf[m++];
+  for (int c = 0; c < 4; c++) h_sp(inew,c) = buf[m++];
+
+  // buf[0] is the packed length; whatever lies past the fields read above
+  // is copied unchanged into atom->extra for this atom
+  double **extra = atom->extra;
+  if (atom->nextra_store) {
+    int size = static_cast<int> (buf[0]) - m;
+    for (int i = 0; i < size; i++) extra[inew][i] = buf[m++];
+  }
+
+  atom->nlocal++;
+  return m;
+}
+
+/* ----------------------------------------------------------------------
+   create one atom of itype at coord
+   set other values to defaults: tag 0, mask 1, zero v, zero sp,
+   all three image fields set to IMGMAX
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::create_atom(int itype, double *coord)
+{
+  const int inew = atom->nlocal;
+  // the host data is flagged as modified before growing; order kept as is
+  if (inew == nmax) {
+    atomKK->modified(Host,ALL_MASK);
+    grow(0);
+  }
+  // all fields are synced to the host and flagged as modified there
+  atomKK->sync(Host,ALL_MASK);
+  atomKK->modified(Host,ALL_MASK);
+
+  // tag and type are written through the raw tag/type pointers
+  tag[inew] = 0;
+  type[inew] = itype;
+  h_mask[inew] = 1;
+  h_image[inew] = ((imageint) IMGMAX << IMG2BITS) |
+    ((imageint) IMGMAX << IMGBITS) | IMGMAX;
+
+  for (int c = 0; c < 3; c++) {
+    h_x(inew,c) = coord[c];
+    h_v(inew,c) = 0.0;
+  }
+  // all 4 sp columns start at zero
+  for (int c = 0; c < 4; c++) h_sp(inew,c) = 0.0;
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack one line from Atoms section of data file
+   values: [0] tag, [1] type, [2] sp column 3, [6..8] sp columns 0..2
+   initialize other atom quantities: mask 1, zero velocity
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::data_atom(double *coord, imageint imagetmp,
+                                    char **values)
+{
+  const int inew = atom->nlocal;
+  if (inew == nmax) grow(0);
+
+  h_tag[inew] = utils::inumeric(FLERR,values[0],true,lmp);
+  h_type[inew] = utils::inumeric(FLERR,values[1],true,lmp);
+  if (type[inew] <= 0 || type[inew] > atom->ntypes)
+    error->one(FLERR,"Invalid atom type in Atoms section of data file");
+
+  h_sp(inew,3) = utils::numeric(FLERR,values[2],true,lmp);
+  for (int c = 0; c < 3; c++)
+    h_sp(inew,c) = utils::numeric(FLERR,values[6+c],true,lmp);
+
+  // scale columns 0..2 to unit length; the norm is read through the sp
+  // pointer -- NOTE(review): presumably sp aliases h_sp, and an all-zero
+  // vector divides by zero here; confirm both
+  const double inorm = 1.0/sqrt(sp[inew][0]*sp[inew][0] +
+                                sp[inew][1]*sp[inew][1] +
+                                sp[inew][2]*sp[inew][2]);
+  for (int c = 0; c < 3; c++) h_sp(inew,c) *= inorm;
+
+  // position from coord, velocity zeroed
+  for (int c = 0; c < 3; c++) {
+    h_x(inew,c) = coord[c];
+    h_v(inew,c) = 0.0;
+  }
+  h_image[inew] = imagetmp;
+  h_mask[inew] = 1;
+
+  // every field of the new atom was written on the host
+  atomKK->modified(Host,ALL_MASK);
+
+  atom->nlocal++;
+}
+
+/* ----------------------------------------------------------------------
+   unpack hybrid quantities from one line in Atoms section of data file
+   returns 4, the number of entries of values that are read
+------------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::data_atom_hybrid(int nlocal, char **values)
+{
+  // values[0] fills sp column 3, values[1..3] fill sp columns 0..2
+  h_sp(nlocal,3) = utils::numeric(FLERR,values[0],true,lmp);
+  for (int c = 0; c < 3; c++)
+    h_sp(nlocal,c) = utils::numeric(FLERR,values[1+c],true,lmp);
+
+  // columns 0..2 are scaled to unit length through the raw sp pointer
+  const double inorm = 1.0/sqrt(sp[nlocal][0]*sp[nlocal][0] +
+                                sp[nlocal][1]*sp[nlocal][1] +
+                                sp[nlocal][2]*sp[nlocal][2]);
+  for (int c = 0; c < 3; c++) sp[nlocal][c] *= inorm;
+
+  return 4;
+}
+
+/* ----------------------------------------------------------------------
+   pack atom info for data file including 3 image flags
+   9 columns per atom: tag, type, sp column 3, x, y, z, 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::pack_data(double **buf)
+{
+  int nlocal = atom->nlocal;
+  for (int i = 0; i < nlocal; i++) {
+    buf[i][0] = h_tag[i];
+    buf[i][1] = h_type[i];
+    // NOTE(review): column 2 is the only spin slot of this layout and holds
+    // sp(i,3); sp(i,0..2) are not written, yet data_atom reads values[6..8]
+    buf[i][2] = h_sp(i,3);
+    buf[i][3] = h_x(i,0);
+    buf[i][4] = h_x(i,1);
+    buf[i][5] = h_x(i,2);
+    buf[i][6] = (h_image[i] & IMGMASK) - IMGMAX;
+    buf[i][7] = (h_image[i] >> IMGBITS & IMGMASK) - IMGMAX;
+    buf[i][8] = (h_image[i] >> IMG2BITS) - IMGMAX;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   pack hybrid atom info for data file: 4 values for atom i, returns 4
+------------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::pack_data_hybrid(int i, double *buf)
+{
+  // sp column 3 goes first, then columns 0..2
+  buf[0] = h_sp(i,3);
+  for (int c = 0; c < 3; c++)
+    buf[1+c] = h_sp(i,c);
+  return 4;
+}
+
+/* ----------------------------------------------------------------------
+   write n rows of buf to data file: 2 ints, 4 doubles, 3 image flags
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::write_data(FILE *fp, int n, double **buf)
+{
+  for (int row = 0; row < n; row++)
+    fprintf(fp,"%d %d %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
+            (int) buf[row][0],(int) buf[row][1],buf[row][2],buf[row][3],
+            buf[row][4],buf[row][5],(int) buf[row][6],(int) buf[row][7],(int) buf[row][8]);
+}
+
+/* ----------------------------------------------------------------------
+   write hybrid atom info to data file: prints buf[0..3], returns 4
+------------------------------------------------------------------------- */
+
+int AtomVecSpinKokkos::write_data_hybrid(FILE *fp, double *buf)
+{
+  fprintf(fp," %-1.16e %-1.16e %-1.16e %-1.16e",buf[0],buf[1],buf[2],buf[3]);
+  return 4;
+}
+
+/* ----------------------------------------------------------------------
+   return # of bytes of allocated memory, summed over the arrays below
+------------------------------------------------------------------------- */
+
+bigint AtomVecSpinKokkos::memory_usage()
+{
+  bigint total = 0;
+  // an array is counted only when atom->memcheck() accepts its name
+  if (atom->memcheck("tag")) total += memory->usage(tag,nmax);
+  if (atom->memcheck("type")) total += memory->usage(type,nmax);
+  if (atom->memcheck("mask")) total += memory->usage(mask,nmax);
+  if (atom->memcheck("image")) total += memory->usage(image,nmax);
+  if (atom->memcheck("x")) total += memory->usage(x,nmax,3);
+  if (atom->memcheck("v")) total += memory->usage(v,nmax,3);
+  if (atom->memcheck("f")) total += memory->usage(f,nmax*commKK->nthreads,3);
+  // spin arrays: sp has 4 columns, fm and fm_long are sized like f
+  if (atom->memcheck("sp")) total += memory->usage(sp,nmax,4);
+  if (atom->memcheck("fm")) total += memory->usage(fm,nmax*comm->nthreads,3);
+  if (atom->memcheck("fm_long")) total += memory->usage(fm_long,nmax*comm->nthreads,3);
+
+  return total;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::sync(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {  // call sync<LMPDeviceType>() on each dual view selected in mask
+    if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
+    if (mask & SP_MASK) atomKK->k_sp.sync<LMPDeviceType>();
+    if (mask & FM_MASK) atomKK->k_fm.sync<LMPDeviceType>();
+    if (mask & FML_MASK) atomKK->k_fm_long.sync<LMPDeviceType>();
+  } else {  // any other space: same fields, sync<LMPHostType>()
+    if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
+    if (mask & SP_MASK) atomKK->k_sp.sync<LMPHostType>();
+    if (mask & FM_MASK) atomKK->k_fm.sync<LMPHostType>();
+    if (mask & FML_MASK) atomKK->k_fm_long.sync<LMPHostType>();
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::modified(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {  // call modify<LMPDeviceType>() on each dual view selected in mask
+    if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
+    if (mask & SP_MASK) atomKK->k_sp.modify<LMPDeviceType>();
+    if (mask & FM_MASK) atomKK->k_fm.modify<LMPDeviceType>();
+    if (mask & FML_MASK) atomKK->k_fm_long.modify<LMPDeviceType>();
+  } else {  // any other space: same fields, modify<LMPHostType>()
+    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
+    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
+    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
+    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
+    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
+    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
+    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
+    if (mask & SP_MASK) atomKK->k_sp.modify<LMPHostType>();
+    if (mask & FM_MASK) atomKK->k_fm.modify<LMPHostType>();
+    if (mask & FML_MASK) atomKK->k_fm_long.modify<LMPHostType>();
+  }
+}
+
+void AtomVecSpinKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
+{
+  if (space == Device) {  // copy each view selected in mask whose need_sync<LMPDeviceType>() is true
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & SP_MASK) && atomKK->k_sp.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_sp_array>(atomKK->k_sp,space);
+    if ((mask & FM_MASK) && atomKK->k_fm.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_fm_array>(atomKK->k_fm,space);
+    if ((mask & FML_MASK) && atomKK->k_fm_long.need_sync<LMPDeviceType>())
+      perform_async_copy<DAT::tdual_fm_long_array>(atomKK->k_fm_long,space);
+  } else {  // any other space: same fields, tested with need_sync<LMPHostType>()
+    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
+    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
+    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
+    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
+    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
+    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
+    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
+    if ((mask & SP_MASK) && atomKK->k_sp.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_sp_array>(atomKK->k_sp,space);
+    if ((mask & FM_MASK) && atomKK->k_fm.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_fm_array>(atomKK->k_fm,space);
+    if ((mask & FML_MASK) && atomKK->k_fm_long.need_sync<LMPHostType>())
+      perform_async_copy<DAT::tdual_fm_long_array>(atomKK->k_fm_long,space);
+  }
+}
+
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.h b/src/KOKKOS/atom_vec_spin_kokkos.h
new file mode 100644
index 0000000000..5b57cfd8e6
--- /dev/null
+++ b/src/KOKKOS/atom_vec_spin_kokkos.h
@@ -0,0 +1,132 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef ATOM_CLASS
+
+AtomStyle(spin/kk,AtomVecSpinKokkos)
+AtomStyle(spin/kk/device,AtomVecSpinKokkos)
+AtomStyle(spin/kk/host,AtomVecSpinKokkos)
+
+#else
+
+#ifndef LMP_ATOM_VEC_SPIN_KOKKOS_H
+#define LMP_ATOM_VEC_SPIN_KOKKOS_H
+
+#include "atom_vec_kokkos.h"
+#include "kokkos_type.h"
+
+namespace LAMMPS_NS {
+
+class AtomVecSpinKokkos : public AtomVecKokkos {  // registered above as atom styles spin/kk, spin/kk/device, spin/kk/host
+ public:
+  AtomVecSpinKokkos(class LAMMPS *);
+  void grow(int);
+  void copy(int, int, int);
+  int pack_border(int, int *, double *, int, int *);
+  int pack_border_vel(int, int *, double *, int, int *);
+  int pack_border_hybrid(int, int *, double *);
+  void unpack_border(int, int, double *);
+  void unpack_border_vel(int, int, double *);
+  int unpack_border_hybrid(int, int, double *);
+  int pack_exchange(int, double *);
+  int unpack_exchange(double *);
+  int size_restart();
+  int pack_restart(int, double *);
+  int unpack_restart(double *);
+  void create_atom(int, double *);
+  void data_atom(double *, imageint, char **);
+  int data_atom_hybrid(int, char **);
+  void pack_data(double **);
+  int pack_data_hybrid(int, double *);
+  void write_data(FILE *, int, double **);
+  int write_data_hybrid(FILE *, double *);
+  bigint memory_usage();
+  
+  // clear magnetic and mechanical forces
+
+  void force_clear(int, size_t);
+
+  void grow_reset();
+  // input lists to be checked
+  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
+                         DAT::tdual_xfloat_2d buf,int iswap,
+                         int pbc_flag, int *pbc, ExecutionSpace space);
+  void unpack_border_kokkos(const int &n, const int &nfirst,
+                            const DAT::tdual_xfloat_2d &buf,
+                            ExecutionSpace space);
+  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
+                           DAT::tdual_int_1d k_sendlist,
+                           DAT::tdual_int_1d k_copylist,
+                           ExecutionSpace space, int dim,
+                           X_FLOAT lo, X_FLOAT hi);
+  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
+                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
+                             ExecutionSpace space);
+
+  void sync(ExecutionSpace space, unsigned int mask);
+  void modified(ExecutionSpace space, unsigned int mask);
+  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
+
+ protected:
+  tagint *tag;
+  int *type,*mask;
+  imageint *image;
+  double **x,**v,**f;           // lattice quantities
+
+                                // spin quantities
+  double **sp;                  // sp[i][0-2] direction of the spin i
+                                // sp[i][3] atomic magnetic moment of the spin i
+  double **fm;                  // fm[i][0-2] direction of magnetic precession
+  double **fm_long;             // storage of long-range spin prec. components
+
+  DAT::t_tagint_1d d_tag;
+  HAT::t_tagint_1d h_tag;
+
+  DAT::t_int_1d d_type, d_mask;
+  HAT::t_int_1d h_type, h_mask;
+
+  DAT::t_imageint_1d d_image;
+  HAT::t_imageint_1d h_image;
+
+  DAT::t_x_array d_x;
+  DAT::t_v_array d_v;
+  DAT::t_f_array d_f;
+
+  DAT::t_x_array d_sp;
+  DAT::t_x_array d_fm;
+  DAT::t_x_array d_fm_long;
+
+  HAT::t_x_array h_sp;
+  HAT::t_x_array h_fm;
+  HAT::t_x_array h_fm_long;
+
+  DAT::tdual_int_1d k_count;
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Per-processor system is too big
+
+The number of owned atoms plus ghost atoms on a single
+processor must fit in 32-bit integer.
+
+E: Invalid atom type in Atoms section of data file
+
+Atom types must range from 1 to specified # of types.
+
+*/

From 735676241ff8b56bf952e67d2e9f410a674251b0 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Tue, 29 Sep 2020 08:06:41 -0600
Subject: [PATCH 08/64] start correcting atom spin/kk

---
 src/KOKKOS/Install.sh               |  2 ++
 src/KOKKOS/atom_kokkos.cpp          |  6 ++++++
 src/KOKKOS/atom_kokkos.h            |  5 +++++
 src/KOKKOS/atom_vec_spin_kokkos.cpp | 21 +++++++++++----------
 src/atom_masks.h                    |  6 ++++++
 5 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 540389f599..87cddbe1de 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -63,6 +63,8 @@ action atom_vec_bond_kokkos.cpp atom_vec_bond.cpp
 action atom_vec_bond_kokkos.h atom_vec_bond.h
 action atom_vec_charge_kokkos.cpp
 action atom_vec_charge_kokkos.h
+action atom_vec_spin_kokkos.cpp
+action atom_vec_spin_kokkos.h
 action atom_vec_dpd_kokkos.cpp atom_vec_dpd.cpp
 action atom_vec_dpd_kokkos.h atom_vec_dpd.h
 action atom_vec_full_kokkos.cpp atom_vec_full.cpp
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index 4637a9a21c..2640c1611d 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -76,6 +76,12 @@ AtomKokkos::~AtomKokkos()
   memoryKK->destroy_kokkos(k_improper_atom3, improper_atom3);
   memoryKK->destroy_kokkos(k_improper_atom4, improper_atom4);
 
+  // SPIN package
+
+  memoryKK->destroy_kokkos(k_sp, sp);
+  memoryKK->destroy_kokkos(k_fm, fm);
+  memoryKK->destroy_kokkos(k_fm_long, fm_long);
+
   // USER-DPD package
   memoryKK->destroy_kokkos(k_uCond,uCond);
   memoryKK->destroy_kokkos(k_uMech,uMech);
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index 0ae032032a..3ed703c66a 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -54,6 +54,11 @@ class AtomKokkos : public Atom {
 
   DAT::tdual_float_2d k_dvector;
 
+  // SPIN package
+
+  DAT::tdual_x_array k_sp;
+  DAT::tdual_x_array k_fm;
+  DAT::tdual_x_array k_fm_long;
 
 // USER-DPD package
   DAT::tdual_efloat_1d k_uCond, k_uMech, k_uChem, k_uCG, k_uCGnew,
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp
index 8a7dd3317c..ef0b350092 100644
--- a/src/KOKKOS/atom_vec_spin_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp
@@ -24,19 +24,20 @@
 ------------------------------------------------------------------------- */
 
 #include "atom_vec_spin_kokkos.h"
-#include <cmath>
-#include <cstring>
 #include "atom_kokkos.h"
 #include "comm_kokkos.h"
 #include "domain.h"
-#include "error.h"
-#include "fix.h"
-#include "memory_kokkos.h"
 #include "modify.h"
+#include "fix.h"
+#include "atom_masks.h"
+#include "memory_kokkos.h"
+#include "error.h"
 #include "utils.h"
 
 using namespace LAMMPS_NS;
 
+#define DELTA 10
+
 /* ---------------------------------------------------------------------- */
 
 AtomVecSpinKokkos::AtomVecSpinKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
@@ -159,10 +160,10 @@ void AtomVecSpinKokkos::copy(int i, int j, int delflag)
   h_v(j,1) = h_v(i,1);
   h_v(j,2) = h_v(i,2);
 
-  h_sp(j,0) = h_sp(i,0)
-  h_sp(j,1) = h_sp(i,1)
-  h_sp(j,2) = h_sp(i,2)
-  h_sp(j,3) = h_sp(i,3)
+  h_sp(j,0) = h_sp(i,0); 
+  h_sp(j,1) = h_sp(i,1);
+  h_sp(j,2) = h_sp(i,2);
+  h_sp(j,3) = h_sp(i,3);
 
   if (atom->nextra_grow)
     for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
@@ -263,7 +264,7 @@ struct AtomVecSpinKokkos_PackBorder {
       const typename ArrayTypes<DeviceType>::t_sp_array &sp,
       const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
   _buf(buf),_list(list),_iswap(iswap),
-    _x(x),_sp(sp),_tag(tag),_type(type),_mask(mask),
+    _x(x),_tag(tag),_type(type),_mask(mask),_sp(sp),
     _dx(dx),_dy(dy),_dz(dz) {}
   
   KOKKOS_INLINE_FUNCTION
diff --git a/src/atom_masks.h b/src/atom_masks.h
index 8e29448488..daad323835 100644
--- a/src/atom_masks.h
+++ b/src/atom_masks.h
@@ -42,6 +42,12 @@
 #define ENERGY_MASK    0x00010000
 #define VIRIAL_MASK    0x00020000
 
+// SPIN
+
+#define SP_MASK         0x00000001
+#define FM_MASK         0x00000002
+#define FML_MASK        0x00000004
+
 // DPD
 
 #define DPDRHO_MASK       0x00040000

From d3aa2d1cd01c6f4fa86b3eb388130b1fe9214d26 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Wed, 30 Sep 2020 10:27:22 -0600
Subject: [PATCH 09/64] compilable kokkos files (still a segfault issue)

---
 src/KOKKOS/atom_kokkos.h            |  6 +++---
 src/KOKKOS/atom_vec_spin_kokkos.cpp | 22 ++++++++++++++++------
 src/KOKKOS/atom_vec_spin_kokkos.h   | 12 ++++++------
 3 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index 3ed703c66a..b66d54cbdd 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -56,9 +56,9 @@ class AtomKokkos : public Atom {
 
   // SPIN package
 
-  DAT::tdual_x_array k_sp;
-  DAT::tdual_x_array k_fm;
-  DAT::tdual_x_array k_fm_long;
+  DAT::tdual_sp_array k_sp;
+  DAT::tdual_fm_array k_fm;
+  DAT::tdual_fm_long_array k_fm_long;
 
 // USER-DPD package
   DAT::tdual_efloat_1d k_uCond, k_uMech, k_uChem, k_uCG, k_uCGnew,
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp
index ef0b350092..6ed62c0242 100644
--- a/src/KOKKOS/atom_vec_spin_kokkos.cpp
+++ b/src/KOKKOS/atom_vec_spin_kokkos.cpp
@@ -135,10 +135,10 @@ void AtomVecSpinKokkos::grow_reset()
   sp = atomKK->sp; 
   d_sp = atomKK->k_sp.d_view;
   h_sp = atomKK->k_sp.h_view;
-  fm = atom->fm; 
+  fm = atomKK->fm; 
   d_fm = atomKK->k_fm.d_view;
   h_fm = atomKK->k_fm.h_view;
-  fm_long = atom->fm_long;
+  fm_long = atomKK->fm_long;
   d_fm_long = atomKK->k_fm_long.d_view;
   h_fm_long = atomKK->k_fm_long.h_view;
 }
@@ -537,10 +537,10 @@ struct AtomVecSpinKokkos_UnpackBorder {
       _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
       _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
       _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
-      _sp(i+_first) = _buf(i,6);
-      _sp(i+_first) = _buf(i,7);
-      _sp(i+_first) = _buf(i,8);
-      _sp(i+_first) = _buf(i,9);
+      _sp(i+_first,0) = _buf(i,6);
+      _sp(i+_first,1) = _buf(i,7);
+      _sp(i+_first,2) = _buf(i,8);
+      _sp(i+_first,3) = _buf(i,9);
   }
 };
 
@@ -1296,3 +1296,13 @@ void AtomVecSpinKokkos::sync_overlapping_device(ExecutionSpace space, unsigned i
   }
 }
 
+/* ----------------------------------------------------------------------
+   clear all forces (mech and mag): zero 3*nbytes of f, fm, fm_long from atom n
+------------------------------------------------------------------------- */
+
+void AtomVecSpinKokkos::force_clear(int n, size_t nbytes)
+{
+  memset(&atom->f[n][0],0,3*nbytes);
+  memset(&atom->fm[n][0],0,3*nbytes);
+  memset(&atom->fm_long[n][0],0,3*nbytes);
+}
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.h b/src/KOKKOS/atom_vec_spin_kokkos.h
index 5b57cfd8e6..d439424076 100644
--- a/src/KOKKOS/atom_vec_spin_kokkos.h
+++ b/src/KOKKOS/atom_vec_spin_kokkos.h
@@ -102,13 +102,13 @@ class AtomVecSpinKokkos : public AtomVecKokkos {
   DAT::t_v_array d_v;
   DAT::t_f_array d_f;
 
-  DAT::t_x_array d_sp;
-  DAT::t_x_array d_fm;
-  DAT::t_x_array d_fm_long;
+  DAT::t_sp_array d_sp;
+  DAT::t_fm_array d_fm;
+  DAT::t_fm_long_array d_fm_long;
 
-  HAT::t_x_array h_sp;
-  HAT::t_x_array h_fm;
-  HAT::t_x_array h_fm_long;
+  HAT::t_sp_array h_sp;
+  HAT::t_fm_array h_fm;
+  HAT::t_fm_long_array h_fm_long;
 
   DAT::tdual_int_1d k_count;
 };

From a8d304405ddca36740deef2e8608d8b4c782f88a Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Wed, 30 Sep 2020 15:55:18 -0600
Subject: [PATCH 10/64] before pull from other machine

---
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index cf351e6539..3fffb8b58e 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -416,11 +416,9 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j,
  
   Jex_mech = 1.0-rja-J2[itype][jtype]*rja*(2.0-rja);
   Jex_mech *= 8.0*Jex*rjr*exp(-rja);
-  // Jex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
   
   Kex_mech = 1.0-rka-K2[itype][jtype]*rka*(2.0-rka);
   Kex_mech *= 8.0*Kex*rkr*exp(-rka);
-  // Kex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 

From 84c104641b4d510cfba8535085f9f17befe22926 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Fri, 2 Oct 2020 10:47:29 -0600
Subject: [PATCH 11/64] adding offset option and doc

---
 doc/src/pair_spin_exchange.rst              | 159 ++++++++++++++++----
 src/SPIN/pair_spin_exchange.cpp             | 114 ++++++++++----
 src/SPIN/pair_spin_exchange.h               |   6 +-
 src/SPIN/pair_spin_exchange_biquadratic.cpp |  71 +++++++--
 src/SPIN/pair_spin_exchange_biquadratic.h   |   4 +-
 src/SPIN/pair_spin_neel.cpp                 |   2 +-
 6 files changed, 279 insertions(+), 77 deletions(-)

diff --git a/doc/src/pair_spin_exchange.rst b/doc/src/pair_spin_exchange.rst
index 14eefaccec..32a722c5f0 100644
--- a/doc/src/pair_spin_exchange.rst
+++ b/doc/src/pair_spin_exchange.rst
@@ -3,12 +3,16 @@
 pair_style spin/exchange command
 ================================
 
+pair_style spin/exchange/biquadratic command
+============================================
+
 Syntax
 """"""
 
 .. code-block:: LAMMPS
 
    pair_style spin/exchange cutoff
+   pair_style spin/exchange/biquadratic cutoff
 
 * cutoff = global cutoff pair (distance in metal units)
 
@@ -19,7 +23,10 @@ Examples
 
    pair_style spin/exchange 4.0
    pair_coeff * * exchange 4.0 0.0446928 0.003496 1.4885
-   pair_coeff 1 2 exchange 6.0 -0.01575 0.0 1.965
+   pair_coeff 1 2 exchange 6.0 -0.01575 0.0 1.965 offset yes
+   pair_style spin/exchange/biquadratic 4.0
+   pair_coeff * * biquadratic 4.0 0.05 0.03 1.48 0.05 0.03 1.48 offset no
+   pair_coeff 1 2 biquadratic 6.0 -0.01 0.0 1.9 0.0 0.1 19
 
 Description
 """""""""""
@@ -31,69 +38,163 @@ pairs of magnetic spins:
 
    H_{ex} = -\sum_{i,j}^N J_{ij} (r_{ij}) \,\vec{s}_i \cdot \vec{s}_j
 
-where :math:`\vec{s}_i` and :math:`\vec{s}_j` are two neighboring magnetic spins of two particles,
-:math:`r_{ij} = \vert \vec{r}_i - \vec{r}_j \vert` is the inter-atomic distance between the two
-particles. The summation is over pairs of nearest neighbors.
-:math:`J(r_{ij})` is a function defining the intensity and the sign of the exchange
-interaction for different neighboring shells. This function is defined as:
+where :math:`\vec{s}_i` and :math:`\vec{s}_j` are two unit vectors representing
+the magnetic spins of two particles (usually atoms), and 
+:math:`r_{ij} = \vert \vec{r}_i - \vec{r}_j \vert` is the inter-atomic distance 
+between those two particles. The summation is over pairs of nearest neighbors. 
+:math:`J(r_{ij})` is a function defining the intensity and the sign of the 
+exchange interaction for different neighboring shells. 
+
+Style *spin/exchange/biquadratic* computes a biquadratic exchange interaction 
+between pairs of magnetic spins:
+
+.. math::
+  
+   H_{bi} = -\sum_{i, j}^{N} {J}_{ij} \left(r_{ij} \right)\,
+                      \vec{s}_{i}\cdot \vec{s}_{j} 
+                      -\sum_{i, j}^{N} {K}_{ij} \left(r_{ij} \right)\,
+                      \left(\vec{s}_{i}\cdot 
+                      \vec{s}_{j}\right)^2
+
+where :math:`\vec{s}_i`,  :math:`\vec{s}_j`,  :math:`r_{ij}` and 
+:math:`J(r_{ij})` have the same definitions as above, and :math:`K(r_{ij})` is 
+a second function, defining the intensity and the sign of the biquadratic term.
+
+The interatomic dependence of :math:`J(r_{ij})` and :math:`K(r_{ij})` in both 
+interactions above is defined by the following function:
 
 .. math::
 
-    {J}\left( r_{ij} \right) = 4 a \left( \frac{r_{ij}}{d}  \right)^2 \left( 1 - b \left( \frac{r_{ij}}{d}  \right)^2 \right) e^{-\left( \frac{r_{ij}}{d} \right)^2 }\Theta (R_c - r_{ij})
+    {f}\left( r_{ij} \right) = 4 a \left( \frac{r_{ij}}{d}  \right)^2 
+    \left( 1 - b \left( \frac{r_{ij}}{d}  \right)^2 \right) 
+    e^{-\left( \frac{r_{ij}}{d} \right)^2 }\Theta (R_c - r_{ij})
 
-where :math:`a`, :math:`b` and :math:`d` are the three constant coefficients defined in the associated
-"pair_coeff" command, and :math:`R_c` is the radius cutoff associated to
-the pair interaction (see below for more explanations).
+where :math:`a`, :math:`b` and :math:`d` are the three constant coefficients 
+defined in the associated "pair_coeff" command, and :math:`R_c` is the radius 
+cutoff associated to the pair interaction (see below for more explanations).
 
-The coefficients :math:`a`, :math:`b`, and :math:`d` need to be fitted so that the function above matches with
-the value of the exchange interaction for the :math:`N` neighbor shells taken into account.
-Examples and more explanations about this function and its parameterization are reported
-in :ref:`(Tranchida) <Tranchida3>`.
+The coefficients :math:`a`, :math:`b`, and :math:`d` need to be fitted so that 
+the function above matches with the value of the exchange interaction for the 
+:math:`N` neighbor shells taken into account.
+Examples and more explanations about this function and its parameterization 
+are reported in :ref:`(Tranchida) <Tranchida3>`.
+
+When a *spin/exchange/biquadratic* pair style is defined, six coefficients 
+(three for :math:`J(r_{ij})`, and three for :math:`K(r_{ij})`) have to be 
+fitted.
 
 From this exchange interaction, each spin :math:`i` will be submitted
-to a magnetic torque :math:`\vec{\omega}`, and its associated atom can be submitted to a
-force :math:`\vec{F}` for spin-lattice calculations (see :doc:`fix nve/spin <fix_nve_spin>`),
-such as:
+to a magnetic torque :math:`\vec{\omega}_{i}`, and its associated atom can be 
+submitted to a force :math:`\vec{F}_{i}` for spin-lattice calculations (see 
+:doc:`fix nve/spin <fix_nve_spin>`), such as:
 
 .. math::
 
    \vec{\omega}_{i} = \frac{1}{\hbar} \sum_{j}^{Neighb} {J}
    \left(r_{ij} \right)\,\vec{s}_{j}
    ~~{\rm and}~~
-   \vec{F}_{i} = \sum_{j}^{Neighb} \frac{\partial {J} \left(r_{ij} \right)}{ \partial r_{ij}} \left( \vec{s}_{i}\cdot \vec{s}_{j} \right) \vec{e}_{ij}
+   \vec{F}_{i} = \sum_{j}^{Neighb} \frac{\partial {J} \left(r_{ij} \right)}{ 
+   \partial r_{ij}} \left( \vec{s}_{i}\cdot \vec{s}_{j} \right) \vec{e}_{ij}
 
-with :math:`\hbar` the Planck constant (in metal units), and :math:`\vec{e}_{ij} = \frac{\vec{r}_i - \vec{r}_j}{\vert \vec{r}_i-\vec{r}_j \vert}` the unit
+with :math:`\hbar` the Planck constant (in metal units), and :math:`\vec{e}_{ij}
+= \frac{\vec{r}_i - \vec{r}_j}{\vert \vec{r}_i-\vec{r}_j \vert}` the unit
 vector between sites :math:`i` and :math:`j`.
+Equivalent forces and magnetic torques are generated for the biquadratic term 
+when a *spin/exchange/biquadratic* pair style is defined.
 
 More details about the derivation of these torques/forces are reported in
 :ref:`(Tranchida) <Tranchida3>`.
 
-For the *spin/exchange* pair style, the following coefficients must be defined
-for each pair of atoms types via the :doc:`pair_coeff <pair_coeff>` command as in
-the examples above, or in the data file or restart files read by the
-:doc:`read_data <read_data>` or :doc:`read_restart <read_restart>` commands, and
-set in the following order:
+For the *spin/exchange* and *spin/exchange/biquadratic* pair styles, the 
+following coefficients must be defined for each pair of atoms types via the 
+:doc:`pair_coeff <pair_coeff>` command as in the examples above, or in the data 
+file or restart files read by the :doc:`read_data <read_data>` or 
+:doc:`read_restart <read_restart>` commands, and set in the following order:
 
 * :math:`R_c` (distance units)
 * :math:`a`  (energy units)
 * :math:`b`  (adim parameter)
 * :math:`d`  (distance units)
 
-Note that :math:`R_c` is the radius cutoff of the considered exchange interaction,
-and :math:`a`, :math:`b` and :math:`d` are the three coefficients performing the parameterization
-of the function :math:`J(r_{ij})` defined above.
+for the *spin/exchange* pair style, and:
+
+* :math:`R_c` (distance units)
+* :math:`a_j`  (energy units)
+* :math:`b_j`  (adim parameter)
+* :math:`d_j`  (distance units)
+* :math:`a_k`  (energy units)
+* :math:`b_k`  (adim parameter)
+* :math:`d_k`  (distance units)
+
+for the *spin/exchange/biquadratic* pair style.
+
+Note that :math:`R_c` is the radius cutoff of the considered exchange 
+interaction, and :math:`a`, :math:`b` and :math:`d` are the three coefficients 
+performing the parameterization of the function :math:`J(r_{ij})` defined 
+above (in the *biquadratic* case, :math:`a_j`, :math:`b_j`, :math:`d_j` and
+:math:`a_k`, :math:`b_k`, :math:`d_k` are the coefficients of :math:`J(r_{ij})`
+and :math:`K(r_{ij})` respectively).
+
 
 None of those coefficients is optional. If not specified, the
 *spin/exchange* pair style cannot be used.
 
 ----------
 
+**Offsetting magnetic forces and energies**\ :
+
+For spin-lattice simulations, it can be useful to offset the
+mechanical forces and energies generated by the exchange
+interaction.
+The *offset* keyword allows this offset to be applied.
+By setting *offset* to *yes*, the energy definitions above are
+replaced by:
+
+.. math::
+
+   H_{ex} = -\sum_{i,j}^N J_{ij} (r_{ij}) \,[ \vec{s}_i \cdot \vec{s}_j-1 ]
+
+for the *spin/exchange* pair style, and:  
+
+.. math::
+  
+   H_{bi} = -\sum_{i, j}^{N} {J}_{ij} \left(r_{ij} \right)\,
+                      [ \vec{s}_{i}\cdot \vec{s}_{j} -1 ]
+                      -\sum_{i, j}^{N} {K}_{ij} \left(r_{ij} \right)\,
+                      [ \left(\vec{s}_{i}\cdot 
+                      \vec{s}_{j}\right)^2 -1]
+
+for the *spin/exchange/biquadratic* pair style.
+
+Note that this offset only affects the calculation of the energy
+and mechanical forces. It does not modify the calculation of the
+precession vectors (and thus does not impact the purely magnetic
+properties).
+The offset ensures that when all spins are aligned, the magnetic energy
+and the associated mechanical forces (and thus the pressure
+generated by the magnetic potential) are zero.
+
+.. note::
+  This offset term can be very important when calculations such as
+  equations of state (energy vs volume, or energy vs pressure) are
+  being performed. Indeed, setting the *offset* term ensures that
+  at the ground state of the crystal and at the equilibrium magnetic
+  configuration (typically ferromagnetic), the pressure is null,
+  as expected.
+  Otherwise, magnetic forces could generate a residual pressure.
+
+When the *offset* option is set to *no*, no offset is applied
+(also corresponding to the default option).
+
+----------
+
 Restrictions
 """"""""""""
 
 All the *pair/spin* styles are part of the SPIN package.  These styles
 are only enabled if LAMMPS was built with this package, and if the
-atom_style "spin" was declared.  See the :doc:`Build package <Build_package>` doc page for more info.
+atom_style "spin" was declared.  
+See the :doc:`Build package <Build_package>` doc page for more info.
 
 Related commands
 """"""""""""""""
@@ -103,7 +204,7 @@ Related commands
 
 **Default:**
 
-none
+The default *offset* keyword value is *no*.
 
 ----------
 
diff --git a/src/SPIN/pair_spin_exchange.cpp b/src/SPIN/pair_spin_exchange.cpp
index 611230c73e..5792738fd1 100644
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@@ -40,6 +40,14 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
+PairSpinExchange::PairSpinExchange(LAMMPS *lmp) : 
+  PairSpin(lmp) 
+{
+  e_offset = 0;  // offset off by default; coeff() sets it to 1 on "offset yes"
+}
+
+/* ---------------------------------------------------------------------- */
+
 PairSpinExchange::~PairSpinExchange()
 {
   if (allocated) {
@@ -61,6 +69,8 @@ PairSpinExchange::~PairSpinExchange()
 void PairSpinExchange::settings(int narg, char **arg)
 {
   PairSpin::settings(narg,arg);
+  
+  if (narg != 1) error->all(FLERR,"Illegal pair_style command");
 
   cut_spin_exchange_global = force->numeric(FLERR,arg[0]);
 
@@ -87,9 +97,9 @@ void PairSpinExchange::coeff(int narg, char **arg)
   // check if args correct
 
   if (strcmp(arg[2],"exchange") != 0)
-    error->all(FLERR,"Incorrect args in pair_style command");
-  if (narg != 7)
-    error->all(FLERR,"Incorrect args in pair_style command");
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  if ((narg != 7) && (narg != 9))
+    error->all(FLERR,"Incorrect args for pair coefficients");
 
   int ilo,ihi,jlo,jhi;
   force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
@@ -97,11 +107,25 @@ void PairSpinExchange::coeff(int narg, char **arg)
 
   // get exchange arguments from input command
 
+  int iarg = 7;
   const double rc = force->numeric(FLERR,arg[3]);
   const double j1 = force->numeric(FLERR,arg[4]);
   const double j2 = force->numeric(FLERR,arg[5]);
   const double j3 = force->numeric(FLERR,arg[6]);
 
+  // read optional trailing "offset yes|no" keyword/value pair
+  // (narg is 7 or 9 here, so arg[iarg+1] exists whenever the loop runs)
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"offset") == 0) {
+      if (strcmp(arg[iarg+1],"yes") == 0) {
+        e_offset = 1;
+      } else if (strcmp(arg[iarg+1],"no") == 0) {
+        e_offset = 0;
+      } else error->all(FLERR,"Incorrect args for pair coefficients");
+      iarg += 2;
+    } else error->all(FLERR,"Incorrect args for pair coefficients");
+  }
+  
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
@@ -236,8 +260,7 @@ void PairSpinExchange::compute(int eflag, int vflag)
           compute_exchange_mech(i,j,rsq,eij,fi,spi,spj);
         
         if (eflag) {
-          evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-          evdwl *= 0.5*hbar;
+          evdwl -= compute_energy(i,j,rsq,spi,spj);
           emag[i] += evdwl;
         } else evdwl = 0.0;
 
@@ -373,7 +396,9 @@ void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq,
 {
   int *type = atom->type;
   int itype, jtype;
-  double Jex, Jex_mech, ra, rr, iJ3;
+  double Jex, Jex_mech, ra, sdots;
+  double rr, iJ3;
+  double fx, fy, fz;
   itype = type[i];
   jtype = type[j];
 
@@ -385,38 +410,62 @@ void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq,
 
   Jex_mech = 1.0-ra-J2[itype][jtype]*ra*(2.0-ra);
   Jex_mech *= 8.0*Jex*rr*exp(-ra);
-  Jex_mech *= (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
+  
+  // scalar factor multiplying eij: shifted by -1 when the offset is on,
+  // so the mechanical force is zero when sdots == 1
+  double fscal = 0.0;
+  if (e_offset == 1) { // set offset
+    fscal = Jex_mech*(sdots-1.0);
+  } else if (e_offset == 0) { // no offset ("normal" calculation)
+    fscal = Jex_mech*sdots;
+  } else error->all(FLERR,"Illegal option in pair exchange command");
+
+  fx = fscal*eij[0];
+  fy = fscal*eij[1];
+  fz = fscal*eij[2];
+
+  // half of (fx,fy,fz) is subtracted from the force accumulator of atom i
+  fi[0] -= 0.5*fx;
+  fi[1] -= 0.5*fy;
+  fi[2] -= 0.5*fz;
+  // (full-weight variant kept disabled, as in the previous code:
+  //  fi[0] -= fx; fi[1] -= fy; fi[2] -= fz;)
 
-  fi[0] -= 0.5*Jex_mech*eij[0];
-  fi[1] -= 0.5*Jex_mech*eij[1];
-  fi[2] -= 0.5*Jex_mech*eij[2];
-  // fi[0] -= Jex_mech*eij[0];
-  // fi[1] -= Jex_mech*eij[1];
-  // fi[2] -= Jex_mech*eij[2];
 }
 
 /* ----------------------------------------------------------------------
    compute energy of spin pair i and j
 ------------------------------------------------------------------------- */
 
-// double PairSpinExchange::compute_energy(int i, int j, double rsq, double spi[3], double spj[3])
-// {
-//   int *type = atom->type;
-//   int itype, jtype;
-//   double Jex, ra;
-//   double energy = 0.0;
-//   itype = type[i];
-//   jtype = type[j];
-//
-//   Jex = J1_mech[itype][jtype];
-//   ra = rsq/J3[itype][jtype]/J3[itype][jtype];
-//   Jex = 4.0*Jex*ra;
-//   Jex *= (1.0-J2[itype][jtype]*ra);
-//   Jex *= exp(-ra);
-//
-//   energy = Jex*(spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
-//   return energy;
-// }
+double PairSpinExchange::compute_energy(int i, int j, double rsq, double spi[3], double spj[3])
+{
+  // i and j are used only to look up the atom types; spi and spj are
+  // the two 3-component spin vectors entering the dot product below
+  const int itype = atom->type[i];
+  const int jtype = atom->type[j];
+
+  // radial coupling: 4*J1_mech*ra*(1 - J2*ra)*exp(-ra), with ra = rsq/J3^2
+  const double ra = rsq/J3[itype][jtype]/J3[itype][jtype];
+  double Jex = 4.0*J1_mech[itype][jtype]*ra;
+  Jex *= (1.0-J2[itype][jtype]*ra);
+  Jex *= exp(-ra);
+
+  const double sdots = spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2];
+
+  // apply or not the energy offset: with the offset on, the returned
+  // energy is zero when sdots == 1
+
+  double energy = 0.0;
+  if (e_offset == 1) { // set offset
+    energy = 0.5*Jex*(sdots-1.0);
+  } else if (e_offset == 0) { // no offset ("normal" calculation)
+    energy = 0.5*Jex*sdots;
+  } else error->all(FLERR,"Illegal option in pair exchange command");
+
+  return energy;
+}
 
 /* ----------------------------------------------------------------------
    allocate all arrays
@@ -505,6 +554,7 @@ void PairSpinExchange::read_restart(FILE *fp)
 void PairSpinExchange::write_restart_settings(FILE *fp)
 {
   fwrite(&cut_spin_exchange_global,sizeof(double),1,fp);
+  fwrite(&e_offset,sizeof(int),1,fp);  // read back at this position by read_restart_settings()
   fwrite(&offset_flag,sizeof(int),1,fp);
   fwrite(&mix_flag,sizeof(int),1,fp);
 }
@@ -517,10 +567,12 @@ void PairSpinExchange::read_restart_settings(FILE *fp)
 {
   if (comm->me == 0) {
     utils::sfread(FLERR,&cut_spin_exchange_global,sizeof(double),1,fp,NULL,error);
+    utils::sfread(FLERR,&e_offset,sizeof(int),1,fp,NULL,error);
     utils::sfread(FLERR,&offset_flag,sizeof(int),1,fp,NULL,error);
     utils::sfread(FLERR,&mix_flag,sizeof(int),1,fp,NULL,error);
   }
   MPI_Bcast(&cut_spin_exchange_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&e_offset,1,MPI_INT,0,world);
   MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
   MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
 }
diff --git a/src/SPIN/pair_spin_exchange.h b/src/SPIN/pair_spin_exchange.h
index 4e9e6bfac8..2a31f9516e 100644
--- a/src/SPIN/pair_spin_exchange.h
+++ b/src/SPIN/pair_spin_exchange.h
@@ -26,7 +26,7 @@ namespace LAMMPS_NS {
 
 class PairSpinExchange : public PairSpin {
  public:
-  PairSpinExchange(LAMMPS *lmp) : PairSpin(lmp) {}
+  PairSpinExchange(class LAMMPS *);
   virtual ~PairSpinExchange();
   void settings(int, char **);
   void coeff(int, char **);
@@ -38,8 +38,7 @@ class PairSpinExchange : public PairSpin {
 
   void compute_exchange(int, int, double, double *, double *);
   void compute_exchange_mech(int, int, double, double *, double *, double *, double *);
-
-  // double compute_energy(int , int , double , double *, double *);
+  double compute_energy(int , int , double , double *, double *);
 
   void write_restart(FILE *);
   void read_restart(FILE *);
@@ -49,6 +48,7 @@ class PairSpinExchange : public PairSpin {
   double cut_spin_exchange_global;      // global exchange cutoff distance
 
  protected:
+  int e_offset;                         // apply energy offset
   double **J1_mag;                      // exchange coeffs in eV
   double **J1_mech;                     // mech exchange coeffs in
   double **J2, **J3;                    // J1 in eV, J2 adim, J3 in Ang
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 3fffb8b58e..4c6c3936cf 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -40,6 +40,14 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
+PairSpinExchangeBiquadratic::PairSpinExchangeBiquadratic(LAMMPS *lmp) :
+  PairSpin(lmp), e_offset(0)
+{
+  // e_offset = 0: energy offset disabled by default ("offset no")
+}
+
+/* ---------------------------------------------------------------------- */
+
 PairSpinExchangeBiquadratic::~PairSpinExchangeBiquadratic()
 {
   if (allocated) {
@@ -66,6 +74,8 @@ void PairSpinExchangeBiquadratic::settings(int narg, char **arg)
 {
   PairSpin::settings(narg,arg);
 
+  if (narg != 1) error->all(FLERR,"Illegal pair_style command");
+
   cut_spin_exchange_global = force->numeric(FLERR,arg[0]);
 
   // reset cutoffs that have been explicitly set
@@ -91,9 +101,9 @@ void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
   // check if args correct
 
   if (strcmp(arg[2],"biquadratic") != 0)
-    error->all(FLERR,"Incorrect args in pair_style command");
-  if (narg != 10)
-    error->all(FLERR,"Incorrect args in pair_style command");
+    error->all(FLERR,"Incorrect args for pair coefficients");
+  if ((narg != 10) && (narg != 12))
+    error->all(FLERR,"Incorrect args for pair coefficients");
 
   int ilo,ihi,jlo,jhi;
   force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
@@ -101,6 +111,7 @@ void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
 
   // get exchange arguments from input command
 
+  int iarg = 10;
   const double rc = force->numeric(FLERR,arg[3]);
   const double j1 = force->numeric(FLERR,arg[4]);
   const double j2 = force->numeric(FLERR,arg[5]);
@@ -109,6 +120,19 @@ void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
   const double k2 = force->numeric(FLERR,arg[8]);
   const double k3 = force->numeric(FLERR,arg[9]);
 
+  // read optional trailing "offset yes|no" keyword/value pair
+  // (narg is 10 or 12 here, so arg[iarg+1] exists whenever the loop runs)
+  while (iarg < narg) {
+    if (strcmp(arg[iarg],"offset") == 0) {
+      if (strcmp(arg[iarg+1],"yes") == 0) {
+        e_offset = 1;
+      } else if (strcmp(arg[iarg+1],"no") == 0) {
+        e_offset = 0;
+      } else error->all(FLERR,"Incorrect args for pair coefficients");
+      iarg += 2;
+    } else error->all(FLERR,"Incorrect args for pair coefficients");
+  }
+
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
@@ -399,8 +423,9 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j,
 {
   int *type = atom->type;
   int itype,jtype;
-  double Jex,Jex_mech,Kex,Kex_mech,ra,sdots;
+  double Jex,Jex_mech,Kex,Kex_mech,sdots;
   double rja,rka,rjr,rkr,iJ3,iK3;
+  double fx, fy, fz;
   itype = type[i];
   jtype = type[j];
 
@@ -422,12 +447,25 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j,
 
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
-  fi[0] -= 0.5*(Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
-  fi[1] -= 0.5*(Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[1];
-  fi[2] -= 0.5*(Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[2];
-  // fi[0] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
-  // fi[1] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[1];
-  // fi[2] -= (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[2];
+  // scalar factor multiplying eij; with the offset on, both the bilinear
+  // and the biquadratic contributions vanish when sdots == 1
+  double fscal = 0.0;
+  if (e_offset == 1) { // set offset
+    fscal = Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0);
+  } else if (e_offset == 0) { // no offset ("normal" calculation)
+    fscal = Jex_mech*sdots + Kex_mech*sdots*sdots;
+  } else error->all(FLERR,"Illegal option in pair exchange/biquadratic command");
+
+  fx = fscal*eij[0];
+  fy = fscal*eij[1];
+  fz = fscal*eij[2];
+
+  // half of (fx,fy,fz) is subtracted from the force accumulator of atom i
+  fi[0] -= 0.5*fx;
+  fi[1] -= 0.5*fy;
+  fi[2] -= 0.5*fz;
+  // (full-weight variant kept disabled, as in the previous code:
+  //  fi[0] -= fx; fi[1] -= fy; fi[2] -= fz;)
 }
 
 /* ----------------------------------------------------------------------
@@ -463,8 +501,14 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
 
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
 
-  energy = 0.5*(Jex*(sdots-1.0) + Kex*(sdots*sdots-1.0));
-  // energy = 0.5*(Jex*(sdots) + Kex*(sdots*sdots-1.0));
+  // apply or not energy and force offset
+  
+  if (e_offset == 1) { // set offset
+    energy = 0.5*(Jex*(sdots-1.0) + Kex*(sdots*sdots-1.0));
+  } else if (e_offset == 0) { // no offset ("normal" calculation)
+    energy = 0.5*(Jex*sdots + Kex*sdots*sdots);
+  } else error->all(FLERR,"Illegal option in pair exchange/biquadratic command");
+  
   return energy;
 }
 
@@ -571,6 +615,7 @@ void PairSpinExchangeBiquadratic::read_restart(FILE *fp)
 void PairSpinExchangeBiquadratic::write_restart_settings(FILE *fp)
 {
   fwrite(&cut_spin_exchange_global,sizeof(double),1,fp);
+  fwrite(&e_offset,sizeof(int),1,fp);  // read back at this position by read_restart_settings()
   fwrite(&offset_flag,sizeof(int),1,fp);
   fwrite(&mix_flag,sizeof(int),1,fp);
 }
@@ -583,10 +628,12 @@ void PairSpinExchangeBiquadratic::read_restart_settings(FILE *fp)
 {
   if (comm->me == 0) {
     utils::sfread(FLERR,&cut_spin_exchange_global,sizeof(double),1,fp,NULL,error);
+    utils::sfread(FLERR,&e_offset,sizeof(int),1,fp,NULL,error);
     utils::sfread(FLERR,&offset_flag,sizeof(int),1,fp,NULL,error);
     utils::sfread(FLERR,&mix_flag,sizeof(int),1,fp,NULL,error);
   }
   MPI_Bcast(&cut_spin_exchange_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&e_offset,1,MPI_INT,0,world);
   MPI_Bcast(&offset_flag,1,MPI_INT,0,world);
   MPI_Bcast(&mix_flag,1,MPI_INT,0,world);
 }
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.h b/src/SPIN/pair_spin_exchange_biquadratic.h
index 6fb9a7a94c..1074b50f7b 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.h
+++ b/src/SPIN/pair_spin_exchange_biquadratic.h
@@ -26,7 +26,7 @@ namespace LAMMPS_NS {
 
 class PairSpinExchangeBiquadratic : public PairSpin {
  public:
-  PairSpinExchangeBiquadratic(LAMMPS *lmp) : PairSpin(lmp) {}
+  PairSpinExchangeBiquadratic(class LAMMPS *);
   virtual ~PairSpinExchangeBiquadratic();
   void settings(int, char **);
   void coeff(int, char **);
@@ -48,6 +48,8 @@ class PairSpinExchangeBiquadratic : public PairSpin {
   double cut_spin_exchange_global;      // global exchange cutoff distance
 
  protected:
+  
+  int e_offset;                         // apply energy offset
   double **J1_mag;                      // H exchange coeffs in eV
   double **J1_mech;                     // mech exchange coeffs in
   double **J2, **J3;                    // J1 in eV, J2 in Ang-1, J3 in Ang
diff --git a/src/SPIN/pair_spin_neel.cpp b/src/SPIN/pair_spin_neel.cpp
index fc7cb6ab9a..c2377e7aee 100644
--- a/src/SPIN/pair_spin_neel.cpp
+++ b/src/SPIN/pair_spin_neel.cpp
@@ -612,7 +612,7 @@ double PairSpinNeel::compute_neel_energy(int i, int j, double rsq, double eij[3]
   eij_sj_3 = eij_sj*eij_sj_2;
   epq2 = q2r*(eij_si*eij_sj_3+eij_sj*eij_si_3);
 
-  return (epd+epq1+epq2);
+  return 0.5*(epd+epq1+epq2);
 }
 
 /* ----------------------------------------------------------------------

From 1cb0b9dece6e7e07b6dc8f2ba6bbd790b1bbfe9c Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 5 Oct 2020 17:11:53 -0600
Subject: [PATCH 12/64] - modified all pairs (if on rcut) - clean KOKKOS from
 atom spin/kk (other PR)

---
 src/KOKKOS/Install.sh                       |    2 -
 src/KOKKOS/atom_kokkos.cpp                  |    6 -
 src/KOKKOS/atom_kokkos.h                    |    5 -
 src/KOKKOS/atom_vec_spin_kokkos.cpp         | 1308 -------------------
 src/KOKKOS/atom_vec_spin_kokkos.h           |  132 --
 src/KOKKOS/kokkos_type.h                    |   60 -
 src/SPIN/pair_spin_dipole_cut.cpp           |   48 +-
 src/SPIN/pair_spin_dipole_long.cpp          |   46 +-
 src/SPIN/pair_spin_dmi.cpp                  |   49 +-
 src/SPIN/pair_spin_exchange_biquadratic.cpp |   33 +-
 src/SPIN/pair_spin_magelec.cpp              |   46 +-
 src/SPIN/pair_spin_neel.cpp                 |   44 +-
 12 files changed, 143 insertions(+), 1636 deletions(-)
 delete mode 100644 src/KOKKOS/atom_vec_spin_kokkos.cpp
 delete mode 100644 src/KOKKOS/atom_vec_spin_kokkos.h

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 4c5c9d7e1d..03508578ae 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -63,8 +63,6 @@ action atom_vec_bond_kokkos.cpp atom_vec_bond.cpp
 action atom_vec_bond_kokkos.h atom_vec_bond.h
 action atom_vec_charge_kokkos.cpp
 action atom_vec_charge_kokkos.h
-action atom_vec_spin_kokkos.cpp
-action atom_vec_spin_kokkos.h
 action atom_vec_dpd_kokkos.cpp atom_vec_dpd.cpp
 action atom_vec_dpd_kokkos.h atom_vec_dpd.h
 action atom_vec_full_kokkos.cpp atom_vec_full.cpp
diff --git a/src/KOKKOS/atom_kokkos.cpp b/src/KOKKOS/atom_kokkos.cpp
index b85b063190..a587494d09 100644
--- a/src/KOKKOS/atom_kokkos.cpp
+++ b/src/KOKKOS/atom_kokkos.cpp
@@ -76,12 +76,6 @@ AtomKokkos::~AtomKokkos()
   memoryKK->destroy_kokkos(k_improper_atom3, improper_atom3);
   memoryKK->destroy_kokkos(k_improper_atom4, improper_atom4);
 
-  // SPIN package
-
-  memoryKK->destroy_kokkos(k_sp, sp);
-  memoryKK->destroy_kokkos(k_fm, fm);
-  memoryKK->destroy_kokkos(k_fm_long, fm_long);
-
   // USER-DPD package
   memoryKK->destroy_kokkos(k_uCond,uCond);
   memoryKK->destroy_kokkos(k_uMech,uMech);
diff --git a/src/KOKKOS/atom_kokkos.h b/src/KOKKOS/atom_kokkos.h
index e2c666fea5..6eebbad661 100644
--- a/src/KOKKOS/atom_kokkos.h
+++ b/src/KOKKOS/atom_kokkos.h
@@ -54,11 +54,6 @@ class AtomKokkos : public Atom {
 
   DAT::tdual_float_2d k_dvector;
 
-  // SPIN package
-
-  DAT::tdual_sp_array k_sp;
-  DAT::tdual_fm_array k_fm;
-  DAT::tdual_fm_long_array k_fm_long;
 
 // USER-DPD package
   DAT::tdual_efloat_1d k_uCond, k_uMech, k_uChem, k_uCG, k_uCGnew,
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.cpp b/src/KOKKOS/atom_vec_spin_kokkos.cpp
deleted file mode 100644
index 6ed62c0242..0000000000
--- a/src/KOKKOS/atom_vec_spin_kokkos.cpp
+++ /dev/null
@@ -1,1308 +0,0 @@
-/* ----------------------------------------------------------------------
-
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-
-------------------------------------------------------------------------- */
-
-/* ------------------------------------------------------------------------
-   Contributing authors: Julien Tranchida (SNL)
-                         Aidan Thompson (SNL)
-
-   Please cite the related publication:
-   Tranchida, J., Plimpton, S. J., Thibaudeau, P., & Thompson, A. P. (2018).
-   Massively parallel symplectic algorithm for coupled magnetic spin dynamics
-   and molecular dynamics. Journal of Computational Physics.
-------------------------------------------------------------------------- */
-
-#include "atom_vec_spin_kokkos.h"
-#include "atom_kokkos.h"
-#include "comm_kokkos.h"
-#include "domain.h"
-#include "modify.h"
-#include "fix.h"
-#include "atom_masks.h"
-#include "memory_kokkos.h"
-#include "error.h"
-#include "utils.h"
-
-using namespace LAMMPS_NS;
-
-#define DELTA 10
-
-/* ---------------------------------------------------------------------- */
-
-AtomVecSpinKokkos::AtomVecSpinKokkos(LAMMPS *lmp) : AtomVecKokkos(lmp)
-{
-  molecular = 0;
-  mass_type = 1;
-  forceclearflag = 1;
-
-  comm_x_only = comm_f_only = 0;
-  size_forward = 7;
-  size_reverse = 9;
-  size_border = 10;
-  size_velocity = 3;
-  size_data_atom = 9;
-  size_data_vel = 4;
-  xcol_data = 4;
-
-  atom->sp_flag = 1;
-  
-  k_count = DAT::tdual_int_1d("atom::k_count",1);
-  atomKK = (AtomKokkos *) atom;
-  commKK = (CommKokkos *) comm;
-}
-
-/* ----------------------------------------------------------------------
-   grow atom arrays
-   n = 0 grows arrays by a chunk
-   n > 0 allocates arrays to size n
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::grow(int n)
-{
-  int step = MAX(DELTA,nmax*0.01);
-  if (n == 0) nmax += step;
-  else nmax = n;
-  atomKK->nmax = nmax;
-  if (nmax < 0 || nmax > MAXSMALLINT)
-    error->one(FLERR,"Per-processor system is too big");
-
-  atomKK->sync(Device,ALL_MASK);
-  atomKK->modified(Device,ALL_MASK);
-
-  memoryKK->grow_kokkos(atomKK->k_tag,atomKK->tag,nmax,"atom:tag");
-  memoryKK->grow_kokkos(atomKK->k_type,atomKK->type,nmax,"atom:type");
-  memoryKK->grow_kokkos(atomKK->k_mask,atomKK->mask,nmax,"atom:mask");
-  memoryKK->grow_kokkos(atomKK->k_image,atomKK->image,nmax,"atom:image");
-   
-  // allocating mech. quantities
-
-  memoryKK->grow_kokkos(atomKK->k_x,atomKK->x,nmax,"atom:x");
-  memoryKK->grow_kokkos(atomKK->k_v,atomKK->v,nmax,"atom:v");
-  memoryKK->grow_kokkos(atomKK->k_f,atomKK->f,nmax,"atom:f");
-  
-  // allocating mag. quantities
-
-  memoryKK->grow_kokkos(atomKK->k_sp,atomKK->sp,nmax,"atom:sp");
-  memoryKK->grow_kokkos(atomKK->k_fm,atomKK->fm,nmax,"atom:fm");
-  memoryKK->grow_kokkos(atomKK->k_fm_long,atomKK->fm_long,nmax,"atom:fm_long");
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      modify->fix[atom->extra_grow[iextra]]->grow_arrays(nmax);
-}
-
-/* ----------------------------------------------------------------------
-   reset local array ptrs
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::grow_reset()
-{
-  tag = atomKK->tag;
-  d_tag = atomKK->k_tag.d_view;
-  h_tag = atomKK->k_tag.h_view;
-
-  type = atomKK->type;
-  d_type = atomKK->k_type.d_view;
-  h_type = atomKK->k_type.h_view;
-  mask = atomKK->mask;
-  d_mask = atomKK->k_mask.d_view;
-  h_mask = atomKK->k_mask.h_view;
-  image = atomKK->image;
-  d_image = atomKK->k_image.d_view;
-  h_image = atomKK->k_image.h_view;
-  
-  x = atomKK->x;
-  d_x = atomKK->k_x.d_view;
-  h_x = atomKK->k_x.h_view;
-  v = atomKK->v;
-  d_v = atomKK->k_v.d_view;
-  h_v = atomKK->k_v.h_view;
-  f = atomKK->f;
-  d_f = atomKK->k_f.d_view;
-  h_f = atomKK->k_f.h_view;
-  
-  sp = atomKK->sp; 
-  d_sp = atomKK->k_sp.d_view;
-  h_sp = atomKK->k_sp.h_view;
-  fm = atomKK->fm; 
-  d_fm = atomKK->k_fm.d_view;
-  h_fm = atomKK->k_fm.h_view;
-  fm_long = atomKK->fm_long;
-  d_fm_long = atomKK->k_fm_long.d_view;
-  h_fm_long = atomKK->k_fm_long.h_view;
-}
-
-/* ----------------------------------------------------------------------
-   copy atom I info to atom J
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::copy(int i, int j, int delflag)
-{
-  h_tag[j] = h_tag[i];
-  h_type[j] = h_type[i];
-  mask[j] = mask[i];
-  h_image[j] = h_image[i];
-  h_x(j,0) = h_x(i,0);
-  h_x(j,1) = h_x(i,1);
-  h_x(j,2) = h_x(i,2);
-  h_v(j,0) = h_v(i,0);
-  h_v(j,1) = h_v(i,1);
-  h_v(j,2) = h_v(i,2);
-
-  h_sp(j,0) = h_sp(i,0); 
-  h_sp(j,1) = h_sp(i,1);
-  h_sp(j,2) = h_sp(i,2);
-  h_sp(j,3) = h_sp(i,3);
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      modify->fix[atom->extra_grow[iextra]]->copy_arrays(i,j,delflag);
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG,int TRICLINIC>
-struct AtomVecSpinKokkos_PackComm {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  typename ArrayTypes<DeviceType>::t_sp_array_randomread _sp;
-  typename ArrayTypes<DeviceType>::t_xfloat_2d_um _buf;
-  typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  X_FLOAT _xprd,_yprd,_zprd,_xy,_xz,_yz;
-  X_FLOAT _pbc[6];
-  
-  AtomVecSpinKokkos_PackComm(
-      const typename DAT::tdual_x_array &x,
-      const typename DAT::tdual_sp_array &sp,
-      const typename DAT::tdual_xfloat_2d &buf,
-      const typename DAT::tdual_int_2d &list,
-      const int & iswap,
-      const X_FLOAT &xprd, const X_FLOAT &yprd, const X_FLOAT &zprd,
-      const X_FLOAT &xy, const X_FLOAT &xz, const X_FLOAT &yz, const int* const pbc):
-      _x(x.view<DeviceType>()),_sp(sp.view<DeviceType>()),
-      _list(list.view<DeviceType>()),_iswap(iswap),
-      _xprd(xprd),_yprd(yprd),_zprd(zprd),
-      _xy(xy),_xz(xz),_yz(yz) {
-        const size_t maxsend = (buf.view<DeviceType>().extent(0)*buf.view<DeviceType>().extent(1))/3;
-        // const size_t elements = 3;
-        const size_t elements = 7;
-        buffer_view<DeviceType>(_buf,buf,maxsend,elements);
-        _pbc[0] = pbc[0]; _pbc[1] = pbc[1]; _pbc[2] = pbc[2];
-        _pbc[3] = pbc[3]; _pbc[4] = pbc[4]; _pbc[5] = pbc[5];
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-        const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-          _buf(i,3) = _sp(j,0);
-          _buf(i,4) = _sp(j,1);
-          _buf(i,5) = _sp(j,2);
-          _buf(i,6) = _sp(j,3);
-      } else {
-        if (TRICLINIC == 0) {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-          _buf(i,3) = _sp(j,0);
-          _buf(i,4) = _sp(j,1);
-          _buf(i,5) = _sp(j,2);
-          _buf(i,6) = _sp(j,3);
-        } else {
-          _buf(i,0) = _x(j,0) + _pbc[0]*_xprd + _pbc[5]*_xy + _pbc[4]*_xz;
-          _buf(i,1) = _x(j,1) + _pbc[1]*_yprd + _pbc[3]*_yz;
-          _buf(i,2) = _x(j,2) + _pbc[2]*_zprd;
-          _buf(i,3) = _sp(j,0);
-          _buf(i,4) = _sp(j,1);
-          _buf(i,5) = _sp(j,2);
-          _buf(i,6) = _sp(j,3);
-        }
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType,int PBC_FLAG>
-struct AtomVecSpinKokkos_PackBorder {
-  typedef DeviceType device_type;
-
-  typename ArrayTypes<DeviceType>::t_xfloat_2d _buf;
-  const typename ArrayTypes<DeviceType>::t_int_2d_const _list;
-  const int _iswap;
-  const typename ArrayTypes<DeviceType>::t_x_array_randomread _x;
-  const typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
-  const typename ArrayTypes<DeviceType>::t_int_1d _type;
-  const typename ArrayTypes<DeviceType>::t_int_1d _mask;
-  const typename ArrayTypes<DeviceType>::t_sp_array_randomread _sp;
-  X_FLOAT _dx,_dy,_dz;
-
-  AtomVecSpinKokkos_PackBorder(
-      const typename ArrayTypes<DeviceType>::t_xfloat_2d &buf,
-      const typename ArrayTypes<DeviceType>::t_int_2d_const &list,
-      const int & iswap,
-      const typename ArrayTypes<DeviceType>::t_x_array &x,
-      const typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
-      const typename ArrayTypes<DeviceType>::t_int_1d &type,
-      const typename ArrayTypes<DeviceType>::t_int_1d &mask,
-      const typename ArrayTypes<DeviceType>::t_sp_array &sp,
-      const X_FLOAT &dx, const X_FLOAT &dy, const X_FLOAT &dz):
-  _buf(buf),_list(list),_iswap(iswap),
-    _x(x),_tag(tag),_type(type),_mask(mask),_sp(sp),
-    _dx(dx),_dy(dy),_dz(dz) {}
-  
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      const int j = _list(_iswap,i);
-      if (PBC_FLAG == 0) {
-          _buf(i,0) = _x(j,0);
-          _buf(i,1) = _x(j,1);
-          _buf(i,2) = _x(j,2);
-          _buf(i,3) = d_ubuf(_tag(j)).d;
-          _buf(i,4) = d_ubuf(_type(j)).d;
-          _buf(i,5) = d_ubuf(_mask(j)).d;
-          _buf(i,6) = _sp(j,0);
-          _buf(i,7) = _sp(j,1);
-          _buf(i,8) = _sp(j,2);
-          _buf(i,9) = _sp(j,3);
-      } else {
-          _buf(i,0) = _x(j,0) + _dx;
-          _buf(i,1) = _x(j,1) + _dy;
-          _buf(i,2) = _x(j,2) + _dz;
-          _buf(i,3) = d_ubuf(_tag(j)).d;
-          _buf(i,4) = d_ubuf(_type(j)).d;
-          _buf(i,5) = d_ubuf(_mask(j)).d;
-          _buf(i,6) = _sp(j,0);
-          _buf(i,7) = _sp(j,1);
-          _buf(i,8) = _sp(j,2);
-          _buf(i,9) = _sp(j,3);
-      }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap,
-                               int pbc_flag, int *pbc, ExecutionSpace space)
-{
-  X_FLOAT dx,dy,dz;
-
-  if (pbc_flag != 0) {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0];
-      dy = pbc[1];
-      dz = pbc[2];
-    }
-    if(space==Host) {
-      AtomVecSpinKokkos_PackBorder<LMPHostType,1> f(
-        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
-        iswap,h_x,h_tag,h_type,h_mask,h_sp,dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-    } else {
-      AtomVecSpinKokkos_PackBorder<LMPDeviceType,1> f(
-        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
-        iswap,d_x,d_tag,d_type,d_mask,d_sp,dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-    }
-
-  } else {
-    dx = dy = dz = 0;
-    if(space==Host) {
-      AtomVecSpinKokkos_PackBorder<LMPHostType,0> f(
-        buf.view<LMPHostType>(), k_sendlist.view<LMPHostType>(),
-        iswap,h_x,h_tag,h_type,h_mask,h_sp,dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-    } else {
-      AtomVecSpinKokkos_PackBorder<LMPDeviceType,0> f(
-        buf.view<LMPDeviceType>(), k_sendlist.view<LMPDeviceType>(),
-        iswap,d_x,d_tag,d_type,d_mask,d_sp,dx,dy,dz);
-      Kokkos::parallel_for(n,f);
-    }
-  }
-  return n*size_border;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::pack_border(int n, int *list, double *buf,
-                               int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = ubuf(h_tag(j)).d;
-      buf[m++] = ubuf(h_type(j)).d;
-      buf[m++] = ubuf(h_mask(j)).d;
-      buf[m++] = h_sp(j,0);
-      buf[m++] = h_sp(j,1);
-      buf[m++] = h_sp(j,2);
-      buf[m++] = h_sp(j,3);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0];
-      dy = pbc[1];
-      dz = pbc[2];
-    }
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0) + dx;
-      buf[m++] = h_x(j,1) + dy;
-      buf[m++] = h_x(j,2) + dz;
-      buf[m++] = ubuf(h_tag(j)).d;
-      buf[m++] = ubuf(h_type(j)).d;
-      buf[m++] = ubuf(h_mask(j)).d;
-      buf[m++] = h_sp(j,0);
-      buf[m++] = h_sp(j,1);
-      buf[m++] = h_sp(j,2);
-      buf[m++] = h_sp(j,3);
-    }
-  }
-  
-  if (atom->nextra_border)
-    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::pack_border_vel(int n, int *list, double *buf,
-                                   int pbc_flag, int *pbc)
-{
-  int i,j,m;
-  double dx,dy,dz,dvx,dvy,dvz;
-
-  m = 0;
-  if (pbc_flag == 0) {
-    for (i = 0; i < n; i++) {
-      j = list[i];
-      buf[m++] = h_x(j,0);
-      buf[m++] = h_x(j,1);
-      buf[m++] = h_x(j,2);
-      buf[m++] = ubuf(h_tag(j)).d;
-      buf[m++] = ubuf(h_type(j)).d;
-      buf[m++] = ubuf(h_mask(j)).d;
-      buf[m++] = h_sp(j,0);
-      buf[m++] = h_sp(j,1);
-      buf[m++] = h_sp(j,2);
-      buf[m++] = h_sp(j,3);
-      buf[m++] = h_v(j,0);
-      buf[m++] = h_v(j,1);
-      buf[m++] = h_v(j,2);
-    }
-  } else {
-    if (domain->triclinic == 0) {
-      dx = pbc[0]*domain->xprd;
-      dy = pbc[1]*domain->yprd;
-      dz = pbc[2]*domain->zprd;
-    } else {
-      dx = pbc[0];
-      dy = pbc[1];
-      dz = pbc[2];
-    }
-    if (!deform_vremap) {
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = ubuf(h_tag(j)).d;
-        buf[m++] = ubuf(h_type(j)).d;
-        buf[m++] = ubuf(h_mask(j)).d;
-        buf[m++] = h_sp(j,0);
-        buf[m++] = h_sp(j,1);
-        buf[m++] = h_sp(j,2);
-        buf[m++] = h_sp(j,3);
-        buf[m++] = h_v(j,0);
-        buf[m++] = h_v(j,1);
-        buf[m++] = h_v(j,2);
-      }
-    } else {
-      dvx = pbc[0]*h_rate[0] + pbc[5]*h_rate[5] + pbc[4]*h_rate[4];
-      dvy = pbc[1]*h_rate[1] + pbc[3]*h_rate[3];
-      dvz = pbc[2]*h_rate[2];
-      for (i = 0; i < n; i++) {
-        j = list[i];
-        buf[m++] = h_x(j,0) + dx;
-        buf[m++] = h_x(j,1) + dy;
-        buf[m++] = h_x(j,2) + dz;
-        buf[m++] = ubuf(h_tag(j)).d;
-        buf[m++] = ubuf(h_type(j)).d;
-        buf[m++] = ubuf(h_mask(j)).d;
-        buf[m++] = h_sp(j,0);
-        buf[m++] = h_sp(j,1);
-        buf[m++] = h_sp(j,2);
-        buf[m++] = h_sp(j,3);
-        if (mask[i] & deform_groupbit) {
-          buf[m++] = h_v(j,0) + dvx;
-          buf[m++] = h_v(j,1) + dvy;
-          buf[m++] = h_v(j,2) + dvz;
-        } else {
-          buf[m++] = h_v(j,0);
-          buf[m++] = h_v(j,1);
-          buf[m++] = h_v(j,2);
-        }
-      }
-    }
-  }
-  
-  if (atom->nextra_border)
-    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-      m += modify->fix[atom->extra_border[iextra]]->pack_border(n,list,&buf[m]);
-
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::pack_border_hybrid(int n, int *list, double *buf)
-{
-  int i,j,m;
-
-  m = 0;
-  for (i = 0; i < n; i++) {
-    j = list[i];
-    buf[m++] = h_sp(j,0);
-    buf[m++] = h_sp(j,1);
-    buf[m++] = h_sp(j,2);
-    buf[m++] = h_sp(j,3);
-  }
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecSpinKokkos_UnpackBorder {
-  typedef DeviceType device_type;
-
-  const typename ArrayTypes<DeviceType>::t_xfloat_2d_const _buf;
-  typename ArrayTypes<DeviceType>::t_x_array _x;
-  typename ArrayTypes<DeviceType>::t_tagint_1d _tag;
-  typename ArrayTypes<DeviceType>::t_int_1d _type;
-  typename ArrayTypes<DeviceType>::t_int_1d _mask;
-  typename ArrayTypes<DeviceType>::t_sp_array _sp;
-  int _first;
-
-
-  AtomVecSpinKokkos_UnpackBorder(
-      const typename ArrayTypes<DeviceType>::t_xfloat_2d_const &buf,
-      typename ArrayTypes<DeviceType>::t_x_array &x,
-      typename ArrayTypes<DeviceType>::t_tagint_1d &tag,
-      typename ArrayTypes<DeviceType>::t_int_1d &type,
-      typename ArrayTypes<DeviceType>::t_int_1d &mask,
-      typename ArrayTypes<DeviceType>::t_sp_array &sp,
-      const int& first):
-    _buf(buf),_x(x),_tag(tag),_type(type),_mask(mask),_sp(sp),_first(first){
-  };
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int& i) const {
-      _x(i+_first,0) = _buf(i,0);
-      _x(i+_first,1) = _buf(i,1);
-      _x(i+_first,2) = _buf(i,2);
-      _tag(i+_first) = (tagint) d_ubuf(_buf(i,3)).i;
-      _type(i+_first) = (int) d_ubuf(_buf(i,4)).i;
-      _mask(i+_first) = (int) d_ubuf(_buf(i,5)).i;
-      _sp(i+_first,0) = _buf(i,6);
-      _sp(i+_first,1) = _buf(i,7);
-      _sp(i+_first,2) = _buf(i,8);
-      _sp(i+_first,3) = _buf(i,9);
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::unpack_border_kokkos(const int &n, const int &first,
-                     const DAT::tdual_xfloat_2d &buf,ExecutionSpace space) {
-  if (first+n >= nmax) {
-    grow(first+n+100);
-  }
-  if(space==Host) {
-    struct AtomVecSpinKokkos_UnpackBorder<LMPHostType>
-      f(buf.view<LMPHostType>(),h_x,h_tag,h_type,h_mask,h_sp,first);
-    Kokkos::parallel_for(n,f);
-  } else {
-    struct AtomVecSpinKokkos_UnpackBorder<LMPDeviceType>
-      f(buf.view<LMPDeviceType>(),d_x,d_tag,d_type,d_mask,d_sp,first);
-    Kokkos::parallel_for(n,f);
-  }
-  atomKK->modified(space,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|SP_MASK);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::unpack_border(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-
-  for (i = first; i < last; i++) {
-    if (i == nmax) {
-      grow(0);
-    }
-    atomKK->modified(Host,X_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|SP_MASK);
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_tag(i) =  (tagint)  ubuf(buf[m++]).i;
-    h_type(i) = (int) ubuf(buf[m++]).i;
-    h_mask(i) = (int) ubuf(buf[m++]).i;
-    h_sp(i,0) = buf[m++];
-    h_sp(i,1) = buf[m++];
-    h_sp(i,2) = buf[m++];
-    h_sp(i,3) = buf[m++];
-  }
-
-  if (atom->nextra_border)
-    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-      m += modify->fix[atom->extra_border[iextra]]->
-        unpack_border(n,first,&buf[m]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::unpack_border_vel(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++) {
-    if (i == nmax) grow(0);
-    atomKK->modified(Host,X_MASK|V_MASK|TAG_MASK|TYPE_MASK|MASK_MASK|SP_MASK);
-    h_x(i,0) = buf[m++];
-    h_x(i,1) = buf[m++];
-    h_x(i,2) = buf[m++];
-    h_tag(i) =  (tagint)  ubuf(buf[m++]).i;
-    h_type(i) = (int) ubuf(buf[m++]).i;
-    h_mask(i) = (int) ubuf(buf[m++]).i;
-    h_sp(i,0) = buf[m++];
-    h_sp(i,1) = buf[m++];
-    h_sp(i,2) = buf[m++];
-    h_sp(i,3) = buf[m++];
-    h_v(i,0) = buf[m++];
-    h_v(i,1) = buf[m++];
-    h_v(i,2) = buf[m++];
-  }
-
-  if (atom->nextra_border)
-    for (int iextra = 0; iextra < atom->nextra_border; iextra++)
-      m += modify->fix[atom->extra_border[iextra]]->
-        unpack_border(n,first,&buf[m]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::unpack_border_hybrid(int n, int first, double *buf)
-{
-  int i,m,last;
-
-  m = 0;
-  last = first + n;
-  for (i = first; i < last; i++)
-    h_sp(i,0) = buf[m++];
-    h_sp(i,1) = buf[m++];
-    h_sp(i,2) = buf[m++];
-    h_sp(i,3) = buf[m++];
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecSpinKokkos_PackExchangeFunctor {
-  typedef DeviceType device_type;
-  typedef ArrayTypes<DeviceType> AT;
-  typename AT::t_x_array_randomread _x;
-  typename AT::t_v_array_randomread _v;
-  typename AT::t_tagint_1d_randomread _tag;
-  typename AT::t_int_1d_randomread _type;
-  typename AT::t_int_1d_randomread _mask;
-  typename AT::t_imageint_1d_randomread _image;
-  typename AT::t_sp_array_randomread _sp;
-  typename AT::t_x_array _xw;
-  typename AT::t_v_array _vw;
-  typename AT::t_tagint_1d _tagw;
-  typename AT::t_int_1d _typew;
-  typename AT::t_int_1d _maskw;
-  typename AT::t_imageint_1d _imagew;
-  typename AT::t_sp_array _spw;
-
-  typename AT::t_xfloat_2d_um _buf;
-  typename AT::t_int_1d_const _sendlist;
-  typename AT::t_int_1d_const _copylist;
-  int _nlocal,_dim;
-  X_FLOAT _lo,_hi;
-
-  AtomVecSpinKokkos_PackExchangeFunctor(
-      const AtomKokkos* atom,
-      const typename AT::tdual_xfloat_2d buf,
-      typename AT::tdual_int_1d sendlist,
-      typename AT::tdual_int_1d copylist,int nlocal, int dim,
-                X_FLOAT lo, X_FLOAT hi):
-    _x(atom->k_x.view<DeviceType>()),
-    _v(atom->k_v.view<DeviceType>()),
-    _tag(atom->k_tag.view<DeviceType>()),
-    _type(atom->k_type.view<DeviceType>()),
-    _mask(atom->k_mask.view<DeviceType>()),
-    _image(atom->k_image.view<DeviceType>()),
-    _sp(atom->k_sp.view<DeviceType>()),
-    _xw(atom->k_x.view<DeviceType>()),
-    _vw(atom->k_v.view<DeviceType>()),
-    _tagw(atom->k_tag.view<DeviceType>()),
-    _typew(atom->k_type.view<DeviceType>()),
-    _maskw(atom->k_mask.view<DeviceType>()),
-    _imagew(atom->k_image.view<DeviceType>()),
-    _spw(atom->k_sp.view<DeviceType>()),
-    _sendlist(sendlist.template view<DeviceType>()),
-    _copylist(copylist.template view<DeviceType>()),
-    _nlocal(nlocal),_dim(dim),
-    _lo(lo),_hi(hi){
-    const size_t elements = 15;
-    const int maxsendlist = (buf.template view<DeviceType>().extent(0)*
-                             buf.template view<DeviceType>().extent(1))/elements;
-
-    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &mysend) const {
-    const int i = _sendlist(mysend);
-    _buf(mysend,0) = 15;
-    _buf(mysend,1) = _x(i,0);
-    _buf(mysend,2) = _x(i,1);
-    _buf(mysend,3) = _x(i,2);
-    _buf(mysend,4) = _v(i,0);
-    _buf(mysend,5) = _v(i,1);
-    _buf(mysend,6) = _v(i,2);
-    _buf(mysend,7) = d_ubuf(_tag[i]).d;
-    _buf(mysend,8) = d_ubuf(_type[i]).d;
-    _buf(mysend,9) = d_ubuf(_mask[i]).d;
-    _buf(mysend,10) = d_ubuf(_image[i]).d;
-    _buf(mysend,11) = _sp(i,0);
-    _buf(mysend,12) = _sp(i,1);
-    _buf(mysend,13) = _sp(i,2);
-    _buf(mysend,14) = _sp(i,3);
-    const int j = _copylist(mysend);
-
-    if(j>-1) {
-    _xw(i,0) = _x(j,0);
-    _xw(i,1) = _x(j,1);
-    _xw(i,2) = _x(j,2);
-    _vw(i,0) = _v(j,0);
-    _vw(i,1) = _v(j,1);
-    _vw(i,2) = _v(j,2);
-    _tagw(i) = _tag(j);
-    _typew(i) = _type(j);
-    _maskw(i) = _mask(j);
-    _imagew(i) = _image(j);
-    _spw(i,0) = _sp(j,0);
-    _spw(i,1) = _sp(j,1);
-    _spw(i,2) = _sp(j,2);
-    _spw(i,3) = _sp(j,3);
-    }
-  }
-};
-  
-/* ---------------------------------------------------------------------- */
-  
-int AtomVecSpinKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &k_buf,
-                                              DAT::tdual_int_1d k_sendlist,
-                                              DAT::tdual_int_1d k_copylist,
-                                              ExecutionSpace space,int dim,
-                                              X_FLOAT lo,X_FLOAT hi )
-{
-  if(nsend > (int) (k_buf.view<LMPHostType>().extent(0)*k_buf.view<LMPHostType>().extent(1))/15) {
-    int newsize = nsend*15/k_buf.view<LMPHostType>().extent(1)+1;
-    k_buf.resize(newsize,k_buf.view<LMPHostType>().extent(1));
-  }
-  if(space == Host) {
-    AtomVecSpinKokkos_PackExchangeFunctor<LMPHostType>
-      f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
-    Kokkos::parallel_for(nsend,f);
-    return nsend*15;
-  } else {
-    AtomVecSpinKokkos_PackExchangeFunctor<LMPDeviceType>
-      f(atomKK,k_buf,k_sendlist,k_copylist,atom->nlocal,dim,lo,hi);
-    Kokkos::parallel_for(nsend,f);
-    return nsend*15;
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-  
-int AtomVecSpinKokkos::pack_exchange(int i, double *buf)
-{
-  int m = 1;
-  buf[m++] = h_x(i,0);
-  buf[m++] = h_x(i,1);
-  buf[m++] = h_x(i,2);
-  buf[m++] = h_v(i,0);
-  buf[m++] = h_v(i,1);
-  buf[m++] = h_v(i,2);
-  buf[m++] = ubuf(h_tag(i)).d;
-  buf[m++] = ubuf(h_type(i)).d;
-  buf[m++] = ubuf(h_mask(i)).d;
-  buf[m++] = ubuf(h_image(i)).d;
-  buf[m++] = h_sp(i,0);
-  buf[m++] = h_sp(i,1);
-  buf[m++] = h_sp(i,2);
-  buf[m++] = h_sp(i,3);
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      m += modify->fix[atom->extra_grow[iextra]]->pack_exchange(i,&buf[m]);
-
-  buf[0] = m;
-  return m;
-}
-
-/* ---------------------------------------------------------------------- */
-
-template<class DeviceType>
-struct AtomVecSpinKokkos_UnpackExchangeFunctor {
-  typedef DeviceType device_type;
-  typedef ArrayTypes<DeviceType> AT;
-  typename AT::t_x_array _x;
-  typename AT::t_v_array _v;
-  typename AT::t_tagint_1d _tag;
-  typename AT::t_int_1d _type;
-  typename AT::t_int_1d _mask;
-  typename AT::t_imageint_1d _image;
-  typename AT::t_sp_array _sp;
-  typename AT::t_xfloat_2d_um _buf;
-  typename AT::t_int_1d _nlocal;
-  int _dim;
-  X_FLOAT _lo,_hi;
-
-  AtomVecSpinKokkos_UnpackExchangeFunctor(
-      const AtomKokkos* atom,
-      const typename AT::tdual_xfloat_2d buf,
-      typename AT::tdual_int_1d nlocal,
-      int dim, X_FLOAT lo, X_FLOAT hi):
-    _x(atom->k_x.view<DeviceType>()),
-    _v(atom->k_v.view<DeviceType>()),
-    _tag(atom->k_tag.view<DeviceType>()),
-    _type(atom->k_type.view<DeviceType>()),
-    _mask(atom->k_mask.view<DeviceType>()),
-    _image(atom->k_image.view<DeviceType>()),
-    _sp(atom->k_sp.view<DeviceType>()),
-    _nlocal(nlocal.template view<DeviceType>()),_dim(dim),
-    _lo(lo),_hi(hi){
-    const size_t elements = 15;
-    const int maxsendlist = (buf.template view<DeviceType>().extent(0)*buf.template view<DeviceType>().extent(1))/elements;
-
-    buffer_view<DeviceType>(_buf,buf,maxsendlist,elements);
-  }
-
-  KOKKOS_INLINE_FUNCTION
-  void operator() (const int &myrecv) const {
-    X_FLOAT x = _buf(myrecv,_dim+1);
-    if (x >= _lo && x < _hi) {
-      int i = Kokkos::atomic_fetch_add(&_nlocal(0),1);
-      _x(i,0) = _buf(myrecv,1);
-      _x(i,1) = _buf(myrecv,2);
-      _x(i,2) = _buf(myrecv,3);
-      _v(i,0) = _buf(myrecv,4);
-      _v(i,1) = _buf(myrecv,5);
-      _v(i,2) = _buf(myrecv,6);
-      _tag[i] = (tagint) d_ubuf(_buf(myrecv,7)).i;
-      _type[i] = (int) d_ubuf(_buf(myrecv,8)).i;
-      _mask[i] = (int) d_ubuf(_buf(myrecv,9)).i;
-      _image[i] = (imageint) d_ubuf(_buf(myrecv,10)).i;
-      _sp(i,0) = _buf(myrecv,11);
-      _sp(i,1) = _buf(myrecv,12);
-      _sp(i,2) = _buf(myrecv,13);
-      _sp(i,3) = _buf(myrecv,14);
-    }
-  }
-};
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf,int nrecv,
-                                                int nlocal,int dim,X_FLOAT lo,X_FLOAT hi,
-                                                ExecutionSpace space) {
-  if(space == Host) {
-    k_count.h_view(0) = nlocal;
-    AtomVecSpinKokkos_UnpackExchangeFunctor<LMPHostType> f(atomKK,k_buf,k_count,dim,lo,hi);
-    Kokkos::parallel_for(nrecv/15,f);
-    return k_count.h_view(0);
-  } else {
-    k_count.h_view(0) = nlocal;
-    k_count.modify<LMPHostType>();
-    k_count.sync<LMPDeviceType>();
-    AtomVecSpinKokkos_UnpackExchangeFunctor<LMPDeviceType>
-      f(atomKK,k_buf,k_count,dim,lo,hi);
-    Kokkos::parallel_for(nrecv/15,f);
-    k_count.modify<LMPDeviceType>();
-    k_count.sync<LMPHostType>();
-
-    return k_count.h_view(0);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::unpack_exchange(double *buf)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) grow(0);
-  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-           MASK_MASK | IMAGE_MASK | SP_MASK);
-
-  int m = 1;
-  h_x(nlocal,0) = buf[m++];
-  h_x(nlocal,1) = buf[m++];
-  h_x(nlocal,2) = buf[m++];
-  h_v(nlocal,0) = buf[m++];
-  h_v(nlocal,1) = buf[m++];
-  h_v(nlocal,2) = buf[m++];
-  h_tag(nlocal) = (tagint) ubuf(buf[m++]).i;
-  h_type(nlocal) = (int) ubuf(buf[m++]).i;
-  h_mask(nlocal) = (int) ubuf(buf[m++]).i;
-  h_image(nlocal) = (imageint) ubuf(buf[m++]).i;
-  h_sp(nlocal,0) = buf[m++];
-  h_sp(nlocal,1) = buf[m++];
-  h_sp(nlocal,2) = buf[m++];
-  h_sp(nlocal,3) = buf[m++];
-
-  if (atom->nextra_grow)
-    for (int iextra = 0; iextra < atom->nextra_grow; iextra++)
-      m += modify->fix[atom->extra_grow[iextra]]->
-        unpack_exchange(nlocal,&buf[m]);
-
-  atom->nlocal++;
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   size of restart data for all atoms owned by this proc
-   include extra data stored by fixes
-------------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::size_restart()
-{
-  int i;
-
-  int nlocal = atom->nlocal;
-  int n = 15 * nlocal;
-
-  if (atom->nextra_restart)
-    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
-      for (i = 0; i < nlocal; i++)
-        n += modify->fix[atom->extra_restart[iextra]]->size_restart(i);
-
-  return n;
-}
-
-/* ----------------------------------------------------------------------
-   pack atom I's data for restart file including extra quantities
-   xyz must be 1st 3 values, so that read_restart can test on them
-   molecular types may be negative, but write as positive
-------------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::pack_restart(int i, double *buf)
-{
-  atomKK->sync(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-            MASK_MASK | IMAGE_MASK | SP_MASK);
-
-  int m = 1;
-  buf[m++] = h_x(i,0);
-  buf[m++] = h_x(i,1);
-  buf[m++] = h_x(i,2);
-  buf[m++] = ubuf(h_tag(i)).d;
-  buf[m++] = ubuf(h_type(i)).d;
-  buf[m++] = ubuf(h_mask(i)).d;
-  buf[m++] = ubuf(h_image(i)).d;
-  buf[m++] = h_v(i,0);
-  buf[m++] = h_v(i,1);
-  buf[m++] = h_v(i,2);
-
-  buf[m++] = h_sp(i,0);
-  buf[m++] = h_sp(i,1);
-  buf[m++] = h_sp(i,2);
-  buf[m++] = h_sp(i,3);
-
-  if (atom->nextra_restart)
-    for (int iextra = 0; iextra < atom->nextra_restart; iextra++)
-      m += modify->fix[atom->extra_restart[iextra]]->pack_restart(i,&buf[m]);
-
-  buf[0] = m;
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   unpack data for one atom from restart file including extra quantities
-------------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::unpack_restart(double *buf)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) {
-    grow(0);
-    if (atom->nextra_store)
-      memory->grow(atom->extra,nmax,atom->nextra_store,"atom:extra");
-  }
-
-  atomKK->modified(Host,X_MASK | V_MASK | TAG_MASK | TYPE_MASK |
-           MASK_MASK | IMAGE_MASK | SP_MASK);
-
-  int m = 1;
-  h_x(nlocal,0) = buf[m++];
-  h_x(nlocal,1) = buf[m++];
-  h_x(nlocal,2) = buf[m++];
-  h_tag(nlocal) = (tagint) ubuf(buf[m++]).i;
-  h_type(nlocal) = (int) ubuf(buf[m++]).i;
-  h_mask(nlocal) = (int) ubuf(buf[m++]).i;
-  h_image(nlocal) = (imageint) ubuf(buf[m++]).i;
-  h_v(nlocal,0) = buf[m++];
-  h_v(nlocal,1) = buf[m++];
-  h_v(nlocal,2) = buf[m++];
-
-  h_sp(nlocal,0) = buf[m++];
-  h_sp(nlocal,1) = buf[m++];
-  h_sp(nlocal,2) = buf[m++];
-  h_sp(nlocal,3) = buf[m++];
-
-  double **extra = atom->extra;
-  if (atom->nextra_store) {
-    int size = static_cast<int> (buf[0]) - m;
-    for (int i = 0; i < size; i++) extra[nlocal][i] = buf[m++];
-  }
-
-  atom->nlocal++;
-  return m;
-}
-
-/* ----------------------------------------------------------------------
-   create one atom of itype at coord
-   set other values to defaults
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::create_atom(int itype, double *coord)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) {
-    atomKK->modified(Host,ALL_MASK);
-    grow(0);
-  }
-  atomKK->sync(Host,ALL_MASK);
-  atomKK->modified(Host,ALL_MASK);
-
-  tag[nlocal] = 0;
-  type[nlocal] = itype;
-  h_x(nlocal,0) = coord[0];
-  h_x(nlocal,1) = coord[1];
-  h_x(nlocal,2) = coord[2];
-  h_mask[nlocal] = 1;
-  h_image[nlocal] = ((imageint) IMGMAX << IMG2BITS) |
-    ((imageint) IMGMAX << IMGBITS) | IMGMAX;
-  h_v(nlocal,0) = 0.0;
-  h_v(nlocal,1) = 0.0;
-  h_v(nlocal,2) = 0.0;
-
-  h_sp(nlocal,0) = 0.0;
-  h_sp(nlocal,1) = 0.0;
-  h_sp(nlocal,2) = 0.0;
-  h_sp(nlocal,3) = 0.0;
-
-  atom->nlocal++;
-}
-
-/* ----------------------------------------------------------------------
-   unpack one line from Atoms section of data file
-   initialize other atom quantities
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::data_atom(double *coord, imageint imagetmp,
-                                    char **values)
-{
-  int nlocal = atom->nlocal;
-  if (nlocal == nmax) grow(0);
-
-  h_tag[nlocal] = utils::inumeric(FLERR,values[0],true,lmp);
-  h_type[nlocal] = utils::inumeric(FLERR,values[1],true,lmp);
-  if (type[nlocal] <= 0 || type[nlocal] > atom->ntypes)
-    error->one(FLERR,"Invalid atom type in Atoms section of data file");
-
-  h_sp(nlocal,3) = utils::numeric(FLERR,values[2],true,lmp);
-  h_sp(nlocal,0) = utils::numeric(FLERR,values[6],true,lmp);
-  h_sp(nlocal,1) = utils::numeric(FLERR,values[7],true,lmp);
-  h_sp(nlocal,2) = utils::numeric(FLERR,values[8],true,lmp);
-  double inorm = 1.0/sqrt(sp[nlocal][0]*sp[nlocal][0] +
-                          sp[nlocal][1]*sp[nlocal][1] +
-                          sp[nlocal][2]*sp[nlocal][2]);
-  h_sp(nlocal,0) *= inorm;
-  h_sp(nlocal,1) *= inorm;
-  h_sp(nlocal,2) *= inorm;
-
-  h_x(nlocal,0) = coord[0];
-  h_x(nlocal,1) = coord[1];
-  h_x(nlocal,2) = coord[2];
-
-  h_image[nlocal] = imagetmp;
-
-  h_mask[nlocal] = 1;
-  h_v(nlocal,0) = 0.0;
-  h_v(nlocal,1) = 0.0;
-  h_v(nlocal,2) = 0.0;
-
-  atomKK->modified(Host,ALL_MASK);
-
-  atom->nlocal++;
-}
-
-/* ----------------------------------------------------------------------
-   unpack hybrid quantities from one line in Atoms section of data file
-   initialize other atom quantities for this sub-style
-------------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::data_atom_hybrid(int nlocal, char **values)
-{
-  h_sp(nlocal,3) = utils::numeric(FLERR,values[0],true,lmp);
-  h_sp(nlocal,0) = utils::numeric(FLERR,values[1],true,lmp);
-  h_sp(nlocal,1) = utils::numeric(FLERR,values[2],true,lmp);
-  h_sp(nlocal,2) = utils::numeric(FLERR,values[3],true,lmp);
-  double inorm = 1.0/sqrt(sp[nlocal][0]*sp[nlocal][0] +
-                          sp[nlocal][1]*sp[nlocal][1] +
-                          sp[nlocal][2]*sp[nlocal][2]);
-  sp[nlocal][0] *= inorm;
-  sp[nlocal][1] *= inorm;
-  sp[nlocal][2] *= inorm;
-
-  return 4;
-}
-
-/* ----------------------------------------------------------------------
-   pack atom info for data file including 3 image flags
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::pack_data(double **buf)
-{
-  int nlocal = atom->nlocal;
-  for (int i = 0; i < nlocal; i++) {
-    buf[i][0] = h_tag[i];
-    buf[i][1] = h_type[i];
-    buf[i][2] = h_sp(i,0);
-    buf[i][3] = h_x(i,0);
-    buf[i][4] = h_x(i,1);
-    buf[i][5] = h_x(i,2);
-    buf[i][2] = h_sp(i,1);
-    buf[i][2] = h_sp(i,2);
-    buf[i][2] = h_sp(i,3);
-    buf[i][6] = (h_image[i] & IMGMASK) - IMGMAX;
-    buf[i][7] = (h_image[i] >> IMGBITS & IMGMASK) - IMGMAX;
-    buf[i][8] = (h_image[i] >> IMG2BITS) - IMGMAX;
-  }
-}
-
-/* ----------------------------------------------------------------------
-   pack hybrid atom info for data file
-------------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::pack_data_hybrid(int i, double *buf)
-{
-  buf[0] = h_sp(i,3);
-  buf[1] = h_sp(i,0);
-  buf[2] = h_sp(i,1);
-  buf[3] = h_sp(i,2);
-  return 4;
-}
-
-/* ----------------------------------------------------------------------
-   write atom info to data file including 3 image flags
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::write_data(FILE *fp, int n, double **buf)
-{
-  for (int i = 0; i < n; i++)
-    fprintf(fp,"%d %d %-1.16e %-1.16e %-1.16e %-1.16e %d %d %d\n",
-            (int) buf[i][0],(int) buf[i][1],buf[i][2],buf[i][3],buf[i][4],
-            buf[i][5],(int) buf[i][6],(int) buf[i][7],(int) buf[i][8]);
-}
-
-/* ----------------------------------------------------------------------
-   write hybrid atom info to data file
-------------------------------------------------------------------------- */
-
-int AtomVecSpinKokkos::write_data_hybrid(FILE *fp, double *buf)
-{
-  fprintf(fp," %-1.16e %-1.16e %-1.16e %-1.16e",buf[0],buf[1],buf[2],buf[3]);
-  return 4;
-}
-
-/* ----------------------------------------------------------------------
-   return # of bytes of allocated memory
-------------------------------------------------------------------------- */
-
-bigint AtomVecSpinKokkos::memory_usage()
-{
-  bigint bytes = 0;
-
-  if (atom->memcheck("tag")) bytes += memory->usage(tag,nmax);
-  if (atom->memcheck("type")) bytes += memory->usage(type,nmax);
-  if (atom->memcheck("mask")) bytes += memory->usage(mask,nmax);
-  if (atom->memcheck("image")) bytes += memory->usage(image,nmax);
-  if (atom->memcheck("x")) bytes += memory->usage(x,nmax,3);
-  if (atom->memcheck("v")) bytes += memory->usage(v,nmax,3);
-  if (atom->memcheck("f")) bytes += memory->usage(f,nmax*commKK->nthreads,3);
-
-  if (atom->memcheck("sp")) bytes += memory->usage(sp,nmax,4);
-  if (atom->memcheck("fm")) bytes += memory->usage(fm,nmax*comm->nthreads,3);
-  if (atom->memcheck("fm_long")) bytes += memory->usage(fm_long,nmax*comm->nthreads,3);
-
-  return bytes;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::sync(ExecutionSpace space, unsigned int mask)
-{
-  if (space == Device) {
-    if (mask & X_MASK) atomKK->k_x.sync<LMPDeviceType>();
-    if (mask & V_MASK) atomKK->k_v.sync<LMPDeviceType>();
-    if (mask & F_MASK) atomKK->k_f.sync<LMPDeviceType>();
-    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPDeviceType>();
-    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPDeviceType>();
-    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPDeviceType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPDeviceType>();
-    if (mask & SP_MASK) atomKK->k_sp.sync<LMPDeviceType>();
-    if (mask & FM_MASK) atomKK->k_fm.sync<LMPDeviceType>();
-    if (mask & FML_MASK) atomKK->k_fm_long.sync<LMPDeviceType>();
-  } else {
-    if (mask & X_MASK) atomKK->k_x.sync<LMPHostType>();
-    if (mask & V_MASK) atomKK->k_v.sync<LMPHostType>();
-    if (mask & F_MASK) atomKK->k_f.sync<LMPHostType>();
-    if (mask & TAG_MASK) atomKK->k_tag.sync<LMPHostType>();
-    if (mask & TYPE_MASK) atomKK->k_type.sync<LMPHostType>();
-    if (mask & MASK_MASK) atomKK->k_mask.sync<LMPHostType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.sync<LMPHostType>();
-    if (mask & SP_MASK) atomKK->k_sp.sync<LMPHostType>();
-    if (mask & FM_MASK) atomKK->k_fm.sync<LMPHostType>();
-    if (mask & FML_MASK) atomKK->k_fm_long.sync<LMPHostType>();
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::modified(ExecutionSpace space, unsigned int mask)
-{
-  if (space == Device) {
-    if (mask & X_MASK) atomKK->k_x.modify<LMPDeviceType>();
-    if (mask & V_MASK) atomKK->k_v.modify<LMPDeviceType>();
-    if (mask & F_MASK) atomKK->k_f.modify<LMPDeviceType>();
-    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPDeviceType>();
-    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPDeviceType>();
-    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPDeviceType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPDeviceType>();
-    if (mask & SP_MASK) atomKK->k_sp.modify<LMPDeviceType>();
-    if (mask & FM_MASK) atomKK->k_fm.modify<LMPDeviceType>();
-    if (mask & FML_MASK) atomKK->k_fm_long.modify<LMPDeviceType>();
-  } else {
-    if (mask & X_MASK) atomKK->k_x.modify<LMPHostType>();
-    if (mask & V_MASK) atomKK->k_v.modify<LMPHostType>();
-    if (mask & F_MASK) atomKK->k_f.modify<LMPHostType>();
-    if (mask & TAG_MASK) atomKK->k_tag.modify<LMPHostType>();
-    if (mask & TYPE_MASK) atomKK->k_type.modify<LMPHostType>();
-    if (mask & MASK_MASK) atomKK->k_mask.modify<LMPHostType>();
-    if (mask & IMAGE_MASK) atomKK->k_image.modify<LMPHostType>();
-    if (mask & SP_MASK) atomKK->k_sp.modify<LMPHostType>();
-    if (mask & FM_MASK) atomKK->k_fm.modify<LMPHostType>();
-    if (mask & FML_MASK) atomKK->k_fm_long.modify<LMPHostType>();
-  }
-}
-
-void AtomVecSpinKokkos::sync_overlapping_device(ExecutionSpace space, unsigned int mask)
-{
-  if (space == Device) {
-    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
-    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
-    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
-    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
-    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
-    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
-    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
-    if ((mask & SP_MASK) && atomKK->k_sp.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_sp_array>(atomKK->k_sp,space);
-    if ((mask & FM_MASK) && atomKK->k_sp.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_fm_array>(atomKK->k_fm,space);
-    if ((mask & FML_MASK) && atomKK->k_fm_long.need_sync<LMPDeviceType>())
-      perform_async_copy<DAT::tdual_fm_long_array>(atomKK->k_fm_long,space);
-  } else {
-    if ((mask & X_MASK) && atomKK->k_x.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_x_array>(atomKK->k_x,space);
-    if ((mask & V_MASK) && atomKK->k_v.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_v_array>(atomKK->k_v,space);
-    if ((mask & F_MASK) && atomKK->k_f.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_f_array>(atomKK->k_f,space);
-    if ((mask & TAG_MASK) && atomKK->k_tag.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_tagint_1d>(atomKK->k_tag,space);
-    if ((mask & TYPE_MASK) && atomKK->k_type.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_type,space);
-    if ((mask & MASK_MASK) && atomKK->k_mask.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_int_1d>(atomKK->k_mask,space);
-    if ((mask & IMAGE_MASK) && atomKK->k_image.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_imageint_1d>(atomKK->k_image,space);
-    if ((mask & SP_MASK) && atomKK->k_sp.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_sp_array>(atomKK->k_sp,space);
-    if ((mask & FM_MASK) && atomKK->k_fm.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_fm_array>(atomKK->k_fm,space);
-    if ((mask & FML_MASK) && atomKK->k_fm_long.need_sync<LMPHostType>())
-      perform_async_copy<DAT::tdual_fm_long_array>(atomKK->k_fm_long,space);
-  }
-}
-
-/* ----------------------------------------------------------------------
-   clear all forces (mech and mag)
-------------------------------------------------------------------------- */
-
-void AtomVecSpinKokkos::force_clear(int /*n*/, size_t nbytes)
-{
-  memset(&atom->f[0][0],0,3*nbytes);
-  memset(&atom->fm[0][0],0,3*nbytes);
-  memset(&atom->fm_long[0][0],0,3*nbytes);
-}
diff --git a/src/KOKKOS/atom_vec_spin_kokkos.h b/src/KOKKOS/atom_vec_spin_kokkos.h
deleted file mode 100644
index d439424076..0000000000
--- a/src/KOKKOS/atom_vec_spin_kokkos.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-------------------------------------------------------------------------- */
-
-#ifdef ATOM_CLASS
-
-AtomStyle(spin/kk,AtomVecSpinKokkos)
-AtomStyle(spin/kk/device,AtomVecSpinKokkos)
-AtomStyle(spin/kk/host,AtomVecSpinKokkos)
-
-#else
-
-#ifndef LMP_ATOM_VEC_SPIN_KOKKOS_H
-#define LMP_ATOM_VEC_SPIN_KOKKOS_H
-
-#include "atom_vec_kokkos.h"
-#include "kokkos_type.h"
-
-namespace LAMMPS_NS {
-
-class AtomVecSpinKokkos : public AtomVecKokkos {
- public:
-  AtomVecSpinKokkos(class LAMMPS *);
-  void grow(int);
-  void copy(int, int, int);
-  int pack_border(int, int *, double *, int, int *);
-  int pack_border_vel(int, int *, double *, int, int *);
-  int pack_border_hybrid(int, int *, double *);
-  void unpack_border(int, int, double *);
-  void unpack_border_vel(int, int, double *);
-  int unpack_border_hybrid(int, int, double *);
-  int pack_exchange(int, double *);
-  int unpack_exchange(double *);
-  int size_restart();
-  int pack_restart(int, double *);
-  int unpack_restart(double *);
-  void create_atom(int, double *);
-  void data_atom(double *, imageint, char **);
-  int data_atom_hybrid(int, char **);
-  void pack_data(double **);
-  int pack_data_hybrid(int, double *);
-  void write_data(FILE *, int, double **);
-  int write_data_hybrid(FILE *, double *);
-  bigint memory_usage();
-  
-  // clear magnetic and mechanic forces
-
-  void force_clear(int, size_t);
-
-  void grow_reset();
-  // input lists to be checked
-  int pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist,
-                         DAT::tdual_xfloat_2d buf,int iswap,
-                         int pbc_flag, int *pbc, ExecutionSpace space);
-  void unpack_border_kokkos(const int &n, const int &nfirst,
-                            const DAT::tdual_xfloat_2d &buf,
-                            ExecutionSpace space);
-  int pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat_2d &buf,
-                           DAT::tdual_int_1d k_sendlist,
-                           DAT::tdual_int_1d k_copylist,
-                           ExecutionSpace space, int dim,
-                           X_FLOAT lo, X_FLOAT hi);
-  int unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv,
-                             int nlocal, int dim, X_FLOAT lo, X_FLOAT hi,
-                             ExecutionSpace space);
-
-  void sync(ExecutionSpace space, unsigned int mask);
-  void modified(ExecutionSpace space, unsigned int mask);
-  void sync_overlapping_device(ExecutionSpace space, unsigned int mask);
-
- protected:
-  tagint *tag;
-  int *type,*mask;
-  imageint *image;
-  double **x,**v,**f;           // lattice quantities
-
-                                // spin quantities
-  double **sp;                  // sp[i][0-2] direction of the spin i
-                                // sp[i][3] atomic magnetic moment of the spin i
-  double **fm;                  // fm[i][0-2] direction of magnetic precession
-  double **fm_long;             // storage of long-range spin prec. components
-
-  DAT::t_tagint_1d d_tag;
-  HAT::t_tagint_1d h_tag;
-
-  DAT::t_int_1d d_type, d_mask;
-  HAT::t_int_1d h_type, h_mask;
-
-  DAT::t_imageint_1d d_image;
-  HAT::t_imageint_1d h_image;
-
-  DAT::t_x_array d_x;
-  DAT::t_v_array d_v;
-  DAT::t_f_array d_f;
-
-  DAT::t_sp_array d_sp;
-  DAT::t_fm_array d_fm;
-  DAT::t_fm_long_array d_fm_long;
-
-  HAT::t_sp_array h_sp;
-  HAT::t_fm_array h_fm;
-  HAT::t_fm_long_array h_fm_long;
-
-  DAT::tdual_int_1d k_count;
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-E: Per-processor system is too big
-
-The number of owned atoms plus ghost atoms on a single
-processor must fit in 32-bit integer.
-
-E: Invalid atom type in Atoms section of data file
-
-Atom types must range from 1 to specified # of types.
-
-*/
diff --git a/src/KOKKOS/kokkos_type.h b/src/KOKKOS/kokkos_type.h
index 7a575ecf28..5930a9e207 100644
--- a/src/KOKKOS/kokkos_type.h
+++ b/src/KOKKOS/kokkos_type.h
@@ -758,39 +758,6 @@ typedef tdual_virial_array::t_dev_um t_virial_array_um;
 typedef tdual_virial_array::t_dev_const_um t_virial_array_const_um;
 typedef tdual_virial_array::t_dev_const_randomread t_virial_array_randomread;
 
-// Spin Types
-
-//3d SP_FLOAT array n*4
-#ifdef LMP_KOKKOS_NO_LEGACY
-typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutLeft, LMPDeviceType> tdual_sp_array;
-#else
-typedef Kokkos::DualView<X_FLOAT*[4], Kokkos::LayoutRight, LMPDeviceType> tdual_sp_array;
-#endif
-typedef tdual_sp_array::t_dev t_sp_array;
-typedef tdual_sp_array::t_dev_const t_sp_array_const;
-typedef tdual_sp_array::t_dev_um t_sp_array_um;
-typedef tdual_sp_array::t_dev_const_um t_sp_array_const_um;
-typedef tdual_sp_array::t_dev_const_randomread t_sp_array_randomread;
-
-//3d FM_FLOAT array n*3
-
-typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_array;
-typedef tdual_fm_array::t_dev t_fm_array;
-typedef tdual_fm_array::t_dev_const t_fm_array_const;
-typedef tdual_fm_array::t_dev_um t_fm_array_um;
-typedef tdual_fm_array::t_dev_const_um t_fm_array_const_um;
-typedef tdual_fm_array::t_dev_const_randomread t_fm_array_randomread;
-
-//3d FML_FLOAT array n*3
-
-typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_long_array;
-typedef tdual_fm_long_array::t_dev t_fm_long_array;
-typedef tdual_fm_long_array::t_dev_const t_fm_long_array_const;
-typedef tdual_fm_long_array::t_dev_um t_fm_long_array_um;
-typedef tdual_fm_long_array::t_dev_const_um t_fm_long_array_const_um;
-typedef tdual_fm_long_array::t_dev_const_randomread t_fm_long_array_randomread;
-
-
 //Energy Types
 //1d E_FLOAT array n
 
@@ -1027,33 +994,6 @@ typedef tdual_virial_array::t_host_um t_virial_array_um;
 typedef tdual_virial_array::t_host_const_um t_virial_array_const_um;
 typedef tdual_virial_array::t_host_const_randomread t_virial_array_randomread;
 
-// Spin types
-
-//2d X_FLOAT array n*3
-typedef Kokkos::DualView<X_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_sp_array;
-typedef tdual_sp_array::t_host t_sp_array;
-typedef tdual_sp_array::t_host_const t_sp_array_const;
-typedef tdual_sp_array::t_host_um t_sp_array_um;
-typedef tdual_sp_array::t_host_const_um t_sp_array_const_um;
-typedef tdual_sp_array::t_host_const_randomread t_sp_array_randomread;
-
-//2d F_FLOAT array n*3
-typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_array;
-//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array;
-typedef tdual_fm_array::t_host t_fm_array;
-typedef tdual_fm_array::t_host_const t_fm_array_const;
-typedef tdual_fm_array::t_host_um t_fm_array_um;
-typedef tdual_fm_array::t_host_const_um t_fm_array_const_um;
-typedef tdual_fm_array::t_host_const_randomread t_fm_array_randomread;
-
-//2d F_FLOAT array n*3
-typedef Kokkos::DualView<F_FLOAT*[3], Kokkos::LayoutRight, LMPDeviceType> tdual_fm_long_array;
-//typedef Kokkos::DualView<F_FLOAT*[3], LMPDeviceType::array_layout, LMPDeviceType> tdual_f_array;
-typedef tdual_fm_long_array::t_host t_fm_long_array;
-typedef tdual_fm_long_array::t_host_const t_fm_long_array_const;
-typedef tdual_fm_long_array::t_host_um t_fm_long_array_um;
-typedef tdual_fm_long_array::t_host_const_um t_fm_long_array_const_um;
-typedef tdual_fm_long_array::t_host_const_randomread t_fm_long_array_randomread;
 
 
 //Energy Types
diff --git a/src/SPIN/pair_spin_dipole_cut.cpp b/src/SPIN/pair_spin_dipole_cut.cpp
index 41bb1a7755..7eb81e7a03 100644
--- a/src/SPIN/pair_spin_dipole_cut.cpp
+++ b/src/SPIN/pair_spin_dipole_cut.cpp
@@ -233,36 +233,44 @@ void PairSpinDipoleCut::compute(int eflag, int vflag)
 
       local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
 
+      // compute dipolar interaction
+      
       if (rsq < local_cut2) {
         r2inv = 1.0/rsq;
         r3inv = r2inv*rinv;
 
         compute_dipolar(i,j,eij,fmi,spi,spj,r3inv);
-        if (lattice_flag) compute_dipolar_mech(i,j,eij,fi,spi,spj,r2inv);
-      }
+        
+        if (lattice_flag) 
+          compute_dipolar_mech(i,j,eij,fi,spi,spj,r2inv);
 
-      // force accumulation
+        if (eflag) {
+          if (rsq <= local_cut2) {
+            evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+            evdwl *= 0.5*hbar;
+            emag[i] += evdwl;
+          }
+        } else evdwl = 0.0;
 
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      if (eflag) {
-        if (rsq <= local_cut2) {
-          evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-          evdwl *= 0.5*hbar;
-          emag[i] += evdwl;
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
         }
-      } else evdwl = 0.0;
-
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
 
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
+      }
     }
   }
+  
+  if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ----------------------------------------------------------------------
@@ -391,7 +399,7 @@ void PairSpinDipoleCut::compute_dipolar_mech(int /* i */, int /* j */, double ei
   sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
 
   bij = sisj - 5.0*sieij*sjeij;
-  pre = 3.0*mub2mu0*gigjri4;
+  pre = 0.5*3.0*mub2mu0*gigjri4;
 
   fi[0] -= pre * (eij[0] * bij + (sjeij*spi[0] + sieij*spj[0]));
   fi[1] -= pre * (eij[1] * bij + (sjeij*spi[1] + sieij*spj[1]));
diff --git a/src/SPIN/pair_spin_dipole_long.cpp b/src/SPIN/pair_spin_dipole_long.cpp
index 85b3c1d7bb..e90a43d1cf 100644
--- a/src/SPIN/pair_spin_dipole_long.cpp
+++ b/src/SPIN/pair_spin_dipole_long.cpp
@@ -281,32 +281,37 @@ void PairSpinDipoleLong::compute(int eflag, int vflag)
         bij[3] = (5.0*bij[2] + pre3*expm2) * r2inv;
 
         compute_long(i,j,eij,bij,fmi,spi,spj);
-        compute_long_mech(i,j,eij,bij,fmi,spi,spj);
-      }
+        if (lattice_flag)
+          compute_long_mech(i,j,eij,bij,fmi,spi,spj);
 
-      // force accumulation
+        if (eflag) {
+          if (rsq <= local_cut2) {
+            evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+            evdwl *= 0.5*hbar;
+            emag[i] += evdwl;
+          }
+        } else evdwl = 0.0;
 
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      if (eflag) {
-        if (rsq <= local_cut2) {
-          evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-          evdwl *= 0.5*hbar;
-          emag[i] += evdwl;
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
         }
-      } else evdwl = 0.0;
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
 
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
 
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
-
+      }
     }
   }
+  
+  if (vflag_fdotr) virial_fdotr_compute();
 }
 
 /* ----------------------------------------------------------------------
@@ -373,7 +378,6 @@ void PairSpinDipoleLong::compute_single_pair(int ii, double fmi[3])
     spi[3] = sp[ii][3];
     jlist = firstneigh[ii];
     jnum = numneigh[ii];
-    //itype = type[i];
 
     for (jj = 0; jj < jnum; jj++) {
       j = jlist[jj];
@@ -459,7 +463,7 @@ void PairSpinDipoleLong::compute_long_mech(int /* i */, int /* j */, double eij[
   double g1,g2,g1b2_g2b3,gigj,pre;
 
   gigj = spi[3] * spj[3];
-  pre = gigj*mub2mu0;
+  pre = 0.5 * gigj*mub2mu0;
   sisj = spi[0]*spj[0] + spi[1]*spj[1] + spi[2]*spj[2];
   sieij = spi[0]*eij[0] + spi[1]*eij[1] + spi[2]*eij[2];
   sjeij = spj[0]*eij[0] + spj[1]*eij[1] + spj[2]*eij[2];
diff --git a/src/SPIN/pair_spin_dmi.cpp b/src/SPIN/pair_spin_dmi.cpp
index e2ddd708df..8d43a3a870 100644
--- a/src/SPIN/pair_spin_dmi.cpp
+++ b/src/SPIN/pair_spin_dmi.cpp
@@ -244,31 +244,36 @@ void PairSpinDmi::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_dmi(i,j,eij,fmi,spj);
-        if (lattice_flag) {
+        
+        if (lattice_flag)
           compute_dmi_mech(i,j,rsq,eij,fi,spi,spj);
-        }
+
+        if (eflag) {
+          evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+          evdwl *= 0.5*hbar;
+          emag[i] += evdwl;
+        } else evdwl = 0.0;
+        
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+          if (newton_pair || j < nlocal) {
+            f[j][0] -= fi[0];
+            f[j][1] -= fi[1];
+            f[j][2] -= fi[2];
+          }
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
+
+
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
       }
-
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      if (eflag) {
-        evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-        evdwl *= 0.5*hbar;
-        emag[i] += evdwl;
-      } else evdwl = 0.0;
-
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
-
 }
 
 /* ----------------------------------------------------------------------
@@ -405,9 +410,9 @@ void PairSpinDmi::compute_dmi_mech(int i, int j, double rsq, double /*eij*/[3],
   cdmy = (dmiz*csx - dmix*csz);
   cdmz = (dmix*csy - dmiy*csz);
 
-  fi[0] += irij*cdmx;
-  fi[1] += irij*cdmy;
-  fi[2] += irij*cdmz;
+  fi[0] += 0.5*irij*cdmx;
+  fi[1] += 0.5*irij*cdmy;
+  fi[2] += 0.5*irij*cdmz;
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 4c6c3936cf..36f3dbcf5e 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -22,19 +22,16 @@
 ------------------------------------------------------------------------- */
 
 #include "pair_spin_exchange_biquadratic.h"
-#include <mpi.h>
-#include <cmath>
-#include <cstring>
+
 #include "atom.h"
 #include "comm.h"
 #include "error.h"
-#include "fix.h"
 #include "force.h"
-#include "neigh_list.h"
 #include "memory.h"
-#include "modify.h"
-#include "update.h"
-#include "utils.h"
+#include "neigh_list.h"
+
+#include <cmath>
+#include <cstring>
 
 using namespace LAMMPS_NS;
 
@@ -76,7 +73,7 @@ void PairSpinExchangeBiquadratic::settings(int narg, char **arg)
 
   if (narg != 1) error->all(FLERR,"Illegal pair_style command");
 
-  cut_spin_exchange_global = force->numeric(FLERR,arg[0]);
+  cut_spin_exchange_global = utils::numeric(FLERR,arg[0],false,lmp);
 
   // reset cutoffs that have been explicitly set
 
@@ -106,19 +103,19 @@ void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
     error->all(FLERR,"Incorrect args for pair coefficients");
 
   int ilo,ihi,jlo,jhi;
-  force->bounds(FLERR,arg[0],atom->ntypes,ilo,ihi);
-  force->bounds(FLERR,arg[1],atom->ntypes,jlo,jhi);
+  utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error);
+  utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error);
 
   // get exchange arguments from input command
 
   int iarg = 10;
-  const double rc = force->numeric(FLERR,arg[3]);
-  const double j1 = force->numeric(FLERR,arg[4]);
-  const double j2 = force->numeric(FLERR,arg[5]);
-  const double j3 = force->numeric(FLERR,arg[6]);
-  const double k1 = force->numeric(FLERR,arg[7]);
-  const double k2 = force->numeric(FLERR,arg[8]);
-  const double k3 = force->numeric(FLERR,arg[9]);
+  const double rc = utils::numeric(FLERR,arg[3],false,lmp);
+  const double j1 = utils::numeric(FLERR,arg[4],false,lmp);
+  const double j2 = utils::numeric(FLERR,arg[5],false,lmp);
+  const double j3 = utils::numeric(FLERR,arg[6],false,lmp);
+  const double k1 = utils::numeric(FLERR,arg[7],false,lmp);
+  const double k2 = utils::numeric(FLERR,arg[8],false,lmp);
+  const double k3 = utils::numeric(FLERR,arg[9],false,lmp);
 
   // read energy offset flag if specified
 
diff --git a/src/SPIN/pair_spin_magelec.cpp b/src/SPIN/pair_spin_magelec.cpp
index 849590bad2..2a672416b9 100644
--- a/src/SPIN/pair_spin_magelec.cpp
+++ b/src/SPIN/pair_spin_magelec.cpp
@@ -237,31 +237,35 @@ void PairSpinMagelec::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_magelec(i,j,eij,fmi,spj);
-        if (lattice_flag) {
+        
+        if (lattice_flag)
           compute_magelec_mech(i,j,fi,spi,spj);
+
+        if (eflag) {
+          evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
+          evdwl *= 0.5*hbar;
+          emag[i] += evdwl;
+        } else evdwl = 0.0;
+        
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
         }
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
+
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
       }
-
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      if (eflag) {
-        evdwl -= (spi[0]*fmi[0] + spi[1]*fmi[1] + spi[2]*fmi[2]);
-        evdwl *= 0.5*hbar;
-        emag[i] += evdwl;
-      } else evdwl = 0.0;
-
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
-
 }
 
 /* ----------------------------------------------------------------------
@@ -400,9 +404,9 @@ void PairSpinMagelec::compute_magelec_mech(int i, int j, double fi[3], double sp
   meiy *= ME_mech[itype][jtype];
   meiz *= ME_mech[itype][jtype];
 
-  fi[0] += (meiy*vz - meiz*vy);
-  fi[1] += (meiz*vx - meix*vz);
-  fi[2] += (meix*vy - meiy*vx);
+  fi[0] += 0.5*(meiy*vz - meiz*vy);
+  fi[1] += 0.5*(meiz*vx - meix*vz);
+  fi[2] += 0.5*(meix*vy - meiy*vx);
 
 }
 
diff --git a/src/SPIN/pair_spin_neel.cpp b/src/SPIN/pair_spin_neel.cpp
index c09b5ac191..5c05bef525 100644
--- a/src/SPIN/pair_spin_neel.cpp
+++ b/src/SPIN/pair_spin_neel.cpp
@@ -246,31 +246,33 @@ void PairSpinNeel::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_neel(i,j,rsq,eij,fmi,spi,spj);
-        if (lattice_flag) {
+        if (lattice_flag)
           compute_neel_mech(i,j,rsq,eij,fi,spi,spj);
+
+        f[i][0] += fi[0];
+        f[i][1] += fi[1];
+        f[i][2] += fi[2];
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
         }
+        fm[i][0] += fmi[0];
+        fm[i][1] += fmi[1];
+        fm[i][2] += fmi[2];
+
+        if (eflag) {
+          evdwl -= compute_neel_energy(i,j,rsq,eij,spi,spj);
+          emag[i] += evdwl;
+        } else evdwl = 0.0;
+
+        if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
+            evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
       }
-
-      f[i][0] += fi[0];
-      f[i][1] += fi[1];
-      f[i][2] += fi[2];
-      fm[i][0] += fmi[0];
-      fm[i][1] += fmi[1];
-      fm[i][2] += fmi[2];
-
-      if (eflag) {
-        evdwl -= compute_neel_energy(i,j,rsq,eij,spi,spj);
-        // evdwl *= 0.5*hbar;
-        emag[i] += evdwl;
-      } else evdwl = 0.0;
-
-      if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
-          evdwl,ecoul,fi[0],fi[1],fi[2],rij[0],rij[1],rij[2]);
     }
   }
 
   if (vflag_fdotr) virial_fdotr_compute();
-
 }
 
 /* ----------------------------------------------------------------------
@@ -563,9 +565,9 @@ void PairSpinNeel::compute_neel_mech(int i, int j, double rsq, double eij[3], do
 
   // adding three contributions
 
-  fi[0] = pdx + pq1x + pq2x;
-  fi[1] = pdy + pq1y + pq2y;
-  fi[2] = pdz + pq1z + pq2z;
+  fi[0] = 0.5*(pdx + pq1x + pq2x);
+  fi[1] = 0.5*(pdy + pq1y + pq2y);
+  fi[2] = 0.5*(pdz + pq1z + pq2z);
 }
 
 /* ---------------------------------------------------------------------- */

From 2825abb0284a164b368d1dda18a62140c807b000 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 5 Oct 2020 17:13:54 -0600
Subject: [PATCH 13/64] Improved scripts for validation problems

---
 .../validation_damped_exchange/run-test-exchange.sh           | 4 ++--
 examples/SPIN/test_problems/validation_nve/run-test-nve.sh    | 2 +-
 examples/SPIN/test_problems/validation_nvt/plot_nvt.py        | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/SPIN/test_problems/validation_damped_exchange/run-test-exchange.sh b/examples/SPIN/test_problems/validation_damped_exchange/run-test-exchange.sh
index 599730fe7b..bd878a52de 100755
--- a/examples/SPIN/test_problems/validation_damped_exchange/run-test-exchange.sh
+++ b/examples/SPIN/test_problems/validation_damped_exchange/run-test-exchange.sh
@@ -13,7 +13,7 @@ en="$(echo "$en-$in" | bc -l)"
 tail -n +$in log.lammps | head -n $en > res_lammps.dat
 
 # compute Langevin
-python3 -m llg_exchange.py > res_llg.dat
+python3 llg_exchange.py > res_llg.dat
 
 # plot results
-python3 -m plot_precession.py res_lammps.dat res_llg.dat
+python3 plot_precession.py res_lammps.dat res_llg.dat
diff --git a/examples/SPIN/test_problems/validation_nve/run-test-nve.sh b/examples/SPIN/test_problems/validation_nve/run-test-nve.sh
index 441e7cf46d..18cedd9503 100755
--- a/examples/SPIN/test_problems/validation_nve/run-test-nve.sh
+++ b/examples/SPIN/test_problems/validation_nve/run-test-nve.sh
@@ -13,4 +13,4 @@ en="$(echo "$en-$in" | bc -l)"
 tail -n +$in log.lammps | head -n $en > res_lammps.dat
 
 # plot results
-python3 -m plot_nve.py res_lammps.dat res_llg.dat
+python3 plot_nve.py res_lammps.dat res_llg.dat
diff --git a/examples/SPIN/test_problems/validation_nvt/plot_nvt.py b/examples/SPIN/test_problems/validation_nvt/plot_nvt.py
index 06c48b4c28..4109d60245 100755
--- a/examples/SPIN/test_problems/validation_nvt/plot_nvt.py
+++ b/examples/SPIN/test_problems/validation_nvt/plot_nvt.py
@@ -39,5 +39,5 @@ plt.xlabel('Time (in ps)')
 plt.legend()
 plt.show()
 
-fig.savefig(os.path.join(os.getcwd(), "nve_spin_lattice.pdf"), bbox_inches="tight")
+fig.savefig(os.path.join(os.getcwd(), "nvt_spin_lattice.pdf"), bbox_inches="tight")
 plt.close(fig)

From 2d7494186c053e96d85440dd4892566072f9d90e Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 5 Oct 2020 19:37:24 -0600
Subject: [PATCH 14/64] rerun all validation tests (modified one)

---
 .../test-spin-precession.in                   | 18 ++++++++-------
 .../validation_damped_exchange/two_spins.data | 22 -------------------
 .../validation_nvt/in.spin.nvt_lattice        |  2 +-
 .../validation_nvt/in.spin.nvt_spin           |  2 +-
 src/SPIN/pair_spin_exchange.cpp               |  4 ----
 5 files changed, 12 insertions(+), 36 deletions(-)
 delete mode 100644 examples/SPIN/test_problems/validation_damped_exchange/two_spins.data

diff --git a/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in b/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in
index 0ca49364d2..86da20e6f9 100644
--- a/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in
+++ b/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in
@@ -5,22 +5,24 @@ atom_style      spin
 atom_modify     map array
 boundary        f f f 
 
-read_data	two_spins.data
+atom_modify 	map array 
+lattice 	sc 3.0
+region 		box block 0 2 0 1 0 1 
+create_box 	1 box
+create_atoms 	1 box
+
+mass		1 55.845
+set 		atom 1 spin 2.0 1.0 0.0 0.0
+set 		atom 2 spin 2.0 0.0 1.0 0.0
 
 pair_style      spin/exchange 3.1
 pair_coeff	* * exchange 3.1 11.254 0.0 1.0
 
-group bead      type 1  
- 
-variable        H equal 0.0
-variable        Kan equal 0.0
 variable        Temperature equal 0.0 
 variable        RUN equal 30000
 
 fix             1 all nve/spin lattice no
-fix             2 all precession/spin zeeman ${H} 0.0 0.0 1.0 anisotropy ${Kan} 0.0 0.0 1.0
-fix_modify      2 energy yes
-fix             3 all langevin/spin ${Temperature} 0.01 12345
+fix             2 all langevin/spin ${Temperature} 0.01 12345
 
 compute		out_mag    all spin
 compute		out_pe     all pe
diff --git a/examples/SPIN/test_problems/validation_damped_exchange/two_spins.data b/examples/SPIN/test_problems/validation_damped_exchange/two_spins.data
deleted file mode 100644
index 013f813751..0000000000
--- a/examples/SPIN/test_problems/validation_damped_exchange/two_spins.data
+++ /dev/null
@@ -1,22 +0,0 @@
-LAMMPS data file via write_data, version 19 Sep 2019, timestep = 0
-
-2 atoms
-1 atom types
-
-0.0 6.0 xlo xhi
-0.0 3.0 ylo yhi
-0.0 3.0 zlo zhi
-
-Masses
-
-1 1
-
-Atoms # spin
-
-1 1 2.0 0.0 0.0 0.0 1.0 0.0 0.0 0 0 0
-2 1 2.0 3.0 0.0 0.0 0.0 1.0 0.0 0 0 0
-
-Velocities
-
-1 0.0 0.0 0.0
-2 0.0 0.0 0.0
diff --git a/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_lattice b/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_lattice
index 1d63f01d43..2375c0ff8d 100644
--- a/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_lattice
+++ b/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_lattice
@@ -30,7 +30,7 @@ neighbor 	0.1 bin
 neigh_modify 	every 10 check yes delay 20
 
 fix 		1 all precession/spin zeeman 0.0 0.0 0.0 1.0
-fix             2 all langevin 200.0 200.0 10.0 48279
+fix             2 all langevin 200.0 200.0 1.0 48279
 fix 		3 all langevin/spin 0.0 0.00001 321
 fix 		4 all nve/spin lattice moving
 timestep	0.001
diff --git a/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_spin b/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_spin
index 435e877bdf..6b65df7109 100644
--- a/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_spin
+++ b/examples/SPIN/test_problems/validation_nvt/in.spin.nvt_spin
@@ -29,7 +29,7 @@ neighbor 	0.1 bin
 neigh_modify 	every 10 check yes delay 20
 
 fix 		1 all precession/spin zeeman 0.0 0.0 0.0 1.0
-fix 		2 all langevin/spin 200.0 0.1 321
+fix 		2 all langevin/spin 200.0 0.01 321
 fix 		3 all nve/spin lattice moving
 timestep	0.001
 
diff --git a/src/SPIN/pair_spin_exchange.cpp b/src/SPIN/pair_spin_exchange.cpp
index e6b6db375f..e35408e9ec 100644
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@@ -426,10 +426,6 @@ void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq,
   fi[0] -= 0.5*fx;
   fi[1] -= 0.5*fy;
   fi[2] -= 0.5*fz;
-  // fi[0] -= fx;
-  // fi[1] -= fy;
-  // fi[2] -= fz;
-
 }
 
 /* ----------------------------------------------------------------------

From 3147dd850c53be305c776b1b5ff76fce7c1b4b0f Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 5 Oct 2020 20:01:34 -0600
Subject: [PATCH 15/64] adding corrections to doc page

---
 doc/src/Commands_pair.rst      | 1 +
 doc/src/Packages_details.rst   | 2 ++
 doc/src/pair_spin_exchange.rst | 1 +
 doc/src/pair_style.rst         | 1 +
 4 files changed, 5 insertions(+)

diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst
index 888a445daa..4f3b164c98 100644
--- a/doc/src/Commands_pair.rst
+++ b/doc/src/Commands_pair.rst
@@ -240,6 +240,7 @@ OPT.
    * :doc:`spin/dipole/long <pair_spin_dipole>`
    * :doc:`spin/dmi <pair_spin_dmi>`
    * :doc:`spin/exchange <pair_spin_exchange>`
+   * :doc:`spin/exchange/biquadratic <pair_spin_exchange>`
    * :doc:`spin/magelec <pair_spin_magelec>`
    * :doc:`spin/neel <pair_spin_neel>`
    * :doc:`srp <pair_srp>`
diff --git a/doc/src/Packages_details.rst b/doc/src/Packages_details.rst
index 1beeeff5b4..d9e1e31470 100644
--- a/doc/src/Packages_details.rst
+++ b/doc/src/Packages_details.rst
@@ -1036,9 +1036,11 @@ the usual manner via MD.  Various pair, fix, and compute styles.
 * :doc:`pair_style spin/dipole/long <pair_spin_dipole>`
 * :doc:`pair_style spin/dmi <pair_spin_dmi>`
 * :doc:`pair_style spin/exchange <pair_spin_exchange>`
+* :doc:`pair_style spin/exchange/biquadratic <pair_spin_exchange>`
 * :doc:`pair_style spin/magelec <pair_spin_magelec>`
 * :doc:`pair_style spin/neel <pair_spin_neel>`
 * :doc:`fix nve/spin <fix_nve_spin>`
+* :doc:`fix langevin/spin <fix_langevin_spin>`
 * :doc:`fix precession/spin <fix_precession_spin>`
 * :doc:`compute spin <compute_spin>`
 * :doc:`neb/spin <neb_spin>`
diff --git a/doc/src/pair_spin_exchange.rst b/doc/src/pair_spin_exchange.rst
index 85cf6d3aa8..38e59eed19 100644
--- a/doc/src/pair_spin_exchange.rst
+++ b/doc/src/pair_spin_exchange.rst
@@ -1,4 +1,5 @@
 .. index:: pair_style spin/exchange
+.. index:: pair_style spin/exchange/biquadratic
 
 pair_style spin/exchange command
 ================================
diff --git a/doc/src/pair_style.rst b/doc/src/pair_style.rst
index 4feaeacad0..2a6c81c0f8 100644
--- a/doc/src/pair_style.rst
+++ b/doc/src/pair_style.rst
@@ -304,6 +304,7 @@ accelerated styles exist.
 * :doc:`spin/dipole/long <pair_spin_dipole>` -
 * :doc:`spin/dmi <pair_spin_dmi>` -
 * :doc:`spin/exchange <pair_spin_exchange>` -
+* :doc:`spin/exchange/biquadratic <pair_spin_exchange>` -
 * :doc:`spin/magelec <pair_spin_magelec>` -
 * :doc:`spin/neel <pair_spin_neel>` -
 * :doc:`srp <pair_srp>` -

From 4baf60ffd1393e063b9332c3c0cfca94d986da7a Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 5 Oct 2020 20:47:07 -0600
Subject: [PATCH 16/64] adding examples of the biquadratic pair_style and
 offset option

---
 examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp | 2 +-
 examples/SPIN/iron/in.spin.iron             | 2 +-
 examples/SPIN/iron/in.spin.iron_cubic       | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp b/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
index 2bfa8393f3..6429cec349 100644
--- a/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
+++ b/examples/SPIN/cobalt_hcp/in.spin.cobalt_hcp
@@ -26,7 +26,7 @@ velocity 	all create 100 4928459 rot yes dist gaussian
 #pair_style 	hybrid/overlay eam/alloy spin/exchange 4.0 spin/neel 4.0
 pair_style 	hybrid/overlay eam/alloy spin/exchange 4.0
 pair_coeff 	* * eam/alloy Co_PurjaPun_2012.eam.alloy Co
-pair_coeff 	* * spin/exchange exchange 4.0 -0.3593 1.135028015e-05 1.064568567
+pair_coeff 	* * spin/exchange exchange 4.0 -0.3593 1.135028015e-05 1.0645 offset yes
 #pair_coeff 	* * spin/neel neel 4.0 0.0048 0.234 1.168 2.6905 0.705 0.652  
 
 neighbor 	0.1 bin
diff --git a/examples/SPIN/iron/in.spin.iron b/examples/SPIN/iron/in.spin.iron
index 58c0537af7..f678d39f56 100644
--- a/examples/SPIN/iron/in.spin.iron
+++ b/examples/SPIN/iron/in.spin.iron
@@ -25,7 +25,7 @@ velocity 	all create 100 4928459 rot yes dist gaussian
 
 pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5
 pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
-pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841 offset yes
 
 neighbor 	0.1 bin
 neigh_modify 	every 10 check yes delay 20
diff --git a/examples/SPIN/iron/in.spin.iron_cubic b/examples/SPIN/iron/in.spin.iron_cubic
index 30a3e0e97c..35011e796f 100644
--- a/examples/SPIN/iron/in.spin.iron_cubic
+++ b/examples/SPIN/iron/in.spin.iron_cubic
@@ -21,9 +21,9 @@ mass		1 55.845
 set 		group all spin 2.2 -1.0 0.0 0.0
 velocity 	all create 100 4928459 rot yes dist gaussian
 
-pair_style 	hybrid/overlay eam/alloy spin/exchange 3.5
+pair_style 	hybrid/overlay eam/alloy spin/exchange/biquadratic 3.5
 pair_coeff 	* * eam/alloy Fe_Mishin2006.eam.alloy Fe
-pair_coeff 	* * spin/exchange exchange 3.4 0.02726 0.2171 1.841
+pair_coeff 	* * spin/exchange/biquadratic biquadratic 3.4 0.02726 0.2171 1.841 0.0 0.0 2.0 offset yes
 neighbor 	0.1 bin
 neigh_modify 	every 10 check yes delay 20
 

From e3b8563ed9785455a7211e9933cd5daa6ff88d7c Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 5 Oct 2020 21:28:26 -0600
Subject: [PATCH 17/64] correcting spelling errors

---
 doc/src/pair_spin_exchange.rst              | 2 +-
 doc/utils/sphinx-config/false_positives.txt | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/src/pair_spin_exchange.rst b/doc/src/pair_spin_exchange.rst
index 38e59eed19..72c416ac72 100644
--- a/doc/src/pair_spin_exchange.rst
+++ b/doc/src/pair_spin_exchange.rst
@@ -132,7 +132,7 @@ for the *spin/exchange/biquadratic* pair style.
 Note that :math:`R_c` is the radius cutoff of the considered exchange 
 interaction, and :math:`a`, :math:`b` and :math:`d` are the three coefficients 
 performing the parameterization of the function :math:`J(r_{ij})` defined 
-above (in the *biquadratic* ase, :math:`a_j`, :math:`b_j`, :math:`d_j` and 
+above (in the *biquadratic* style, :math:`a_j`, :math:`b_j`, :math:`d_j` and 
 :math:`a_k`, :math:`b_k`, :math:`d_k` are the coefficients of :math:`J(r_{ij})`
 and :math:`K(r_{ij})` respectively).
 
diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index b276933a88..a06f72fde5 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -240,6 +240,7 @@ bigint
 Bij
 bilayer
 bilayers
+biquadratic
 binsize
 binstyle
 binutils

From 73b2ad0acce681b5203ffe8c67d7f8f3a906ee26 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 12 Oct 2020 11:38:52 -0600
Subject: [PATCH 18/64] - slight modifications to the damped exchange example

---
 .../validation_damped_exchange/llg_exchange.py | 18 +++++++++++++++++-
 .../test-spin-precession.in                    |  5 ++++-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py b/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py
index dd1c543bb3..5b93ac5c2d 100755
--- a/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py
+++ b/examples/SPIN/test_problems/validation_damped_exchange/llg_exchange.py
@@ -6,9 +6,17 @@ import matplotlib.pyplot as plt
 import mpmath as mp
 
 hbar=0.658212           # Planck's constant (eV.fs/rad)
-J0=0.05                 # per-neighbor exchange interaction (eV)
+J0=0.05                 # per-neighbor exchange interaction (eV); still read by calc_rot_vector
+
+# exchange parameters; values match pair_coeff in test-spin-precession.in
+J1 = 11.254 # in eV
+J2 = 0.0    # dimensionless
+J3 = 1.0    # in Angstrom
+
+# initial spins
 S1 = np.array([1.0, 0.0, 0.0])
 S2 = np.array([0.0, 1.0, 0.0])
+
 alpha=0.01              # damping coefficient
 pi=math.pi
 
@@ -30,6 +38,14 @@ def rotation_matrix(axis, theta):
       [2 * (bc - ad), aa + cc - bb - dd, 2 * (cd + ab)],
       [2 * (bd + ac), 2 * (cd - ab), aa + dd - bb - cc]])
 
+# Bethe-Slater function: 4*a*u*(1 - b*u)*exp(-u) with u = (x/c)**2
+def func_BS(x,a,b,c):
+    return 4*a*((x/c)**2)*(1-b*(x/c)**2)*np.exp(-(x/c)**2)
+
+# d/dx of the Bethe-Slater function: 8*a*(x/c**2)*(1 - u - b*u*(2 - u))*exp(-u), u = (x/c)**2
+def func_dBS(x,a,b,c):
+    return 8*a*(x/c**2)*(1-(x/c)**2-b*((x/c)**2)*(2-(x/c)**2))*np.exp(-(x/c)**2)
+
 # calculating precession field of spin Sr
 def calc_rot_vector(Sr,Sf):
   rot = (J0/hbar)*(Sf-alpha*np.cross(Sf,Sr))/(1.0+alpha**2)
diff --git a/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in b/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in
index 86da20e6f9..9dfb4a98d6 100644
--- a/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in
+++ b/examples/SPIN/test_problems/validation_damped_exchange/test-spin-precession.in
@@ -21,7 +21,7 @@ pair_coeff	* * exchange 3.1 11.254 0.0 1.0
 variable        Temperature equal 0.0 
 variable        RUN equal 30000
 
-fix             1 all nve/spin lattice no
+fix             1 all nve/spin lattice frozen
 fix             2 all langevin/spin ${Temperature} 0.01 12345
 
 compute		out_mag    all spin
@@ -36,6 +36,9 @@ variable	emag      equal c_out_mag[5]
 thermo_style    custom step time v_magx v_magy v_magz v_emag pe etotal
 thermo          10
 
+compute 	outsp all property/atom spx spy spz sp fmx fmy fmz
+dump 		1 all custom 10 dump.data type x y z c_outsp[1] c_outsp[2] c_outsp[3] fx fy fz
+
 timestep	0.0001
 
 run             ${RUN}

From 5159d255a74bffef78aee32ec5d05c514618a26e Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 26 Oct 2020 11:02:09 -0400
Subject: [PATCH 19/64] update bundled fmtlib to version 7.1.0

---
 src/fmt/chrono.h                           |   79 +-
 src/fmt/color.h                            |   56 +-
 src/fmt/compile.h                          |   75 +-
 src/fmt/core.h                             |  432 +++--
 src/fmt/format-inl.h                       | 1890 +++++++++++++++++---
 src/fmt/format.h                           | 1137 +++++++-----
 src/fmt/locale.h                           |   40 +-
 src/fmt/os.h                               |  106 +-
 src/fmt/ostream.h                          |   28 +-
 src/fmt/printf.h                           |    4 +-
 src/fmt/ranges.h                           |   19 +-
 src/fmtlib_format.cpp                      |    4 +-
 src/fmtlib_os.cpp                          |   17 +-
 unittest/force-styles/test_error_stats.cpp |    2 +-
 14 files changed, 2897 insertions(+), 992 deletions(-)

diff --git a/src/fmt/chrono.h b/src/fmt/chrono.h
index e70b8053a6..1a3b8d5e5c 100644
--- a/src/fmt/chrono.h
+++ b/src/fmt/chrono.h
@@ -72,43 +72,27 @@ FMT_CONSTEXPR To lossless_integral_conversion(const From from, int& ec) {
   static_assert(F::is_integer, "From must be integral");
   static_assert(T::is_integer, "To must be integral");
 
-  if (F::is_signed && !T::is_signed) {
+  if (detail::const_check(F::is_signed && !T::is_signed)) {
     // From may be negative, not allowed!
     if (fmt::detail::is_negative(from)) {
       ec = 1;
       return {};
     }
-
     // From is positive. Can it always fit in To?
-    if (F::digits <= T::digits) {
-      // yes, From always fits in To.
-    } else {
-      // from may not fit in To, we have to do a dynamic check
-      if (from > static_cast<From>((T::max)())) {
-        ec = 1;
-        return {};
-      }
+    if (F::digits > T::digits &&
+        from > static_cast<From>(detail::max_value<To>())) {
+      ec = 1;
+      return {};
     }
   }
 
-  if (!F::is_signed && T::is_signed) {
-    // can from be held in To?
-    if (F::digits < T::digits) {
-      // yes, From always fits in To.
-    } else {
-      // from may not fit in To, we have to do a dynamic check
-      if (from > static_cast<From>((T::max)())) {
-        // outside range.
-        ec = 1;
-        return {};
-      }
-    }
+  if (!F::is_signed && T::is_signed && F::digits >= T::digits &&
+      from > static_cast<From>(detail::max_value<To>())) {
+    ec = 1;
+    return {};
   }
-
-  // reaching here means all is ok for lossless conversion.
-  return static_cast<To>(from);
-
-}  // function
+  return static_cast<To>(from);  // Lossless conversion.
+}
 
 template <typename To, typename From,
           FMT_ENABLE_IF(std::is_same<From, To>::value)>
@@ -190,11 +174,9 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
   // safe conversion to IntermediateRep
   IntermediateRep count =
       lossless_integral_conversion<IntermediateRep>(from.count(), ec);
-  if (ec) {
-    return {};
-  }
+  if (ec) return {};
   // multiply with Factor::num without overflow or underflow
-  if (Factor::num != 1) {
+  if (detail::const_check(Factor::num != 1)) {
     const auto max1 = detail::max_value<IntermediateRep>() / Factor::num;
     if (count > max1) {
       ec = 1;
@@ -209,17 +191,9 @@ To safe_duration_cast(std::chrono::duration<FromRep, FromPeriod> from,
     count *= Factor::num;
   }
 
-  // this can't go wrong, right? den>0 is checked earlier.
-  if (Factor::den != 1) {
-    count /= Factor::den;
-  }
-  // convert to the to type, safely
-  using ToRep = typename To::rep;
-  const ToRep tocount = lossless_integral_conversion<ToRep>(count, ec);
-  if (ec) {
-    return {};
-  }
-  return To{tocount};
+  if (detail::const_check(Factor::den != 1)) count /= Factor::den;
+  auto tocount = lossless_integral_conversion<typename To::rep>(count, ec);
+  return ec ? To() : To(tocount);
 }
 
 /**
@@ -351,6 +325,11 @@ inline std::tm localtime(std::time_t time) {
   return lt.tm_;
 }
 
+inline std::tm localtime(
+    std::chrono::time_point<std::chrono::system_clock> time_point) {
+  return localtime(std::chrono::system_clock::to_time_t(time_point));
+}
+
 // Thread-safe replacement for std::gmtime
 inline std::tm gmtime(std::time_t time) {
   struct dispatcher {
@@ -387,6 +366,11 @@ inline std::tm gmtime(std::time_t time) {
   return gt.tm_;
 }
 
+inline std::tm gmtime(
+    std::chrono::time_point<std::chrono::system_clock> time_point) {
+  return gmtime(std::chrono::system_clock::to_time_t(time_point));
+}
+
 namespace detail {
 inline size_t strftime(char* str, size_t count, const char* format,
                        const std::tm* time) {
@@ -399,6 +383,17 @@ inline size_t strftime(wchar_t* str, size_t count, const wchar_t* format,
 }
 }  // namespace detail
 
+template <typename Char>
+struct formatter<std::chrono::time_point<std::chrono::system_clock>, Char>
+    : formatter<std::tm, Char> {
+  template <typename FormatContext>
+  auto format(std::chrono::time_point<std::chrono::system_clock> val,
+              FormatContext& ctx) -> decltype(ctx.out()) {
+    std::tm time = localtime(val);
+    return formatter<std::tm, Char>::format(time, ctx);
+  }
+};
+
 template <typename Char> struct formatter<std::tm, Char> {
   template <typename ParseContext>
   auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
diff --git a/src/fmt/color.h b/src/fmt/color.h
index b65f892afc..7891058950 100644
--- a/src/fmt/color.h
+++ b/src/fmt/color.h
@@ -463,16 +463,16 @@ template <> inline void reset_color<wchar_t>(FILE* stream) FMT_NOEXCEPT {
 }
 
 template <typename Char>
-inline void reset_color(basic_memory_buffer<Char>& buffer) FMT_NOEXCEPT {
+inline void reset_color(buffer<Char>& buffer) FMT_NOEXCEPT {
   const char* begin = data::reset_color;
   const char* end = begin + sizeof(data::reset_color) - 1;
   buffer.append(begin, end);
 }
 
 template <typename Char>
-void vformat_to(basic_memory_buffer<Char>& buf, const text_style& ts,
+void vformat_to(buffer<Char>& buf, const text_style& ts,
                 basic_string_view<Char> format_str,
-                basic_format_args<buffer_context<Char>> args) {
+                basic_format_args<buffer_context<type_identity_t<Char>>> args) {
   bool has_style = false;
   if (ts.has_emphasis()) {
     has_style = true;
@@ -496,7 +496,7 @@ void vformat_to(basic_memory_buffer<Char>& buf, const text_style& ts,
 
 template <typename S, typename Char = char_t<S>>
 void vprint(std::FILE* f, const text_style& ts, const S& format,
-            basic_format_args<buffer_context<Char>> args) {
+            basic_format_args<buffer_context<type_identity_t<Char>>> args) {
   basic_memory_buffer<Char> buf;
   detail::vformat_to(buf, ts, to_string_view(format), args);
   buf.push_back(Char(0));
@@ -504,20 +504,22 @@ void vprint(std::FILE* f, const text_style& ts, const S& format,
 }
 
 /**
+  \rst
   Formats a string and prints it to the specified file stream using ANSI
   escape sequences to specify text formatting.
-  Example:
+
+  **Example**::
+
     fmt::print(fmt::emphasis::bold | fg(fmt::color::red),
                "Elapsed time: {0:.2f} seconds", 1.23);
+  \endrst
  */
 template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_string<S>::value)>
 void print(std::FILE* f, const text_style& ts, const S& format_str,
            const Args&... args) {
-  detail::check_format_string<Args...>(format_str);
-  using context = buffer_context<char_t<S>>;
-  format_arg_store<context, Args...> as{args...};
-  vprint(f, ts, format_str, basic_format_args<context>(as));
+  vprint(f, ts, format_str,
+         fmt::make_args_checked<Args...>(format_str, args...));
 }
 
 /**
@@ -558,7 +560,41 @@ template <typename S, typename... Args, typename Char = char_t<S>>
 inline std::basic_string<Char> format(const text_style& ts, const S& format_str,
                                       const Args&... args) {
   return vformat(ts, to_string_view(format_str),
-                 detail::make_args_checked<Args...>(format_str, args...));
+                 fmt::make_args_checked<Args...>(format_str, args...));
+}
+
+/**
+  Formats a string with the given text_style and writes the output to ``out``.
+ */
+template <typename OutputIt, typename Char,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
+OutputIt vformat_to(
+    OutputIt out, const text_style& ts, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
+  detail::vformat_to(buf, ts, format_str, args);
+  return detail::get_iterator(buf);
+}
+
+/**
+  \rst
+  Formats arguments with the given text_style, writes the result to the output
+  iterator ``out`` and returns the iterator past the end of the output range.
+
+  **Example**::
+
+    std::vector<char> out;
+    fmt::format_to(std::back_inserter(out),
+                   fmt::emphasis::bold | fg(fmt::color::red), "{}", 42);
+  \endrst
+*/
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char_t<S>>::value&&
+                            detail::is_string<S>::value)>
+inline OutputIt format_to(OutputIt out, const text_style& ts,
+                          const S& format_str, Args&&... args) {
+  return vformat_to(out, ts, to_string_view(format_str),
+                    fmt::make_args_checked<Args...>(format_str, args...));
 }
 
 FMT_END_NAMESPACE
diff --git a/src/fmt/compile.h b/src/fmt/compile.h
index d7e6449ebb..7db610d90f 100644
--- a/src/fmt/compile.h
+++ b/src/fmt/compile.h
@@ -368,7 +368,8 @@ template <typename... Args> struct type_list {};
 
 // Returns a reference to the argument at index N from [first, rest...].
 template <int N, typename T, typename... Args>
-constexpr const auto& get(const T& first, const Args&... rest) {
+constexpr const auto& get([[maybe_unused]] const T& first,
+                          [[maybe_unused]] const Args&... rest) {
   static_assert(N < 1 + sizeof...(Args), "index is out of bounds");
   if constexpr (N == 0)
     return first;
@@ -406,6 +407,19 @@ constexpr text<Char> make_text(basic_string_view<Char> s, size_t pos,
   return {{&s[pos], size}};
 }
 
+template <typename Char> struct code_unit {
+  Char value;
+  using char_type = Char;
+
+  template <typename OutputIt, typename... Args>
+  OutputIt format(OutputIt out, const Args&...) const {
+    return write<Char>(out, value);
+  }
+};
+
+template <typename Char>
+struct is_compiled_format<code_unit<Char>> : std::true_type {};
+
 // A replacement field that refers to argument N.
 template <typename Char, typename T, int N> struct field {
   using char_type = Char;
@@ -430,7 +444,9 @@ template <typename Char, typename T, int N> struct spec_field {
   OutputIt format(OutputIt out, const Args&... args) const {
     // This ensures that the argument type is convertile to `const T&`.
     const T& arg = get<N>(args...);
-    basic_format_context<OutputIt, Char> ctx(out, {});
+    const auto& vargs =
+        make_format_args<basic_format_context<OutputIt, Char>>(args...);
+    basic_format_context<OutputIt, Char> ctx(out, vargs);
     return fmt.format(arg, ctx);
   }
 };
@@ -489,16 +505,17 @@ constexpr auto parse_tail(T head, S format_str) {
 template <typename T, typename Char> struct parse_specs_result {
   formatter<T, Char> fmt;
   size_t end;
+  int next_arg_id;
 };
 
 template <typename T, typename Char>
 constexpr parse_specs_result<T, Char> parse_specs(basic_string_view<Char> str,
-                                                  size_t pos) {
+                                                  size_t pos, int arg_id) {
   str.remove_prefix(pos);
-  auto ctx = basic_format_parse_context<Char>(str);
+  auto ctx = basic_format_parse_context<Char>(str, {}, arg_id + 1);
   auto f = formatter<T, Char>();
   auto end = f.parse(ctx);
-  return {f, pos + (end - str.data()) + 1};
+  return {f, pos + (end - str.data()) + 1, ctx.next_arg_id()};
 }
 
 // Compiles a non-empty format string and returns the compiled representation
@@ -518,8 +535,8 @@ constexpr auto compile_format_string(S format_str) {
                                                format_str);
     } else if constexpr (str[POS + 1] == ':') {
       using type = get_type<ID, Args>;
-      constexpr auto result = parse_specs<type>(str, POS + 2);
-      return parse_tail<Args, result.end, ID + 1>(
+      constexpr auto result = parse_specs<type>(str, POS + 2, ID);
+      return parse_tail<Args, result.end, result.next_arg_id>(
           spec_field<char_type, type, ID>{result.fmt}, format_str);
     } else {
       return unknown_format();
@@ -530,8 +547,13 @@ constexpr auto compile_format_string(S format_str) {
     return parse_tail<Args, POS + 2, ID>(make_text(str, POS, 1), format_str);
   } else {
     constexpr auto end = parse_text(str, POS + 1);
-    return parse_tail<Args, end, ID>(make_text(str, POS, end - POS),
-                                     format_str);
+    if constexpr (end - POS > 1) {
+      return parse_tail<Args, end, ID>(make_text(str, POS, end - POS),
+                                       format_str);
+    } else {
+      return parse_tail<Args, end, ID>(code_unit<char_type>{str[POS]},
+                                       format_str);
+    }
   }
 }
 
@@ -587,8 +609,7 @@ template <typename CompiledFormat, typename... Args,
 FMT_INLINE std::basic_string<Char> format(const CompiledFormat& cf,
                                           const Args&... args) {
   basic_memory_buffer<Char> buffer;
-  detail::buffer<Char>& base = buffer;
-  cf.format(std::back_inserter(base), args...);
+  cf.format(detail::buffer_appender<Char>(buffer), args...);
   return to_string(buffer);
 }
 
@@ -608,8 +629,7 @@ template <typename CompiledFormat, typename... Args,
 std::basic_string<Char> format(const CompiledFormat& cf, const Args&... args) {
   basic_memory_buffer<Char> buffer;
   using context = buffer_context<Char>;
-  detail::buffer<Char>& base = buffer;
-  detail::cf::vformat_to<context>(std::back_inserter(base), cf,
+  detail::cf::vformat_to<context>(detail::buffer_appender<Char>(buffer), cf,
                                   make_format_args<context>(args...));
   return to_string(buffer);
 }
@@ -618,9 +638,13 @@ template <typename S, typename... Args,
           FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
 FMT_INLINE std::basic_string<typename S::char_type> format(const S&,
                                                            Args&&... args) {
-  constexpr basic_string_view<typename S::char_type> str = S();
-  if (str.size() == 2 && str[0] == '{' && str[1] == '}')
-    return fmt::to_string(detail::first(args...));
+#ifdef __cpp_if_constexpr
+  if constexpr (std::is_same<typename S::char_type, char>::value) {
+    constexpr basic_string_view<typename S::char_type> str = S();
+    if (str.size() == 2 && str[0] == '{' && str[1] == '}')
+      return fmt::to_string(detail::first(args...));
+  }
+#endif
   constexpr auto compiled = detail::compile<Args...>(S());
   return format(compiled, std::forward<Args>(args)...);
 }
@@ -643,10 +667,11 @@ OutputIt format_to(OutputIt out, const S&, const Args&... args) {
   return format_to(out, compiled, args...);
 }
 
-template <
-    typename OutputIt, typename CompiledFormat, typename... Args,
-    FMT_ENABLE_IF(detail::is_output_iterator<OutputIt>::value&& std::is_base_of<
-                  detail::basic_compiled_format, CompiledFormat>::value)>
+template <typename OutputIt, typename CompiledFormat, typename... Args,
+          FMT_ENABLE_IF(detail::is_output_iterator<
+                        OutputIt, typename CompiledFormat::char_type>::value&&
+                            std::is_base_of<detail::basic_compiled_format,
+                                            CompiledFormat>::value)>
 format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
                                          const CompiledFormat& cf,
                                          const Args&... args) {
@@ -655,6 +680,16 @@ format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
   return {it.base(), it.count()};
 }
 
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_compiled_string<S>::value)>
+format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n, const S&,
+                                         const Args&... args) {
+  constexpr auto compiled = detail::compile<Args...>(S());
+  auto it = format_to(detail::truncating_iterator<OutputIt>(out, n), compiled,
+                      args...);
+  return {it.base(), it.count()};
+}
+
 template <typename CompiledFormat, typename... Args>
 size_t formatted_size(const CompiledFormat& cf, const Args&... args) {
   return format_to(detail::counting_iterator(), cf, args...).count();
diff --git a/src/fmt/core.h b/src/fmt/core.h
index 6d87ab290a..317292288d 100644
--- a/src/fmt/core.h
+++ b/src/fmt/core.h
@@ -18,7 +18,7 @@
 #include <vector>
 
 // The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 70003
+#define FMT_VERSION 70100
 
 #ifdef __clang__
 #  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
@@ -57,6 +57,7 @@
 #  define FMT_MSC_VER 0
 #  define FMT_SUPPRESS_MSC_WARNING(n)
 #endif
+
 #ifdef __has_feature
 #  define FMT_HAS_FEATURE(x) __has_feature(x)
 #else
@@ -64,7 +65,7 @@
 #endif
 
 #if defined(__has_include) && !defined(__INTELLISENSE__) && \
-    !(FMT_ICC_VERSION && FMT_ICC_VERSION < 1600)
+    (!FMT_ICC_VERSION || FMT_ICC_VERSION >= 1600)
 #  define FMT_HAS_INCLUDE(x) __has_include(x)
 #else
 #  define FMT_HAS_INCLUDE(x) 0
@@ -99,7 +100,7 @@
 #endif
 
 #ifndef FMT_OVERRIDE
-#  if FMT_HAS_FEATURE(cxx_override) || \
+#  if FMT_HAS_FEATURE(cxx_override_control) || \
       (FMT_GCC_VERSION >= 408 && FMT_HAS_GXX_CXX11) || FMT_MSC_VER >= 1900
 #    define FMT_OVERRIDE override
 #  else
@@ -152,7 +153,7 @@
 #  if FMT_HAS_CPP14_ATTRIBUTE(deprecated) || FMT_MSC_VER >= 1900
 #    define FMT_DEPRECATED [[deprecated]]
 #  else
-#    if defined(__GNUC__) || defined(__clang__)
+#    if (defined(__GNUC__) && !defined(__LCC__)) || defined(__clang__)
 #      define FMT_DEPRECATED __attribute__((deprecated))
 #    elif FMT_MSC_VER
 #      define FMT_DEPRECATED __declspec(deprecated)
@@ -177,6 +178,15 @@
 #  endif
 #endif
 
+#ifndef FMT_USE_INLINE_NAMESPACES
+#  if FMT_HAS_FEATURE(cxx_inline_namespaces) || FMT_GCC_VERSION >= 404 || \
+      (FMT_MSC_VER >= 1900 && !_MANAGED)
+#    define FMT_USE_INLINE_NAMESPACES 1
+#  else
+#    define FMT_USE_INLINE_NAMESPACES 0
+#  endif
+#endif
+
 // LAMMPS customization
 // use 'v7_lmp' namespace instead of 'v7' so that our
 // bundled copy does not collide with linking other code
@@ -184,8 +194,7 @@
 // a different version.
 
 #ifndef FMT_BEGIN_NAMESPACE
-#  if FMT_HAS_FEATURE(cxx_inline_namespaces) || FMT_GCC_VERSION >= 404 || \
-      FMT_MSC_VER >= 1900
+#  if FMT_USE_INLINE_NAMESPACES
 #    define FMT_INLINE_NAMESPACE inline namespace
 #    define FMT_END_NAMESPACE \
       }                       \
@@ -275,8 +284,7 @@ struct monostate {};
 
 namespace detail {
 
-// A helper function to suppress bogus "conditional expression is constant"
-// warnings.
+// A helper function to suppress "conditional expression is constant" warnings.
 template <typename T> constexpr T const_check(T value) { return value; }
 
 FMT_NORETURN FMT_API void assert_fail(const char* file, int line,
@@ -305,7 +313,8 @@ template <typename T> struct std_string_view {};
 
 #ifdef FMT_USE_INT128
 // Do nothing.
-#elif defined(__SIZEOF_INT128__) && !FMT_NVCC && !(FMT_CLANG_VERSION && FMT_MSC_VER)
+#elif defined(__SIZEOF_INT128__) && !FMT_NVCC && \
+    !(FMT_CLANG_VERSION && FMT_MSC_VER)
 #  define FMT_USE_INT128 1
 using int128_t = __int128_t;
 using uint128_t = __uint128_t;
@@ -514,6 +523,18 @@ template <typename S> struct char_t_impl<S, enable_if_t<is_string<S>::value>> {
   using type = typename result::value_type;
 };
 
+// Reports a compile-time error if S is not a valid format string.
+template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
+FMT_INLINE void check_format_string(const S&) {
+#ifdef FMT_ENFORCE_COMPILE_STRING
+  static_assert(is_compile_string<S>::value,
+                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
+                "FMT_STRING.");
+#endif
+}
+template <typename..., typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
+void check_format_string(S);
+
 struct error_handler {
   constexpr error_handler() = default;
   constexpr error_handler(const error_handler&) = default;
@@ -553,8 +574,9 @@ class basic_format_parse_context : private ErrorHandler {
   using iterator = typename basic_string_view<Char>::iterator;
 
   explicit constexpr basic_format_parse_context(
-      basic_string_view<Char> format_str, ErrorHandler eh = {})
-      : ErrorHandler(eh), format_str_(format_str), next_arg_id_(0) {}
+      basic_string_view<Char> format_str, ErrorHandler eh = {},
+      int next_arg_id = 0)
+      : ErrorHandler(eh), format_str_(format_str), next_arg_id_(next_arg_id) {}
 
   /**
     Returns an iterator to the beginning of the format string range being
@@ -624,8 +646,24 @@ template <typename T, typename Context>
 using has_formatter =
     std::is_constructible<typename Context::template formatter_type<T>>;
 
+// Checks whether T is a container with contiguous storage.
+template <typename T> struct is_contiguous : std::false_type {};
+template <typename Char>
+struct is_contiguous<std::basic_string<Char>> : std::true_type {};
+
 namespace detail {
 
+// Extracts a reference to the container from back_insert_iterator.
+template <typename Container>
+inline Container& get_container(std::back_insert_iterator<Container> it) {
+  using bi_iterator = std::back_insert_iterator<Container>;
+  struct accessor : bi_iterator {
+    accessor(bi_iterator iter) : bi_iterator(iter) {}
+    using bi_iterator::container;
+  };
+  return *accessor(it).container;
+}
+
 /**
   \rst
   A contiguous memory buffer with an optional growing ability. It is an internal
@@ -648,6 +686,8 @@ template <typename T> class buffer {
         size_(sz),
         capacity_(cap) {}
 
+  ~buffer() = default;
+
   /** Sets the buffer data and capacity. */
   void set(T* buf_data, size_t buf_capacity) FMT_NOEXCEPT {
     ptr_ = buf_data;
@@ -663,7 +703,6 @@ template <typename T> class buffer {
 
   buffer(const buffer&) = delete;
   void operator=(const buffer&) = delete;
-  virtual ~buffer() = default;
 
   T* begin() FMT_NOEXCEPT { return ptr_; }
   T* end() FMT_NOEXCEPT { return ptr_ + size_; }
@@ -683,24 +722,26 @@ template <typename T> class buffer {
   /** Returns a pointer to the buffer data. */
   const T* data() const FMT_NOEXCEPT { return ptr_; }
 
-  /**
-    Resizes the buffer. If T is a POD type new elements may not be initialized.
-   */
-  void resize(size_t new_size) {
-    reserve(new_size);
-    size_ = new_size;
-  }
-
   /** Clears this buffer. */
   void clear() { size_ = 0; }
 
-  /** Reserves space to store at least *capacity* elements. */
-  void reserve(size_t new_capacity) {
+  // Tries resizing the buffer to contain *count* elements. If T is a POD type
+  // the new elements may not be initialized.
+  void try_resize(size_t count) {
+    try_reserve(count);
+    size_ = count <= capacity_ ? count : capacity_;
+  }
+
+  // Tries increasing the buffer capacity to *new_capacity*. It can increase the
+  // capacity by a smaller amount than requested but guarantees there is space
+  // for at least one additional element either by increasing the capacity or by
+  // flushing the buffer if it is full.
+  void try_reserve(size_t new_capacity) {
     if (new_capacity > capacity_) grow(new_capacity);
   }
 
   void push_back(const T& value) {
-    reserve(size_ + 1);
+    try_reserve(size_ + 1);
     ptr_[size_++] = value;
   }
 
@@ -713,32 +754,150 @@ template <typename T> class buffer {
   }
 };
 
-// A container-backed buffer.
+struct buffer_traits {
+  explicit buffer_traits(size_t) {}
+  size_t count() const { return 0; }
+  size_t limit(size_t size) { return size; }
+};
+
+class fixed_buffer_traits {
+ private:
+  size_t count_ = 0;  // running total of all sizes passed to limit()
+  size_t limit_;      // maximum number of elements allowed through
+
+ public:
+  explicit fixed_buffer_traits(size_t limit) : limit_(limit) {}
+  size_t count() const { return count_; }
+  size_t limit(size_t size) {  // returns how many of `size` elements still fit
+    size_t n = limit_ > count_ ? limit_ - count_ : 0;  // no unsigned wrap
+    count_ += size;
+    return size < n ? size : n;
+  }
+};
+
+// A buffer that writes to an output iterator when flushed.
+template <typename OutputIt, typename T, typename Traits = buffer_traits>
+class iterator_buffer final : public Traits, public buffer<T> {
+ private:
+  OutputIt out_;
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+
+ protected:
+  void grow(size_t) final FMT_OVERRIDE {
+    if (this->size() == buffer_size) flush();
+  }
+  void flush();
+
+ public:
+  explicit iterator_buffer(OutputIt out, size_t n = buffer_size)
+      : Traits(n),
+        buffer<T>(data_, 0, buffer_size),  // Traits(n), not capacity, caps output
+        out_(out) {}
+  ~iterator_buffer() { flush(); }
+
+  OutputIt out() {
+    flush();
+    return out_;
+  }
+  size_t count() const { return Traits::count() + this->size(); }
+};
+
+template <typename T> class iterator_buffer<T*, T> final : public buffer<T> {
+ protected:
+  void grow(size_t) final FMT_OVERRIDE {}
+
+ public:
+  explicit iterator_buffer(T* out, size_t = 0) : buffer<T>(out, 0, ~size_t()) {}
+
+  T* out() { return &*this->end(); }
+};
+
+// A buffer that writes to a container with the contiguous storage.
 template <typename Container>
-class container_buffer : public buffer<typename Container::value_type> {
+class iterator_buffer<std::back_insert_iterator<Container>,
+                      enable_if_t<is_contiguous<Container>::value,
+                                  typename Container::value_type>>
+    final : public buffer<typename Container::value_type> {
  private:
   Container& container_;
 
  protected:
-  void grow(size_t capacity) FMT_OVERRIDE {
+  void grow(size_t capacity) final FMT_OVERRIDE {
     container_.resize(capacity);
     this->set(&container_[0], capacity);
   }
 
  public:
-  explicit container_buffer(Container& c)
+  explicit iterator_buffer(Container& c)
       : buffer<typename Container::value_type>(c.size()), container_(c) {}
+  explicit iterator_buffer(std::back_insert_iterator<Container> out, size_t = 0)
+      : iterator_buffer(get_container(out)) {}
+  std::back_insert_iterator<Container> out() {
+    return std::back_inserter(container_);
+  }
 };
 
-// Extracts a reference to the container from back_insert_iterator.
-template <typename Container>
-inline Container& get_container(std::back_insert_iterator<Container> it) {
-  using bi_iterator = std::back_insert_iterator<Container>;
-  struct accessor : bi_iterator {
-    accessor(bi_iterator iter) : bi_iterator(iter) {}
-    using bi_iterator::container;
-  };
-  return *accessor(it).container;
+// A buffer that counts the number of code units written discarding the output.
+template <typename T = char> class counting_buffer final : public buffer<T> {
+ private:
+  enum { buffer_size = 256 };
+  T data_[buffer_size];
+  size_t count_ = 0;
+
+ protected:
+  void grow(size_t) final FMT_OVERRIDE {
+    if (this->size() != buffer_size) return;
+    count_ += this->size();
+    this->clear();
+  }
+
+ public:
+  counting_buffer() : buffer<T>(data_, 0, buffer_size) {}
+
+  size_t count() { return count_ + this->size(); }
+};
+
+// An output iterator that appends to the buffer.
+// It is used to reduce symbol sizes for the common case.
+template <typename T>
+class buffer_appender : public std::back_insert_iterator<buffer<T>> {
+  using base = std::back_insert_iterator<buffer<T>>;
+
+ public:
+  explicit buffer_appender(buffer<T>& buf) : base(buf) {}
+  buffer_appender(base it) : base(it) {}
+
+  buffer_appender& operator++() {
+    base::operator++();
+    return *this;
+  }
+
+  buffer_appender operator++(int) {
+    buffer_appender tmp = *this;
+    ++*this;
+    return tmp;
+  }
+};
+
+// Maps an output iterator into a buffer.
+template <typename T, typename OutputIt>
+iterator_buffer<OutputIt, T> get_buffer(OutputIt);
+template <typename T> buffer<T>& get_buffer(buffer_appender<T>);
+
+template <typename OutputIt> OutputIt get_buffer_init(OutputIt out) {
+  return out;
+}
+template <typename T> buffer<T>& get_buffer_init(buffer_appender<T> out) {
+  return get_container(out);
+}
+
+template <typename Buffer>
+auto get_iterator(Buffer& buf) -> decltype(buf.out()) {
+  return buf.out();
+}
+template <typename T> buffer_appender<T> get_iterator(buffer<T>& buf) {
+  return buffer_appender<T>(buf);
 }
 
 template <typename T, typename Char = char, typename Enable = void>
@@ -767,7 +926,8 @@ template <typename Char> struct named_arg_info {
 template <typename T, typename Char, size_t NUM_ARGS, size_t NUM_NAMED_ARGS>
 struct arg_data {
   // args_[0].named_args points to named_args_ to avoid bloating format_args.
-  T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : 1)];
+  // +1 to work around a bug in gcc 7.5 that causes duplicated-branches warning.
+  T args_[1 + (NUM_ARGS != 0 ? NUM_ARGS : +1)];
   named_arg_info<Char> named_args_[NUM_NAMED_ARGS];
 
   template <typename... U>
@@ -779,7 +939,8 @@ struct arg_data {
 
 template <typename T, typename Char, size_t NUM_ARGS>
 struct arg_data<T, Char, NUM_ARGS, 0> {
-  T args_[NUM_ARGS != 0 ? NUM_ARGS : 1];
+  // +1 to work around a bug in gcc 7.5 that causes duplicated-branches warning.
+  T args_[NUM_ARGS != 0 ? NUM_ARGS : +1];
 
   template <typename... U>
   FMT_INLINE arg_data(const U&... init) : args_{init...} {}
@@ -967,6 +1128,8 @@ enum { long_short = sizeof(long) == sizeof(int) };
 using long_type = conditional_t<long_short, int, long long>;
 using ulong_type = conditional_t<long_short, unsigned, unsigned long long>;
 
+struct unformattable {};
+
 // Maps formatting arguments to core types.
 template <typename Context> struct arg_mapper {
   using char_type = typename Context::char_type;
@@ -1075,15 +1238,7 @@ template <typename Context> struct arg_mapper {
     return map(val.value);
   }
 
-  int map(...) {
-    constexpr bool formattable = sizeof(Context) == 0;
-    static_assert(
-        formattable,
-        "Cannot format argument. To make type T formattable provide a "
-        "formatter<T> specialization: "
-        "https://fmt.dev/latest/api.html#formatting-user-defined-types");
-    return 0;
-  }
+  unformattable map(...) { return {}; }
 };
 
 // A type constant after applying arg_mapper<Context>.
@@ -1207,15 +1362,25 @@ FMT_CONSTEXPR_DECL FMT_INLINE auto visit_format_arg(
   return vis(monostate());
 }
 
-// Checks whether T is a container with contiguous storage.
-template <typename T> struct is_contiguous : std::false_type {};
-template <typename Char>
-struct is_contiguous<std::basic_string<Char>> : std::true_type {};
-template <typename Char>
-struct is_contiguous<detail::buffer<Char>> : std::true_type {};
+template <typename T> struct formattable : std::false_type {};
 
 namespace detail {
 
+// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
+template <typename... Ts> struct void_t_impl { using type = void; };
+template <typename... Ts>
+using void_t = typename detail::void_t_impl<Ts...>::type;
+
+template <typename It, typename T, typename Enable = void>
+struct is_output_iterator : std::false_type {};
+
+template <typename It, typename T>
+struct is_output_iterator<
+    It, T,
+    void_t<typename std::iterator_traits<It>::iterator_category,
+           decltype(*std::declval<It>() = std::declval<T>())>>
+    : std::true_type {};
+
 template <typename OutputIt>
 struct is_back_insert_iterator : std::false_type {};
 template <typename Container>
@@ -1227,6 +1392,9 @@ struct is_contiguous_back_insert_iterator : std::false_type {};
 template <typename Container>
 struct is_contiguous_back_insert_iterator<std::back_insert_iterator<Container>>
     : is_contiguous<Container> {};
+template <typename Char>
+struct is_contiguous_back_insert_iterator<buffer_appender<Char>>
+    : std::true_type {};
 
 // A type-erased reference to an std::locale to avoid heavy <locale> include.
 class locale_ref {
@@ -1258,13 +1426,24 @@ FMT_CONSTEXPR basic_format_arg<Context> make_arg(const T& value) {
   return arg;
 }
 
+template <typename T> int check(unformattable) {
+  static_assert(
+      formattable<T>(),
+      "Cannot format an argument. To make type T formattable provide a "
+      "formatter<T> specialization: https://fmt.dev/latest/api.html#udt");
+  return 0;
+}
+template <typename T, typename U> inline const U& check(const U& val) {
+  return val;
+}
+
 // The type template parameter is there to avoid an ODR violation when using
 // a fallback formatter in one translation unit and an implicit conversion in
 // another (not recommended).
 template <bool IS_PACKED, typename Context, type, typename T,
           FMT_ENABLE_IF(IS_PACKED)>
 inline value<Context> make_arg(const T& val) {
-  return arg_mapper<Context>().map(val);
+  return check<T>(arg_mapper<Context>().map(val));
 }
 
 template <bool IS_PACKED, typename Context, type, typename T,
@@ -1364,13 +1543,13 @@ template <typename OutputIt, typename Char> class basic_format_context {
 
 template <typename Char>
 using buffer_context =
-    basic_format_context<std::back_insert_iterator<detail::buffer<Char>>, Char>;
+    basic_format_context<detail::buffer_appender<Char>, Char>;
 using format_context = buffer_context<char>;
 using wformat_context = buffer_context<wchar_t>;
 
-// Workaround a bug in gcc: https://stackoverflow.com/q/62767544/471164.
+// Work around an alias issue: https://stackoverflow.com/q/62767544/471164.
 #define FMT_BUFFER_CONTEXT(Char) \
-  basic_format_context<std::back_insert_iterator<detail::buffer<Char>>, Char>
+  basic_format_context<detail::buffer_appender<Char>, Char>
 
 /**
   \rst
@@ -1422,7 +1601,7 @@ class format_arg_store
 
 /**
   \rst
-  Constructs an `~fmt::format_arg_store` object that contains references to
+  Constructs a `~fmt::format_arg_store` object that contains references to
   arguments and can be implicitly converted to `~fmt::format_args`. `Context`
   can be omitted in which case it defaults to `~fmt::context`.
   See `~fmt::arg` for lifetime considerations.
@@ -1434,6 +1613,27 @@ inline format_arg_store<Context, Args...> make_format_args(
   return {args...};
 }
 
+/**
+  \rst
+  Constructs a `~fmt::format_arg_store` object that contains references
+  to arguments and can be implicitly converted to `~fmt::format_args`.
+  If ``format_str`` is a compile-time string then `make_args_checked` checks
+  its validity at compile time.
+  \endrst
+ */
+template <typename... Args, typename S, typename Char = char_t<S>>
+inline auto make_args_checked(const S& format_str,
+                              const remove_reference_t<Args>&... args)
+    -> format_arg_store<buffer_context<Char>, remove_reference_t<Args>...> {
+  static_assert(
+      detail::count<(
+              std::is_base_of<detail::view, remove_reference_t<Args>>::value &&
+              std::is_reference<Args>::value)...>() == 0,
+      "passing views as lvalues is disallowed");
+  detail::check_format_string<Args...>(format_str);
+  return {args...};
+}
+
 /**
   \rst
   Returns a named argument to be used in a formatting function. It should only
@@ -1749,29 +1949,6 @@ struct wformat_args : basic_format_args<wformat_context> {
 
 namespace detail {
 
-// Reports a compile-time error if S is not a valid format string.
-template <typename..., typename S, FMT_ENABLE_IF(!is_compile_string<S>::value)>
-FMT_INLINE void check_format_string(const S&) {
-#ifdef FMT_ENFORCE_COMPILE_STRING
-  static_assert(is_compile_string<S>::value,
-                "FMT_ENFORCE_COMPILE_STRING requires all format strings to use "
-                "FMT_STRING.");
-#endif
-}
-template <typename..., typename S, FMT_ENABLE_IF(is_compile_string<S>::value)>
-void check_format_string(S);
-
-template <typename... Args, typename S, typename Char = char_t<S>>
-inline format_arg_store<buffer_context<Char>, remove_reference_t<Args>...>
-make_args_checked(const S& format_str,
-                  const remove_reference_t<Args>&... args) {
-  static_assert(count<(std::is_base_of<view, remove_reference_t<Args>>::value &&
-                       std::is_reference<Args>::value)...>() == 0,
-                "passing views as lvalues is disallowed");
-  check_format_string<Args...>(format_str);
-  return {args...};
-}
-
 template <typename Char, FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
 std::basic_string<Char> vformat(
     basic_string_view<Char> format_str,
@@ -1780,9 +1957,10 @@ std::basic_string<Char> vformat(
 FMT_API std::string vformat(string_view format_str, format_args args);
 
 template <typename Char>
-typename FMT_BUFFER_CONTEXT(Char)::iterator vformat_to(
+void vformat_to(
     buffer<Char>& buf, basic_string_view<Char> format_str,
-    basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args);
+    basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args,
+    detail::locale_ref loc = {});
 
 template <typename Char, typename Args,
           FMT_ENABLE_IF(!std::is_same<Char, char>::value)>
@@ -1797,26 +1975,80 @@ inline void vprint_mojibake(std::FILE*, string_view, format_args) {}
 /** Formats a string and writes the output to ``out``. */
 // GCC 8 and earlier cannot handle std::back_insert_iterator<Container> with
 // vformat_to<ArgFormatter>(...) overload, so SFINAE on iterator type instead.
-template <
-    typename OutputIt, typename S, typename Char = char_t<S>,
-    FMT_ENABLE_IF(detail::is_contiguous_back_insert_iterator<OutputIt>::value)>
+template <typename OutputIt, typename S, typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
 OutputIt vformat_to(
     OutputIt out, const S& format_str,
     basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  auto& c = detail::get_container(out);
-  detail::container_buffer<remove_reference_t<decltype(c)>> buf(c);
+  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
   detail::vformat_to(buf, to_string_view(format_str), args);
-  return out;
+  return detail::get_iterator(buf);
 }
 
-template <typename Container, typename S, typename... Args,
-          FMT_ENABLE_IF(
-              is_contiguous<Container>::value&& detail::is_string<S>::value)>
-inline std::back_insert_iterator<Container> format_to(
-    std::back_insert_iterator<Container> out, const S& format_str,
-    Args&&... args) {
-  return vformat_to(out, to_string_view(format_str),
-                    detail::make_args_checked<Args...>(format_str, args...));
+/**
+ \rst
+ Formats arguments, writes the result to the output iterator ``out`` and returns
+ the iterator past the end of the output range.
+
+ **Example**::
+
+   std::vector<char> out;
+   fmt::format_to(std::back_inserter(out), "{}", 42);
+ \endrst
+ */
+// We cannot use FMT_ENABLE_IF because of a bug in gcc 8.3.
+template <typename OutputIt, typename S, typename... Args,
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
+inline auto format_to(OutputIt out, const S& format_str, Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to(out, to_string_view(format_str), vargs);
+}
+
+template <typename OutputIt> struct format_to_n_result {
+  /** Iterator past the end of the output range. */
+  OutputIt out;
+  /** Total (not truncated) output size. */
+  size_t size;
+};
+
+template <typename OutputIt, typename Char, typename... Args,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
+inline format_to_n_result<OutputIt> vformat_to_n(
+    OutputIt out, size_t n, basic_string_view<Char> format_str,
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  detail::iterator_buffer<OutputIt, Char, detail::fixed_buffer_traits> buf(out,
+                                                                           n);
+  detail::vformat_to(buf, format_str, args);
+  return {buf.out(), buf.count()};
+}
+
+/**
+ \rst
+ Formats arguments, writes up to ``n`` characters of the result to the output
+ iterator ``out`` and returns the total output size and the iterator past the
+ end of the output range.
+ \endrst
+ */
+template <typename OutputIt, typename S, typename... Args,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char_t<S>>::value)>
+inline format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
+                                                const S& format_str,
+                                                const Args&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to_n(out, n, to_string_view(format_str), vargs);
+}
+
+/**
+  Returns the number of characters in the output of
+  ``format(format_str, args...)``.
+ */
+template <typename... Args>
+inline size_t formatted_size(string_view format_str, Args&&... args) {
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  detail::counting_buffer<> buf;
+  detail::vformat_to(buf, format_str, vargs);
+  return buf.count();
 }
 
 template <typename S, typename Char = char_t<S>>
@@ -1840,7 +2072,7 @@ FMT_INLINE std::basic_string<Char> vformat(
 // std::basic_string<char_t<S>> to reduce the symbol size.
 template <typename S, typename... Args, typename Char = char_t<S>>
 FMT_INLINE std::basic_string<Char> format(const S& format_str, Args&&... args) {
-  const auto& vargs = detail::make_args_checked<Args...>(format_str, args...);
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
   return detail::vformat(to_string_view(format_str), vargs);
 }
 
@@ -1860,7 +2092,7 @@ FMT_API void vprint(std::FILE*, string_view, format_args);
  */
 template <typename S, typename... Args, typename Char = char_t<S>>
 inline void print(std::FILE* f, const S& format_str, Args&&... args) {
-  const auto& vargs = detail::make_args_checked<Args...>(format_str, args...);
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
   return detail::is_unicode<Char>()
              ? vprint(f, to_string_view(format_str), vargs)
              : detail::vprint_mojibake(f, to_string_view(format_str), vargs);
@@ -1879,7 +2111,7 @@ inline void print(std::FILE* f, const S& format_str, Args&&... args) {
  */
 template <typename S, typename... Args, typename Char = char_t<S>>
 inline void print(const S& format_str, Args&&... args) {
-  const auto& vargs = detail::make_args_checked<Args...>(format_str, args...);
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
   return detail::is_unicode<Char>()
              ? vprint(to_string_view(format_str), vargs)
              : detail::vprint_mojibake(stdout, to_string_view(format_str),
diff --git a/src/fmt/format-inl.h b/src/fmt/format-inl.h
index d8c9c8a5ee..b7cb3209c8 100644
--- a/src/fmt/format-inl.h
+++ b/src/fmt/format-inl.h
@@ -13,32 +13,19 @@
 #include <climits>
 #include <cmath>
 #include <cstdarg>
-#include <cstring>  // for std::memmove
+#include <cstring>  // std::memmove
 #include <cwchar>
 #include <exception>
 
-#include "format.h"
-#if !defined(FMT_STATIC_THOUSANDS_SEPARATOR)
+#ifndef FMT_STATIC_THOUSANDS_SEPARATOR
 #  include <locale>
 #endif
 
 #ifdef _WIN32
-#  if !defined(NOMINMAX) && !defined(WIN32_LEAN_AND_MEAN)
-#    define NOMINMAX
-#    define WIN32_LEAN_AND_MEAN
-#    include <windows.h>
-#    undef WIN32_LEAN_AND_MEAN
-#    undef NOMINMAX
-#  else
-#    include <windows.h>
-#  endif
-#  include <io.h>
+#  include <io.h>  // _isatty
 #endif
 
-#ifdef _MSC_VER
-#  pragma warning(push)
-#  pragma warning(disable : 4702)  // unreachable code
-#endif
+#include "format.h"
 
 // Dummy implementations of strerror_r and strerror_s called if corresponding
 // system functions are not available.
@@ -79,8 +66,8 @@ inline int fmt_snprintf(char* buffer, size_t size, const char* format, ...) {
 //   ERANGE - buffer is not large enough to store the error message
 //   other  - failure
 // Buffer should be at least of size 1.
-FMT_FUNC int safe_strerror(int error_code, char*& buffer,
-                           size_t buffer_size) FMT_NOEXCEPT {
+inline int safe_strerror(int error_code, char*& buffer,
+                         size_t buffer_size) FMT_NOEXCEPT {
   FMT_ASSERT(buffer != nullptr && buffer_size != 0, "invalid buffer");
 
   class dispatcher {
@@ -145,7 +132,7 @@ FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
   // Report error code making sure that the output fits into
   // inline_buffer_size to avoid dynamic memory allocation and potential
   // bad_alloc.
-  out.resize(0);
+  out.try_resize(0);
   static const char SEP[] = ": ";
   static const char ERROR_STR[] = "error ";
   // Subtract 2 to account for terminating null characters in SEP and ERROR_STR.
@@ -156,7 +143,7 @@ FMT_FUNC void format_error_code(detail::buffer<char>& out, int error_code,
     ++error_code_size;
   }
   error_code_size += detail::to_unsigned(detail::count_digits(abs_value));
-  auto it = std::back_inserter(out);
+  auto it = buffer_appender<char>(out);
   if (message.size() <= inline_buffer_size - error_code_size)
     format_to(it, "{}{}", message, SEP);
   format_to(it, "{}{}", ERROR_STR, error_code);
@@ -173,8 +160,8 @@ FMT_FUNC void report_error(format_func func, int error_code,
 }
 
 // A wrapper around fwrite that throws on error.
-FMT_FUNC void fwrite_fully(const void* ptr, size_t size, size_t count,
-                           FILE* stream) {
+inline void fwrite_fully(const void* ptr, size_t size, size_t count,
+                         FILE* stream) {
   size_t written = std::fwrite(ptr, size, count, stream);
   if (written < count) FMT_THROW(system_error(errno, "cannot write to file"));
 }
@@ -242,26 +229,23 @@ template <> FMT_FUNC int count_digits<4>(detail::fallback_uintptr n) {
 
 template <typename T>
 const typename basic_data<T>::digit_pair basic_data<T>::digits[] = {
-    {'0', '0'},  {'0', '1'},  {'0', '2'},  {'0', '3'},  {'0', '4'},
-    {'0', '5'},  {'0', '6'},  {'0', '7'},  {'0', '8'},  {'0', '9'},
-    {'1', '0'},  {'1', '1'},  {'1', '2'},  {'1', '3'},  {'1', '4'},
-    {'1', '5'},  {'1', '6'},  {'1', '7'},  {'1', '8'},  {'1', '9'},
-    {'2', '0'},  {'2', '1'},  {'2', '2'},  {'2', '3'},  {'2', '4'},
-    {'2', '5'},  {'2', '6'},  {'2', '7'},  {'2', '8'},  {'2', '9'},
-    {'3', '0'},  {'3', '1'},  {'3', '2'},  {'3', '3'},  {'3', '4'},
-    {'3', '5'},  {'3', '6'},  {'3', '7'},  {'3', '8'},  {'3', '9'},
-    {'4', '0'},  {'4', '1'},  {'4', '2'},  {'4', '3'},  {'4', '4'},
-    {'4', '5'},  {'4', '6'},  {'4', '7'},  {'4', '8'},  {'4', '9'},
-    {'5', '0'},  {'5', '1'},  {'5', '2'},  {'5', '3'},  {'5', '4'},
-    {'5', '5'},  {'5', '6'},  {'5', '7'},  {'5', '8'},  {'5', '9'},
-    {'6', '0'},  {'6', '1'},  {'6', '2'},  {'6', '3'},  {'6', '4'},
-    {'6', '5'},  {'6', '6'},  {'6', '7'},  {'6', '8'},  {'6', '9'},
-    {'7', '0'},  {'7', '1'},  {'7', '2'},  {'7', '3'},  {'7', '4'},
-    {'7', '5'},  {'7', '6'},  {'7', '7'},  {'7', '8'},  {'7', '9'},
-    {'8', '0'},  {'8', '1'},  {'8', '2'},  {'8', '3'},  {'8', '4'},
-    {'8', '5'},  {'8', '6'},  {'8', '7'},  {'8', '8'},  {'8', '9'},
-    {'9', '0'},  {'9', '1'},  {'9', '2'},  {'9', '3'},  {'9', '4'},
-    {'9', '5'},  {'9', '6'},  {'9', '7'},  {'9', '8'},  {'9', '9'}};
+    {'0', '0'}, {'0', '1'}, {'0', '2'}, {'0', '3'}, {'0', '4'}, {'0', '5'},
+    {'0', '6'}, {'0', '7'}, {'0', '8'}, {'0', '9'}, {'1', '0'}, {'1', '1'},
+    {'1', '2'}, {'1', '3'}, {'1', '4'}, {'1', '5'}, {'1', '6'}, {'1', '7'},
+    {'1', '8'}, {'1', '9'}, {'2', '0'}, {'2', '1'}, {'2', '2'}, {'2', '3'},
+    {'2', '4'}, {'2', '5'}, {'2', '6'}, {'2', '7'}, {'2', '8'}, {'2', '9'},
+    {'3', '0'}, {'3', '1'}, {'3', '2'}, {'3', '3'}, {'3', '4'}, {'3', '5'},
+    {'3', '6'}, {'3', '7'}, {'3', '8'}, {'3', '9'}, {'4', '0'}, {'4', '1'},
+    {'4', '2'}, {'4', '3'}, {'4', '4'}, {'4', '5'}, {'4', '6'}, {'4', '7'},
+    {'4', '8'}, {'4', '9'}, {'5', '0'}, {'5', '1'}, {'5', '2'}, {'5', '3'},
+    {'5', '4'}, {'5', '5'}, {'5', '6'}, {'5', '7'}, {'5', '8'}, {'5', '9'},
+    {'6', '0'}, {'6', '1'}, {'6', '2'}, {'6', '3'}, {'6', '4'}, {'6', '5'},
+    {'6', '6'}, {'6', '7'}, {'6', '8'}, {'6', '9'}, {'7', '0'}, {'7', '1'},
+    {'7', '2'}, {'7', '3'}, {'7', '4'}, {'7', '5'}, {'7', '6'}, {'7', '7'},
+    {'7', '8'}, {'7', '9'}, {'8', '0'}, {'8', '1'}, {'8', '2'}, {'8', '3'},
+    {'8', '4'}, {'8', '5'}, {'8', '6'}, {'8', '7'}, {'8', '8'}, {'8', '9'},
+    {'9', '0'}, {'9', '1'}, {'9', '2'}, {'9', '3'}, {'9', '4'}, {'9', '5'},
+    {'9', '6'}, {'9', '7'}, {'9', '8'}, {'9', '9'}};
 
 template <typename T>
 const char basic_data<T>::hex_digits[] = "0123456789abcdef";
@@ -277,18 +261,18 @@ const uint64_t basic_data<T>::powers_of_10_64[] = {
     10000000000000000000ULL};
 
 template <typename T>
-const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0,
+const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0, 0,
                                                            FMT_POWERS_OF_10(1)};
 
 template <typename T>
 const uint64_t basic_data<T>::zero_or_powers_of_10_64[] = {
-    0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
+    0, 0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
     10000000000000000000ULL};
 
 // Normalized 64-bit significands of pow(10, k), for k = -348, -340, ..., 340.
 // These are generated by support/compute-powers.py.
 template <typename T>
-const uint64_t basic_data<T>::pow10_significands[] = {
+const uint64_t basic_data<T>::grisu_pow10_significands[] = {
     0xfa8fd5a0081c0288, 0xbaaee17fa23ebf76, 0x8b16fb203055ac76,
     0xcf42894a5dce35ea, 0x9a6bb0aa55653b2d, 0xe61acf033d1a45df,
     0xab70fe17c79ac6ca, 0xff77b1fcbebcdc4f, 0xbe5691ef416bd60c,
@@ -323,7 +307,7 @@ const uint64_t basic_data<T>::pow10_significands[] = {
 // Binary exponents of pow(10, k), for k = -348, -340, ..., 340, corresponding
 // to significands above.
 template <typename T>
-const int16_t basic_data<T>::pow10_exponents[] = {
+const int16_t basic_data<T>::grisu_pow10_exponents[] = {
     -1220, -1193, -1166, -1140, -1113, -1087, -1060, -1034, -1007, -980, -954,
     -927,  -901,  -874,  -847,  -821,  -794,  -768,  -741,  -715,  -688, -661,
     -635,  -608,  -582,  -555,  -529,  -502,  -475,  -449,  -422,  -396, -369,
@@ -333,6 +317,744 @@ const int16_t basic_data<T>::pow10_exponents[] = {
     534,   561,   588,   614,   641,   667,   694,   720,   747,   774,  800,
     827,   853,   880,   907,   933,   960,   986,   1013,  1039,  1066};
 
+template <typename T>
+const divtest_table_entry<uint32_t> basic_data<T>::divtest_table_for_pow5_32[] =
+    {{0x00000001, 0xffffffff}, {0xcccccccd, 0x33333333},
+     {0xc28f5c29, 0x0a3d70a3}, {0x26e978d5, 0x020c49ba},
+     {0x3afb7e91, 0x0068db8b}, {0x0bcbe61d, 0x0014f8b5},
+     {0x68c26139, 0x000431bd}, {0xae8d46a5, 0x0000d6bf},
+     {0x22e90e21, 0x00002af3}, {0x3a2e9c6d, 0x00000897},
+     {0x3ed61f49, 0x000001b7}};
+
+template <typename T>
+const divtest_table_entry<uint64_t> basic_data<T>::divtest_table_for_pow5_64[] =
+    {{0x0000000000000001, 0xffffffffffffffff},
+     {0xcccccccccccccccd, 0x3333333333333333},
+     {0x8f5c28f5c28f5c29, 0x0a3d70a3d70a3d70},
+     {0x1cac083126e978d5, 0x020c49ba5e353f7c},
+     {0xd288ce703afb7e91, 0x0068db8bac710cb2},
+     {0x5d4e8fb00bcbe61d, 0x0014f8b588e368f0},
+     {0x790fb65668c26139, 0x000431bde82d7b63},
+     {0xe5032477ae8d46a5, 0x0000d6bf94d5e57a},
+     {0xc767074b22e90e21, 0x00002af31dc46118},
+     {0x8e47ce423a2e9c6d, 0x0000089705f4136b},
+     {0x4fa7f60d3ed61f49, 0x000001b7cdfd9d7b},
+     {0x0fee64690c913975, 0x00000057f5ff85e5},
+     {0x3662e0e1cf503eb1, 0x000000119799812d},
+     {0xa47a2cf9f6433fbd, 0x0000000384b84d09},
+     {0x54186f653140a659, 0x00000000b424dc35},
+     {0x7738164770402145, 0x0000000024075f3d},
+     {0xe4a4d1417cd9a041, 0x000000000734aca5},
+     {0xc75429d9e5c5200d, 0x000000000170ef54},
+     {0xc1773b91fac10669, 0x000000000049c977},
+     {0x26b172506559ce15, 0x00000000000ec1e4},
+     {0xd489e3a9addec2d1, 0x000000000002f394},
+     {0x90e860bb892c8d5d, 0x000000000000971d},
+     {0x502e79bf1b6f4f79, 0x0000000000001e39},
+     {0xdcd618596be30fe5, 0x000000000000060b}};
+
+template <typename T>
+const uint64_t basic_data<T>::dragonbox_pow10_significands_64[] = {
+    0x81ceb32c4b43fcf5, 0xa2425ff75e14fc32, 0xcad2f7f5359a3b3f,
+    0xfd87b5f28300ca0e, 0x9e74d1b791e07e49, 0xc612062576589ddb,
+    0xf79687aed3eec552, 0x9abe14cd44753b53, 0xc16d9a0095928a28,
+    0xf1c90080baf72cb2, 0x971da05074da7bef, 0xbce5086492111aeb,
+    0xec1e4a7db69561a6, 0x9392ee8e921d5d08, 0xb877aa3236a4b44a,
+    0xe69594bec44de15c, 0x901d7cf73ab0acda, 0xb424dc35095cd810,
+    0xe12e13424bb40e14, 0x8cbccc096f5088cc, 0xafebff0bcb24aaff,
+    0xdbe6fecebdedd5bf, 0x89705f4136b4a598, 0xabcc77118461cefd,
+    0xd6bf94d5e57a42bd, 0x8637bd05af6c69b6, 0xa7c5ac471b478424,
+    0xd1b71758e219652c, 0x83126e978d4fdf3c, 0xa3d70a3d70a3d70b,
+    0xcccccccccccccccd, 0x8000000000000000, 0xa000000000000000,
+    0xc800000000000000, 0xfa00000000000000, 0x9c40000000000000,
+    0xc350000000000000, 0xf424000000000000, 0x9896800000000000,
+    0xbebc200000000000, 0xee6b280000000000, 0x9502f90000000000,
+    0xba43b74000000000, 0xe8d4a51000000000, 0x9184e72a00000000,
+    0xb5e620f480000000, 0xe35fa931a0000000, 0x8e1bc9bf04000000,
+    0xb1a2bc2ec5000000, 0xde0b6b3a76400000, 0x8ac7230489e80000,
+    0xad78ebc5ac620000, 0xd8d726b7177a8000, 0x878678326eac9000,
+    0xa968163f0a57b400, 0xd3c21bcecceda100, 0x84595161401484a0,
+    0xa56fa5b99019a5c8, 0xcecb8f27f4200f3a, 0x813f3978f8940984,
+    0xa18f07d736b90be5, 0xc9f2c9cd04674ede, 0xfc6f7c4045812296,
+    0x9dc5ada82b70b59d, 0xc5371912364ce305, 0xf684df56c3e01bc6,
+    0x9a130b963a6c115c, 0xc097ce7bc90715b3, 0xf0bdc21abb48db20,
+    0x96769950b50d88f4, 0xbc143fa4e250eb31, 0xeb194f8e1ae525fd,
+    0x92efd1b8d0cf37be, 0xb7abc627050305ad, 0xe596b7b0c643c719,
+    0x8f7e32ce7bea5c6f, 0xb35dbf821ae4f38b, 0xe0352f62a19e306e};
+
+template <typename T>
+const uint128_wrapper basic_data<T>::dragonbox_pow10_significands_128[] = {
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+    {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+    {0x9faacf3df73609b1, 0x77b191618c54e9ad},
+    {0xc795830d75038c1d, 0xd59df5b9ef6a2418},
+    {0xf97ae3d0d2446f25, 0x4b0573286b44ad1e},
+    {0x9becce62836ac577, 0x4ee367f9430aec33},
+    {0xc2e801fb244576d5, 0x229c41f793cda740},
+    {0xf3a20279ed56d48a, 0x6b43527578c11110},
+    {0x9845418c345644d6, 0x830a13896b78aaaa},
+    {0xbe5691ef416bd60c, 0x23cc986bc656d554},
+    {0xedec366b11c6cb8f, 0x2cbfbe86b7ec8aa9},
+    {0x94b3a202eb1c3f39, 0x7bf7d71432f3d6aa},
+    {0xb9e08a83a5e34f07, 0xdaf5ccd93fb0cc54},
+    {0xe858ad248f5c22c9, 0xd1b3400f8f9cff69},
+    {0x91376c36d99995be, 0x23100809b9c21fa2},
+    {0xb58547448ffffb2d, 0xabd40a0c2832a78b},
+    {0xe2e69915b3fff9f9, 0x16c90c8f323f516d},
+    {0x8dd01fad907ffc3b, 0xae3da7d97f6792e4},
+    {0xb1442798f49ffb4a, 0x99cd11cfdf41779d},
+    {0xdd95317f31c7fa1d, 0x40405643d711d584},
+    {0x8a7d3eef7f1cfc52, 0x482835ea666b2573},
+    {0xad1c8eab5ee43b66, 0xda3243650005eed0},
+    {0xd863b256369d4a40, 0x90bed43e40076a83},
+    {0x873e4f75e2224e68, 0x5a7744a6e804a292},
+    {0xa90de3535aaae202, 0x711515d0a205cb37},
+    {0xd3515c2831559a83, 0x0d5a5b44ca873e04},
+    {0x8412d9991ed58091, 0xe858790afe9486c3},
+    {0xa5178fff668ae0b6, 0x626e974dbe39a873},
+    {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+    {0x80fa687f881c7f8e, 0x7ce66634bc9d0b9a},
+    {0xa139029f6a239f72, 0x1c1fffc1ebc44e81},
+    {0xc987434744ac874e, 0xa327ffb266b56221},
+    {0xfbe9141915d7a922, 0x4bf1ff9f0062baa9},
+    {0x9d71ac8fada6c9b5, 0x6f773fc3603db4aa},
+    {0xc4ce17b399107c22, 0xcb550fb4384d21d4},
+    {0xf6019da07f549b2b, 0x7e2a53a146606a49},
+    {0x99c102844f94e0fb, 0x2eda7444cbfc426e},
+    {0xc0314325637a1939, 0xfa911155fefb5309},
+    {0xf03d93eebc589f88, 0x793555ab7eba27cb},
+    {0x96267c7535b763b5, 0x4bc1558b2f3458df},
+    {0xbbb01b9283253ca2, 0x9eb1aaedfb016f17},
+    {0xea9c227723ee8bcb, 0x465e15a979c1cadd},
+    {0x92a1958a7675175f, 0x0bfacd89ec191eca},
+    {0xb749faed14125d36, 0xcef980ec671f667c},
+    {0xe51c79a85916f484, 0x82b7e12780e7401b},
+    {0x8f31cc0937ae58d2, 0xd1b2ecb8b0908811},
+    {0xb2fe3f0b8599ef07, 0x861fa7e6dcb4aa16},
+    {0xdfbdcece67006ac9, 0x67a791e093e1d49b},
+    {0x8bd6a141006042bd, 0xe0c8bb2c5c6d24e1},
+    {0xaecc49914078536d, 0x58fae9f773886e19},
+    {0xda7f5bf590966848, 0xaf39a475506a899f},
+    {0x888f99797a5e012d, 0x6d8406c952429604},
+    {0xaab37fd7d8f58178, 0xc8e5087ba6d33b84},
+    {0xd5605fcdcf32e1d6, 0xfb1e4a9a90880a65},
+    {0x855c3be0a17fcd26, 0x5cf2eea09a550680},
+    {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+    {0xd0601d8efc57b08b, 0xf13b94daf124da27},
+    {0x823c12795db6ce57, 0x76c53d08d6b70859},
+    {0xa2cb1717b52481ed, 0x54768c4b0c64ca6f},
+    {0xcb7ddcdda26da268, 0xa9942f5dcf7dfd0a},
+    {0xfe5d54150b090b02, 0xd3f93b35435d7c4d},
+    {0x9efa548d26e5a6e1, 0xc47bc5014a1a6db0},
+    {0xc6b8e9b0709f109a, 0x359ab6419ca1091c},
+    {0xf867241c8cc6d4c0, 0xc30163d203c94b63},
+    {0x9b407691d7fc44f8, 0x79e0de63425dcf1e},
+    {0xc21094364dfb5636, 0x985915fc12f542e5},
+    {0xf294b943e17a2bc4, 0x3e6f5b7b17b2939e},
+    {0x979cf3ca6cec5b5a, 0xa705992ceecf9c43},
+    {0xbd8430bd08277231, 0x50c6ff782a838354},
+    {0xece53cec4a314ebd, 0xa4f8bf5635246429},
+    {0x940f4613ae5ed136, 0x871b7795e136be9a},
+    {0xb913179899f68584, 0x28e2557b59846e40},
+    {0xe757dd7ec07426e5, 0x331aeada2fe589d0},
+    {0x9096ea6f3848984f, 0x3ff0d2c85def7622},
+    {0xb4bca50b065abe63, 0x0fed077a756b53aa},
+    {0xe1ebce4dc7f16dfb, 0xd3e8495912c62895},
+    {0x8d3360f09cf6e4bd, 0x64712dd7abbbd95d},
+    {0xb080392cc4349dec, 0xbd8d794d96aacfb4},
+    {0xdca04777f541c567, 0xecf0d7a0fc5583a1},
+    {0x89e42caaf9491b60, 0xf41686c49db57245},
+    {0xac5d37d5b79b6239, 0x311c2875c522ced6},
+    {0xd77485cb25823ac7, 0x7d633293366b828c},
+    {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+    {0xa8530886b54dbdeb, 0xd9f57f830283fdfd},
+    {0xd267caa862a12d66, 0xd072df63c324fd7c},
+    {0x8380dea93da4bc60, 0x4247cb9e59f71e6e},
+    {0xa46116538d0deb78, 0x52d9be85f074e609},
+    {0xcd795be870516656, 0x67902e276c921f8c},
+    {0x806bd9714632dff6, 0x00ba1cd8a3db53b7},
+    {0xa086cfcd97bf97f3, 0x80e8a40eccd228a5},
+    {0xc8a883c0fdaf7df0, 0x6122cd128006b2ce},
+    {0xfad2a4b13d1b5d6c, 0x796b805720085f82},
+    {0x9cc3a6eec6311a63, 0xcbe3303674053bb1},
+    {0xc3f490aa77bd60fc, 0xbedbfc4411068a9d},
+    {0xf4f1b4d515acb93b, 0xee92fb5515482d45},
+    {0x991711052d8bf3c5, 0x751bdd152d4d1c4b},
+    {0xbf5cd54678eef0b6, 0xd262d45a78a0635e},
+    {0xef340a98172aace4, 0x86fb897116c87c35},
+    {0x9580869f0e7aac0e, 0xd45d35e6ae3d4da1},
+    {0xbae0a846d2195712, 0x8974836059cca10a},
+    {0xe998d258869facd7, 0x2bd1a438703fc94c},
+    {0x91ff83775423cc06, 0x7b6306a34627ddd0},
+    {0xb67f6455292cbf08, 0x1a3bc84c17b1d543},
+    {0xe41f3d6a7377eeca, 0x20caba5f1d9e4a94},
+    {0x8e938662882af53e, 0x547eb47b7282ee9d},
+    {0xb23867fb2a35b28d, 0xe99e619a4f23aa44},
+    {0xdec681f9f4c31f31, 0x6405fa00e2ec94d5},
+    {0x8b3c113c38f9f37e, 0xde83bc408dd3dd05},
+    {0xae0b158b4738705e, 0x9624ab50b148d446},
+    {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+    {0x87f8a8d4cfa417c9, 0xe54ca5d70a80e5d7},
+    {0xa9f6d30a038d1dbc, 0x5e9fcf4ccd211f4d},
+    {0xd47487cc8470652b, 0x7647c32000696720},
+    {0x84c8d4dfd2c63f3b, 0x29ecd9f40041e074},
+    {0xa5fb0a17c777cf09, 0xf468107100525891},
+    {0xcf79cc9db955c2cc, 0x7182148d4066eeb5},
+    {0x81ac1fe293d599bf, 0xc6f14cd848405531},
+    {0xa21727db38cb002f, 0xb8ada00e5a506a7d},
+    {0xca9cf1d206fdc03b, 0xa6d90811f0e4851d},
+    {0xfd442e4688bd304a, 0x908f4a166d1da664},
+    {0x9e4a9cec15763e2e, 0x9a598e4e043287ff},
+    {0xc5dd44271ad3cdba, 0x40eff1e1853f29fe},
+    {0xf7549530e188c128, 0xd12bee59e68ef47d},
+    {0x9a94dd3e8cf578b9, 0x82bb74f8301958cf},
+    {0xc13a148e3032d6e7, 0xe36a52363c1faf02},
+    {0xf18899b1bc3f8ca1, 0xdc44e6c3cb279ac2},
+    {0x96f5600f15a7b7e5, 0x29ab103a5ef8c0ba},
+    {0xbcb2b812db11a5de, 0x7415d448f6b6f0e8},
+    {0xebdf661791d60f56, 0x111b495b3464ad22},
+    {0x936b9fcebb25c995, 0xcab10dd900beec35},
+    {0xb84687c269ef3bfb, 0x3d5d514f40eea743},
+    {0xe65829b3046b0afa, 0x0cb4a5a3112a5113},
+    {0x8ff71a0fe2c2e6dc, 0x47f0e785eaba72ac},
+    {0xb3f4e093db73a093, 0x59ed216765690f57},
+    {0xe0f218b8d25088b8, 0x306869c13ec3532d},
+    {0x8c974f7383725573, 0x1e414218c73a13fc},
+    {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+    {0xdbac6c247d62a583, 0xdf45f746b74abf3a},
+    {0x894bc396ce5da772, 0x6b8bba8c328eb784},
+    {0xab9eb47c81f5114f, 0x066ea92f3f326565},
+    {0xd686619ba27255a2, 0xc80a537b0efefebe},
+    {0x8613fd0145877585, 0xbd06742ce95f5f37},
+    {0xa798fc4196e952e7, 0x2c48113823b73705},
+    {0xd17f3b51fca3a7a0, 0xf75a15862ca504c6},
+    {0x82ef85133de648c4, 0x9a984d73dbe722fc},
+    {0xa3ab66580d5fdaf5, 0xc13e60d0d2e0ebbb},
+    {0xcc963fee10b7d1b3, 0x318df905079926a9},
+    {0xffbbcfe994e5c61f, 0xfdf17746497f7053},
+    {0x9fd561f1fd0f9bd3, 0xfeb6ea8bedefa634},
+    {0xc7caba6e7c5382c8, 0xfe64a52ee96b8fc1},
+    {0xf9bd690a1b68637b, 0x3dfdce7aa3c673b1},
+    {0x9c1661a651213e2d, 0x06bea10ca65c084f},
+    {0xc31bfa0fe5698db8, 0x486e494fcff30a63},
+    {0xf3e2f893dec3f126, 0x5a89dba3c3efccfb},
+    {0x986ddb5c6b3a76b7, 0xf89629465a75e01d},
+    {0xbe89523386091465, 0xf6bbb397f1135824},
+    {0xee2ba6c0678b597f, 0x746aa07ded582e2d},
+    {0x94db483840b717ef, 0xa8c2a44eb4571cdd},
+    {0xba121a4650e4ddeb, 0x92f34d62616ce414},
+    {0xe896a0d7e51e1566, 0x77b020baf9c81d18},
+    {0x915e2486ef32cd60, 0x0ace1474dc1d122f},
+    {0xb5b5ada8aaff80b8, 0x0d819992132456bb},
+    {0xe3231912d5bf60e6, 0x10e1fff697ed6c6a},
+    {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+    {0xb1736b96b6fd83b3, 0xbd308ff8a6b17cb3},
+    {0xddd0467c64bce4a0, 0xac7cb3f6d05ddbdf},
+    {0x8aa22c0dbef60ee4, 0x6bcdf07a423aa96c},
+    {0xad4ab7112eb3929d, 0x86c16c98d2c953c7},
+    {0xd89d64d57a607744, 0xe871c7bf077ba8b8},
+    {0x87625f056c7c4a8b, 0x11471cd764ad4973},
+    {0xa93af6c6c79b5d2d, 0xd598e40d3dd89bd0},
+    {0xd389b47879823479, 0x4aff1d108d4ec2c4},
+    {0x843610cb4bf160cb, 0xcedf722a585139bb},
+    {0xa54394fe1eedb8fe, 0xc2974eb4ee658829},
+    {0xce947a3da6a9273e, 0x733d226229feea33},
+    {0x811ccc668829b887, 0x0806357d5a3f5260},
+    {0xa163ff802a3426a8, 0xca07c2dcb0cf26f8},
+    {0xc9bcff6034c13052, 0xfc89b393dd02f0b6},
+    {0xfc2c3f3841f17c67, 0xbbac2078d443ace3},
+    {0x9d9ba7832936edc0, 0xd54b944b84aa4c0e},
+    {0xc5029163f384a931, 0x0a9e795e65d4df12},
+    {0xf64335bcf065d37d, 0x4d4617b5ff4a16d6},
+    {0x99ea0196163fa42e, 0x504bced1bf8e4e46},
+    {0xc06481fb9bcf8d39, 0xe45ec2862f71e1d7},
+    {0xf07da27a82c37088, 0x5d767327bb4e5a4d},
+    {0x964e858c91ba2655, 0x3a6a07f8d510f870},
+    {0xbbe226efb628afea, 0x890489f70a55368c},
+    {0xeadab0aba3b2dbe5, 0x2b45ac74ccea842f},
+    {0x92c8ae6b464fc96f, 0x3b0b8bc90012929e},
+    {0xb77ada0617e3bbcb, 0x09ce6ebb40173745},
+    {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+    {0x8f57fa54c2a9eab6, 0x9fa946824a12232e},
+    {0xb32df8e9f3546564, 0x47939822dc96abfa},
+    {0xdff9772470297ebd, 0x59787e2b93bc56f8},
+    {0x8bfbea76c619ef36, 0x57eb4edb3c55b65b},
+    {0xaefae51477a06b03, 0xede622920b6b23f2},
+    {0xdab99e59958885c4, 0xe95fab368e45ecee},
+    {0x88b402f7fd75539b, 0x11dbcb0218ebb415},
+    {0xaae103b5fcd2a881, 0xd652bdc29f26a11a},
+    {0xd59944a37c0752a2, 0x4be76d3346f04960},
+    {0x857fcae62d8493a5, 0x6f70a4400c562ddc},
+    {0xa6dfbd9fb8e5b88e, 0xcb4ccd500f6bb953},
+    {0xd097ad07a71f26b2, 0x7e2000a41346a7a8},
+    {0x825ecc24c873782f, 0x8ed400668c0c28c9},
+    {0xa2f67f2dfa90563b, 0x728900802f0f32fb},
+    {0xcbb41ef979346bca, 0x4f2b40a03ad2ffba},
+    {0xfea126b7d78186bc, 0xe2f610c84987bfa9},
+    {0x9f24b832e6b0f436, 0x0dd9ca7d2df4d7ca},
+    {0xc6ede63fa05d3143, 0x91503d1c79720dbc},
+    {0xf8a95fcf88747d94, 0x75a44c6397ce912b},
+    {0x9b69dbe1b548ce7c, 0xc986afbe3ee11abb},
+    {0xc24452da229b021b, 0xfbe85badce996169},
+    {0xf2d56790ab41c2a2, 0xfae27299423fb9c4},
+    {0x97c560ba6b0919a5, 0xdccd879fc967d41b},
+    {0xbdb6b8e905cb600f, 0x5400e987bbc1c921},
+    {0xed246723473e3813, 0x290123e9aab23b69},
+    {0x9436c0760c86e30b, 0xf9a0b6720aaf6522},
+    {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+    {0xe7958cb87392c2c2, 0xb60b1d1230b20e05},
+    {0x90bd77f3483bb9b9, 0xb1c6f22b5e6f48c3},
+    {0xb4ecd5f01a4aa828, 0x1e38aeb6360b1af4},
+    {0xe2280b6c20dd5232, 0x25c6da63c38de1b1},
+    {0x8d590723948a535f, 0x579c487e5a38ad0f},
+    {0xb0af48ec79ace837, 0x2d835a9df0c6d852},
+    {0xdcdb1b2798182244, 0xf8e431456cf88e66},
+    {0x8a08f0f8bf0f156b, 0x1b8e9ecb641b5900},
+    {0xac8b2d36eed2dac5, 0xe272467e3d222f40},
+    {0xd7adf884aa879177, 0x5b0ed81dcc6abb10},
+    {0x86ccbb52ea94baea, 0x98e947129fc2b4ea},
+    {0xa87fea27a539e9a5, 0x3f2398d747b36225},
+    {0xd29fe4b18e88640e, 0x8eec7f0d19a03aae},
+    {0x83a3eeeef9153e89, 0x1953cf68300424ad},
+    {0xa48ceaaab75a8e2b, 0x5fa8c3423c052dd8},
+    {0xcdb02555653131b6, 0x3792f412cb06794e},
+    {0x808e17555f3ebf11, 0xe2bbd88bbee40bd1},
+    {0xa0b19d2ab70e6ed6, 0x5b6aceaeae9d0ec5},
+    {0xc8de047564d20a8b, 0xf245825a5a445276},
+    {0xfb158592be068d2e, 0xeed6e2f0f0d56713},
+    {0x9ced737bb6c4183d, 0x55464dd69685606c},
+    {0xc428d05aa4751e4c, 0xaa97e14c3c26b887},
+    {0xf53304714d9265df, 0xd53dd99f4b3066a9},
+    {0x993fe2c6d07b7fab, 0xe546a8038efe402a},
+    {0xbf8fdb78849a5f96, 0xde98520472bdd034},
+    {0xef73d256a5c0f77c, 0x963e66858f6d4441},
+    {0x95a8637627989aad, 0xdde7001379a44aa9},
+    {0xbb127c53b17ec159, 0x5560c018580d5d53},
+    {0xe9d71b689dde71af, 0xaab8f01e6e10b4a7},
+    {0x9226712162ab070d, 0xcab3961304ca70e9},
+    {0xb6b00d69bb55c8d1, 0x3d607b97c5fd0d23},
+    {0xe45c10c42a2b3b05, 0x8cb89a7db77c506b},
+    {0x8eb98a7a9a5b04e3, 0x77f3608e92adb243},
+    {0xb267ed1940f1c61c, 0x55f038b237591ed4},
+    {0xdf01e85f912e37a3, 0x6b6c46dec52f6689},
+    {0x8b61313bbabce2c6, 0x2323ac4b3b3da016},
+    {0xae397d8aa96c1b77, 0xabec975e0a0d081b},
+    {0xd9c7dced53c72255, 0x96e7bd358c904a22},
+    {0x881cea14545c7575, 0x7e50d64177da2e55},
+    {0xaa242499697392d2, 0xdde50bd1d5d0b9ea},
+    {0xd4ad2dbfc3d07787, 0x955e4ec64b44e865},
+    {0x84ec3c97da624ab4, 0xbd5af13bef0b113f},
+    {0xa6274bbdd0fadd61, 0xecb1ad8aeacdd58f},
+    {0xcfb11ead453994ba, 0x67de18eda5814af3},
+    {0x81ceb32c4b43fcf4, 0x80eacf948770ced8},
+    {0xa2425ff75e14fc31, 0xa1258379a94d028e},
+    {0xcad2f7f5359a3b3e, 0x096ee45813a04331},
+    {0xfd87b5f28300ca0d, 0x8bca9d6e188853fd},
+    {0x9e74d1b791e07e48, 0x775ea264cf55347e},
+    {0xc612062576589dda, 0x95364afe032a819e},
+    {0xf79687aed3eec551, 0x3a83ddbd83f52205},
+    {0x9abe14cd44753b52, 0xc4926a9672793543},
+    {0xc16d9a0095928a27, 0x75b7053c0f178294},
+    {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+    {0x971da05074da7bee, 0xd3f6fc16ebca5e04},
+    {0xbce5086492111aea, 0x88f4bb1ca6bcf585},
+    {0xec1e4a7db69561a5, 0x2b31e9e3d06c32e6},
+    {0x9392ee8e921d5d07, 0x3aff322e62439fd0},
+    {0xb877aa3236a4b449, 0x09befeb9fad487c3},
+    {0xe69594bec44de15b, 0x4c2ebe687989a9b4},
+    {0x901d7cf73ab0acd9, 0x0f9d37014bf60a11},
+    {0xb424dc35095cd80f, 0x538484c19ef38c95},
+    {0xe12e13424bb40e13, 0x2865a5f206b06fba},
+    {0x8cbccc096f5088cb, 0xf93f87b7442e45d4},
+    {0xafebff0bcb24aafe, 0xf78f69a51539d749},
+    {0xdbe6fecebdedd5be, 0xb573440e5a884d1c},
+    {0x89705f4136b4a597, 0x31680a88f8953031},
+    {0xabcc77118461cefc, 0xfdc20d2b36ba7c3e},
+    {0xd6bf94d5e57a42bc, 0x3d32907604691b4d},
+    {0x8637bd05af6c69b5, 0xa63f9a49c2c1b110},
+    {0xa7c5ac471b478423, 0x0fcf80dc33721d54},
+    {0xd1b71758e219652b, 0xd3c36113404ea4a9},
+    {0x83126e978d4fdf3b, 0x645a1cac083126ea},
+    {0xa3d70a3d70a3d70a, 0x3d70a3d70a3d70a4},
+    {0xcccccccccccccccc, 0xcccccccccccccccd},
+    {0x8000000000000000, 0x0000000000000000},
+    {0xa000000000000000, 0x0000000000000000},
+    {0xc800000000000000, 0x0000000000000000},
+    {0xfa00000000000000, 0x0000000000000000},
+    {0x9c40000000000000, 0x0000000000000000},
+    {0xc350000000000000, 0x0000000000000000},
+    {0xf424000000000000, 0x0000000000000000},
+    {0x9896800000000000, 0x0000000000000000},
+    {0xbebc200000000000, 0x0000000000000000},
+    {0xee6b280000000000, 0x0000000000000000},
+    {0x9502f90000000000, 0x0000000000000000},
+    {0xba43b74000000000, 0x0000000000000000},
+    {0xe8d4a51000000000, 0x0000000000000000},
+    {0x9184e72a00000000, 0x0000000000000000},
+    {0xb5e620f480000000, 0x0000000000000000},
+    {0xe35fa931a0000000, 0x0000000000000000},
+    {0x8e1bc9bf04000000, 0x0000000000000000},
+    {0xb1a2bc2ec5000000, 0x0000000000000000},
+    {0xde0b6b3a76400000, 0x0000000000000000},
+    {0x8ac7230489e80000, 0x0000000000000000},
+    {0xad78ebc5ac620000, 0x0000000000000000},
+    {0xd8d726b7177a8000, 0x0000000000000000},
+    {0x878678326eac9000, 0x0000000000000000},
+    {0xa968163f0a57b400, 0x0000000000000000},
+    {0xd3c21bcecceda100, 0x0000000000000000},
+    {0x84595161401484a0, 0x0000000000000000},
+    {0xa56fa5b99019a5c8, 0x0000000000000000},
+    {0xcecb8f27f4200f3a, 0x0000000000000000},
+    {0x813f3978f8940984, 0x4000000000000000},
+    {0xa18f07d736b90be5, 0x5000000000000000},
+    {0xc9f2c9cd04674ede, 0xa400000000000000},
+    {0xfc6f7c4045812296, 0x4d00000000000000},
+    {0x9dc5ada82b70b59d, 0xf020000000000000},
+    {0xc5371912364ce305, 0x6c28000000000000},
+    {0xf684df56c3e01bc6, 0xc732000000000000},
+    {0x9a130b963a6c115c, 0x3c7f400000000000},
+    {0xc097ce7bc90715b3, 0x4b9f100000000000},
+    {0xf0bdc21abb48db20, 0x1e86d40000000000},
+    {0x96769950b50d88f4, 0x1314448000000000},
+    {0xbc143fa4e250eb31, 0x17d955a000000000},
+    {0xeb194f8e1ae525fd, 0x5dcfab0800000000},
+    {0x92efd1b8d0cf37be, 0x5aa1cae500000000},
+    {0xb7abc627050305ad, 0xf14a3d9e40000000},
+    {0xe596b7b0c643c719, 0x6d9ccd05d0000000},
+    {0x8f7e32ce7bea5c6f, 0xe4820023a2000000},
+    {0xb35dbf821ae4f38b, 0xdda2802c8a800000},
+    {0xe0352f62a19e306e, 0xd50b2037ad200000},
+    {0x8c213d9da502de45, 0x4526f422cc340000},
+    {0xaf298d050e4395d6, 0x9670b12b7f410000},
+    {0xdaf3f04651d47b4c, 0x3c0cdd765f114000},
+    {0x88d8762bf324cd0f, 0xa5880a69fb6ac800},
+    {0xab0e93b6efee0053, 0x8eea0d047a457a00},
+    {0xd5d238a4abe98068, 0x72a4904598d6d880},
+    {0x85a36366eb71f041, 0x47a6da2b7f864750},
+    {0xa70c3c40a64e6c51, 0x999090b65f67d924},
+    {0xd0cf4b50cfe20765, 0xfff4b4e3f741cf6d},
+    {0x82818f1281ed449f, 0xbff8f10e7a8921a4},
+    {0xa321f2d7226895c7, 0xaff72d52192b6a0d},
+    {0xcbea6f8ceb02bb39, 0x9bf4f8a69f764490},
+    {0xfee50b7025c36a08, 0x02f236d04753d5b4},
+    {0x9f4f2726179a2245, 0x01d762422c946590},
+    {0xc722f0ef9d80aad6, 0x424d3ad2b7b97ef5},
+    {0xf8ebad2b84e0d58b, 0xd2e0898765a7deb2},
+    {0x9b934c3b330c8577, 0x63cc55f49f88eb2f},
+    {0xc2781f49ffcfa6d5, 0x3cbf6b71c76b25fb},
+    {0xf316271c7fc3908a, 0x8bef464e3945ef7a},
+    {0x97edd871cfda3a56, 0x97758bf0e3cbb5ac},
+    {0xbde94e8e43d0c8ec, 0x3d52eeed1cbea317},
+    {0xed63a231d4c4fb27, 0x4ca7aaa863ee4bdd},
+    {0x945e455f24fb1cf8, 0x8fe8caa93e74ef6a},
+    {0xb975d6b6ee39e436, 0xb3e2fd538e122b44},
+    {0xe7d34c64a9c85d44, 0x60dbbca87196b616},
+    {0x90e40fbeea1d3a4a, 0xbc8955e946fe31cd},
+    {0xb51d13aea4a488dd, 0x6babab6398bdbe41},
+    {0xe264589a4dcdab14, 0xc696963c7eed2dd1},
+    {0x8d7eb76070a08aec, 0xfc1e1de5cf543ca2},
+    {0xb0de65388cc8ada8, 0x3b25a55f43294bcb},
+    {0xdd15fe86affad912, 0x49ef0eb713f39ebe},
+    {0x8a2dbf142dfcc7ab, 0x6e3569326c784337},
+    {0xacb92ed9397bf996, 0x49c2c37f07965404},
+    {0xd7e77a8f87daf7fb, 0xdc33745ec97be906},
+    {0x86f0ac99b4e8dafd, 0x69a028bb3ded71a3},
+    {0xa8acd7c0222311bc, 0xc40832ea0d68ce0c},
+    {0xd2d80db02aabd62b, 0xf50a3fa490c30190},
+    {0x83c7088e1aab65db, 0x792667c6da79e0fa},
+    {0xa4b8cab1a1563f52, 0x577001b891185938},
+    {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
+    {0x80b05e5ac60b6178, 0x544f8158315b05b4},
+    {0xa0dc75f1778e39d6, 0x696361ae3db1c721},
+    {0xc913936dd571c84c, 0x03bc3a19cd1e38e9},
+    {0xfb5878494ace3a5f, 0x04ab48a04065c723},
+    {0x9d174b2dcec0e47b, 0x62eb0d64283f9c76},
+    {0xc45d1df942711d9a, 0x3ba5d0bd324f8394},
+    {0xf5746577930d6500, 0xca8f44ec7ee36479},
+    {0x9968bf6abbe85f20, 0x7e998b13cf4e1ecb},
+    {0xbfc2ef456ae276e8, 0x9e3fedd8c321a67e},
+    {0xefb3ab16c59b14a2, 0xc5cfe94ef3ea101e},
+    {0x95d04aee3b80ece5, 0xbba1f1d158724a12},
+    {0xbb445da9ca61281f, 0x2a8a6e45ae8edc97},
+    {0xea1575143cf97226, 0xf52d09d71a3293bd},
+    {0x924d692ca61be758, 0x593c2626705f9c56},
+    {0xb6e0c377cfa2e12e, 0x6f8b2fb00c77836c},
+    {0xe498f455c38b997a, 0x0b6dfb9c0f956447},
+    {0x8edf98b59a373fec, 0x4724bd4189bd5eac},
+    {0xb2977ee300c50fe7, 0x58edec91ec2cb657},
+    {0xdf3d5e9bc0f653e1, 0x2f2967b66737e3ed},
+    {0x8b865b215899f46c, 0xbd79e0d20082ee74},
+    {0xae67f1e9aec07187, 0xecd8590680a3aa11},
+    {0xda01ee641a708de9, 0xe80e6f4820cc9495},
+    {0x884134fe908658b2, 0x3109058d147fdcdd},
+    {0xaa51823e34a7eede, 0xbd4b46f0599fd415},
+    {0xd4e5e2cdc1d1ea96, 0x6c9e18ac7007c91a},
+    {0x850fadc09923329e, 0x03e2cf6bc604ddb0},
+    {0xa6539930bf6bff45, 0x84db8346b786151c},
+    {0xcfe87f7cef46ff16, 0xe612641865679a63},
+    {0x81f14fae158c5f6e, 0x4fcb7e8f3f60c07e},
+    {0xa26da3999aef7749, 0xe3be5e330f38f09d},
+    {0xcb090c8001ab551c, 0x5cadf5bfd3072cc5},
+    {0xfdcb4fa002162a63, 0x73d9732fc7c8f7f6},
+    {0x9e9f11c4014dda7e, 0x2867e7fddcdd9afa},
+    {0xc646d63501a1511d, 0xb281e1fd541501b8},
+    {0xf7d88bc24209a565, 0x1f225a7ca91a4226},
+    {0x9ae757596946075f, 0x3375788de9b06958},
+    {0xc1a12d2fc3978937, 0x0052d6b1641c83ae},
+    {0xf209787bb47d6b84, 0xc0678c5dbd23a49a},
+    {0x9745eb4d50ce6332, 0xf840b7ba963646e0},
+    {0xbd176620a501fbff, 0xb650e5a93bc3d898},
+    {0xec5d3fa8ce427aff, 0xa3e51f138ab4cebe},
+    {0x93ba47c980e98cdf, 0xc66f336c36b10137},
+    {0xb8a8d9bbe123f017, 0xb80b0047445d4184},
+    {0xe6d3102ad96cec1d, 0xa60dc059157491e5},
+    {0x9043ea1ac7e41392, 0x87c89837ad68db2f},
+    {0xb454e4a179dd1877, 0x29babe4598c311fb},
+    {0xe16a1dc9d8545e94, 0xf4296dd6fef3d67a},
+    {0x8ce2529e2734bb1d, 0x1899e4a65f58660c},
+    {0xb01ae745b101e9e4, 0x5ec05dcff72e7f8f},
+    {0xdc21a1171d42645d, 0x76707543f4fa1f73},
+    {0x899504ae72497eba, 0x6a06494a791c53a8},
+    {0xabfa45da0edbde69, 0x0487db9d17636892},
+    {0xd6f8d7509292d603, 0x45a9d2845d3c42b6},
+    {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
+    {0xa7f26836f282b732, 0x8e6cac7768d7141e},
+    {0xd1ef0244af2364ff, 0x3207d795430cd926},
+    {0x8335616aed761f1f, 0x7f44e6bd49e807b8},
+    {0xa402b9c5a8d3a6e7, 0x5f16206c9c6209a6},
+    {0xcd036837130890a1, 0x36dba887c37a8c0f},
+    {0x802221226be55a64, 0xc2494954da2c9789},
+    {0xa02aa96b06deb0fd, 0xf2db9baa10b7bd6c},
+    {0xc83553c5c8965d3d, 0x6f92829494e5acc7},
+    {0xfa42a8b73abbf48c, 0xcb772339ba1f17f9},
+    {0x9c69a97284b578d7, 0xff2a760414536efb},
+    {0xc38413cf25e2d70d, 0xfef5138519684aba},
+    {0xf46518c2ef5b8cd1, 0x7eb258665fc25d69},
+    {0x98bf2f79d5993802, 0xef2f773ffbd97a61},
+    {0xbeeefb584aff8603, 0xaafb550ffacfd8fa},
+    {0xeeaaba2e5dbf6784, 0x95ba2a53f983cf38},
+    {0x952ab45cfa97a0b2, 0xdd945a747bf26183},
+    {0xba756174393d88df, 0x94f971119aeef9e4},
+    {0xe912b9d1478ceb17, 0x7a37cd5601aab85d},
+    {0x91abb422ccb812ee, 0xac62e055c10ab33a},
+    {0xb616a12b7fe617aa, 0x577b986b314d6009},
+    {0xe39c49765fdf9d94, 0xed5a7e85fda0b80b},
+    {0x8e41ade9fbebc27d, 0x14588f13be847307},
+    {0xb1d219647ae6b31c, 0x596eb2d8ae258fc8},
+    {0xde469fbd99a05fe3, 0x6fca5f8ed9aef3bb},
+    {0x8aec23d680043bee, 0x25de7bb9480d5854},
+    {0xada72ccc20054ae9, 0xaf561aa79a10ae6a},
+    {0xd910f7ff28069da4, 0x1b2ba1518094da04},
+    {0x87aa9aff79042286, 0x90fb44d2f05d0842},
+    {0xa99541bf57452b28, 0x353a1607ac744a53},
+    {0xd3fa922f2d1675f2, 0x42889b8997915ce8},
+    {0x847c9b5d7c2e09b7, 0x69956135febada11},
+    {0xa59bc234db398c25, 0x43fab9837e699095},
+    {0xcf02b2c21207ef2e, 0x94f967e45e03f4bb},
+    {0x8161afb94b44f57d, 0x1d1be0eebac278f5},
+    {0xa1ba1ba79e1632dc, 0x6462d92a69731732},
+    {0xca28a291859bbf93, 0x7d7b8f7503cfdcfe},
+    {0xfcb2cb35e702af78, 0x5cda735244c3d43e},
+    {0x9defbf01b061adab, 0x3a0888136afa64a7},
+    {0xc56baec21c7a1916, 0x088aaa1845b8fdd0},
+    {0xf6c69a72a3989f5b, 0x8aad549e57273d45},
+    {0x9a3c2087a63f6399, 0x36ac54e2f678864b},
+    {0xc0cb28a98fcf3c7f, 0x84576a1bb416a7dd},
+    {0xf0fdf2d3f3c30b9f, 0x656d44a2a11c51d5},
+    {0x969eb7c47859e743, 0x9f644ae5a4b1b325},
+    {0xbc4665b596706114, 0x873d5d9f0dde1fee},
+    {0xeb57ff22fc0c7959, 0xa90cb506d155a7ea},
+    {0x9316ff75dd87cbd8, 0x09a7f12442d588f2},
+    {0xb7dcbf5354e9bece, 0x0c11ed6d538aeb2f},
+    {0xe5d3ef282a242e81, 0x8f1668c8a86da5fa},
+    {0x8fa475791a569d10, 0xf96e017d694487bc},
+    {0xb38d92d760ec4455, 0x37c981dcc395a9ac},
+    {0xe070f78d3927556a, 0x85bbe253f47b1417},
+    {0x8c469ab843b89562, 0x93956d7478ccec8e},
+    {0xaf58416654a6babb, 0x387ac8d1970027b2},
+    {0xdb2e51bfe9d0696a, 0x06997b05fcc0319e},
+    {0x88fcf317f22241e2, 0x441fece3bdf81f03},
+    {0xab3c2fddeeaad25a, 0xd527e81cad7626c3},
+    {0xd60b3bd56a5586f1, 0x8a71e223d8d3b074},
+    {0x85c7056562757456, 0xf6872d5667844e49},
+    {0xa738c6bebb12d16c, 0xb428f8ac016561db},
+    {0xd106f86e69d785c7, 0xe13336d701beba52},
+    {0x82a45b450226b39c, 0xecc0024661173473},
+    {0xa34d721642b06084, 0x27f002d7f95d0190},
+    {0xcc20ce9bd35c78a5, 0x31ec038df7b441f4},
+    {0xff290242c83396ce, 0x7e67047175a15271},
+    {0x9f79a169bd203e41, 0x0f0062c6e984d386},
+    {0xc75809c42c684dd1, 0x52c07b78a3e60868},
+    {0xf92e0c3537826145, 0xa7709a56ccdf8a82},
+    {0x9bbcc7a142b17ccb, 0x88a66076400bb691},
+    {0xc2abf989935ddbfe, 0x6acff893d00ea435},
+    {0xf356f7ebf83552fe, 0x0583f6b8c4124d43},
+    {0x98165af37b2153de, 0xc3727a337a8b704a},
+    {0xbe1bf1b059e9a8d6, 0x744f18c0592e4c5c},
+    {0xeda2ee1c7064130c, 0x1162def06f79df73},
+    {0x9485d4d1c63e8be7, 0x8addcb5645ac2ba8},
+    {0xb9a74a0637ce2ee1, 0x6d953e2bd7173692},
+    {0xe8111c87c5c1ba99, 0xc8fa8db6ccdd0437},
+    {0x910ab1d4db9914a0, 0x1d9c9892400a22a2},
+    {0xb54d5e4a127f59c8, 0x2503beb6d00cab4b},
+    {0xe2a0b5dc971f303a, 0x2e44ae64840fd61d},
+    {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
+    {0xb10d8e1456105dad, 0x7425a83e872c5f47},
+    {0xdd50f1996b947518, 0xd12f124e28f77719},
+    {0x8a5296ffe33cc92f, 0x82bd6b70d99aaa6f},
+    {0xace73cbfdc0bfb7b, 0x636cc64d1001550b},
+    {0xd8210befd30efa5a, 0x3c47f7e05401aa4e},
+    {0x8714a775e3e95c78, 0x65acfaec34810a71},
+    {0xa8d9d1535ce3b396, 0x7f1839a741a14d0d},
+    {0xd31045a8341ca07c, 0x1ede48111209a050},
+    {0x83ea2b892091e44d, 0x934aed0aab460432},
+    {0xa4e4b66b68b65d60, 0xf81da84d5617853f},
+    {0xce1de40642e3f4b9, 0x36251260ab9d668e},
+    {0x80d2ae83e9ce78f3, 0xc1d72b7c6b426019},
+    {0xa1075a24e4421730, 0xb24cf65b8612f81f},
+    {0xc94930ae1d529cfc, 0xdee033f26797b627},
+    {0xfb9b7cd9a4a7443c, 0x169840ef017da3b1},
+    {0x9d412e0806e88aa5, 0x8e1f289560ee864e},
+    {0xc491798a08a2ad4e, 0xf1a6f2bab92a27e2},
+    {0xf5b5d7ec8acb58a2, 0xae10af696774b1db},
+    {0x9991a6f3d6bf1765, 0xacca6da1e0a8ef29},
+    {0xbff610b0cc6edd3f, 0x17fd090a58d32af3},
+    {0xeff394dcff8a948e, 0xddfc4b4cef07f5b0},
+    {0x95f83d0a1fb69cd9, 0x4abdaf101564f98e},
+    {0xbb764c4ca7a4440f, 0x9d6d1ad41abe37f1},
+    {0xea53df5fd18d5513, 0x84c86189216dc5ed},
+    {0x92746b9be2f8552c, 0x32fd3cf5b4e49bb4},
+    {0xb7118682dbb66a77, 0x3fbc8c33221dc2a1},
+    {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
+    {0x8f05b1163ba6832d, 0x29cb4d87f2a7400e},
+    {0xb2c71d5bca9023f8, 0x743e20e9ef511012},
+    {0xdf78e4b2bd342cf6, 0x914da9246b255416},
+    {0x8bab8eefb6409c1a, 0x1ad089b6c2f7548e},
+    {0xae9672aba3d0c320, 0xa184ac2473b529b1},
+    {0xda3c0f568cc4f3e8, 0xc9e5d72d90a2741e},
+    {0x8865899617fb1871, 0x7e2fa67c7a658892},
+    {0xaa7eebfb9df9de8d, 0xddbb901b98feeab7},
+    {0xd51ea6fa85785631, 0x552a74227f3ea565},
+    {0x8533285c936b35de, 0xd53a88958f87275f},
+    {0xa67ff273b8460356, 0x8a892abaf368f137},
+    {0xd01fef10a657842c, 0x2d2b7569b0432d85},
+    {0x8213f56a67f6b29b, 0x9c3b29620e29fc73},
+    {0xa298f2c501f45f42, 0x8349f3ba91b47b8f},
+    {0xcb3f2f7642717713, 0x241c70a936219a73},
+    {0xfe0efb53d30dd4d7, 0xed238cd383aa0110},
+    {0x9ec95d1463e8a506, 0xf4363804324a40aa},
+    {0xc67bb4597ce2ce48, 0xb143c6053edcd0d5},
+    {0xf81aa16fdc1b81da, 0xdd94b7868e94050a},
+    {0x9b10a4e5e9913128, 0xca7cf2b4191c8326},
+    {0xc1d4ce1f63f57d72, 0xfd1c2f611f63a3f0},
+    {0xf24a01a73cf2dccf, 0xbc633b39673c8cec},
+    {0x976e41088617ca01, 0xd5be0503e085d813},
+    {0xbd49d14aa79dbc82, 0x4b2d8644d8a74e18},
+    {0xec9c459d51852ba2, 0xddf8e7d60ed1219e},
+    {0x93e1ab8252f33b45, 0xcabb90e5c942b503},
+    {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
+    {0xe7109bfba19c0c9d, 0x0cc512670a783ad4},
+    {0x906a617d450187e2, 0x27fb2b80668b24c5},
+    {0xb484f9dc9641e9da, 0xb1f9f660802dedf6},
+    {0xe1a63853bbd26451, 0x5e7873f8a0396973},
+    {0x8d07e33455637eb2, 0xdb0b487b6423e1e8},
+    {0xb049dc016abc5e5f, 0x91ce1a9a3d2cda62},
+    {0xdc5c5301c56b75f7, 0x7641a140cc7810fb},
+    {0x89b9b3e11b6329ba, 0xa9e904c87fcb0a9d},
+    {0xac2820d9623bf429, 0x546345fa9fbdcd44},
+    {0xd732290fbacaf133, 0xa97c177947ad4095},
+    {0x867f59a9d4bed6c0, 0x49ed8eabcccc485d},
+    {0xa81f301449ee8c70, 0x5c68f256bfff5a74},
+    {0xd226fc195c6a2f8c, 0x73832eec6fff3111},
+    {0x83585d8fd9c25db7, 0xc831fd53c5ff7eab},
+    {0xa42e74f3d032f525, 0xba3e7ca8b77f5e55},
+    {0xcd3a1230c43fb26f, 0x28ce1bd2e55f35eb},
+    {0x80444b5e7aa7cf85, 0x7980d163cf5b81b3},
+    {0xa0555e361951c366, 0xd7e105bcc332621f},
+    {0xc86ab5c39fa63440, 0x8dd9472bf3fefaa7},
+    {0xfa856334878fc150, 0xb14f98f6f0feb951},
+    {0x9c935e00d4b9d8d2, 0x6ed1bf9a569f33d3},
+    {0xc3b8358109e84f07, 0x0a862f80ec4700c8},
+    {0xf4a642e14c6262c8, 0xcd27bb612758c0fa},
+    {0x98e7e9cccfbd7dbd, 0x8038d51cb897789c},
+    {0xbf21e44003acdd2c, 0xe0470a63e6bd56c3},
+    {0xeeea5d5004981478, 0x1858ccfce06cac74},
+    {0x95527a5202df0ccb, 0x0f37801e0c43ebc8},
+    {0xbaa718e68396cffd, 0xd30560258f54e6ba},
+    {0xe950df20247c83fd, 0x47c6b82ef32a2069},
+    {0x91d28b7416cdd27e, 0x4cdc331d57fa5441},
+    {0xb6472e511c81471d, 0xe0133fe4adf8e952},
+    {0xe3d8f9e563a198e5, 0x58180fddd97723a6},
+    {0x8e679c2f5e44ff8f, 0x570f09eaa7ea7648},
+    {0xb201833b35d63f73, 0x2cd2cc6551e513da},
+    {0xde81e40a034bcf4f, 0xf8077f7ea65e58d1},
+    {0x8b112e86420f6191, 0xfb04afaf27faf782},
+    {0xadd57a27d29339f6, 0x79c5db9af1f9b563},
+    {0xd94ad8b1c7380874, 0x18375281ae7822bc},
+    {0x87cec76f1c830548, 0x8f2293910d0b15b5},
+    {0xa9c2794ae3a3c69a, 0xb2eb3875504ddb22},
+    {0xd433179d9c8cb841, 0x5fa60692a46151eb},
+    {0x849feec281d7f328, 0xdbc7c41ba6bcd333},
+    {0xa5c7ea73224deff3, 0x12b9b522906c0800},
+    {0xcf39e50feae16bef, 0xd768226b34870a00},
+    {0x81842f29f2cce375, 0xe6a1158300d46640},
+    {0xa1e53af46f801c53, 0x60495ae3c1097fd0},
+    {0xca5e89b18b602368, 0x385bb19cb14bdfc4},
+    {0xfcf62c1dee382c42, 0x46729e03dd9ed7b5},
+    {0x9e19db92b4e31ba9, 0x6c07a2c26a8346d1},
+    {0xc5a05277621be293, 0xc7098b7305241885},
+    {0xf70867153aa2db38, 0xb8cbee4fc66d1ea7}
+#else
+    {0xff77b1fcbebcdc4f, 0x25e8e89c13bb0f7b},
+    {0xce5d73ff402d98e3, 0xfb0a3d212dc81290},
+    {0xa6b34ad8c9dfc06f, 0xf42faa48c0ea481f},
+    {0x86a8d39ef77164bc, 0xae5dff9c02033198},
+    {0xd98ddaee19068c76, 0x3badd624dd9b0958},
+    {0xafbd2350644eeacf, 0xe5d1929ef90898fb},
+    {0x8df5efabc5979c8f, 0xca8d3ffa1ef463c2},
+    {0xe55990879ddcaabd, 0xcc420a6a101d0516},
+    {0xb94470938fa89bce, 0xf808e40e8d5b3e6a},
+    {0x95a8637627989aad, 0xdde7001379a44aa9},
+    {0xf1c90080baf72cb1, 0x5324c68b12dd6339},
+    {0xc350000000000000, 0x0000000000000000},
+    {0x9dc5ada82b70b59d, 0xf020000000000000},
+    {0xfee50b7025c36a08, 0x02f236d04753d5b4},
+    {0xcde6fd5e09abcf26, 0xed4c0226b55e6f86},
+    {0xa6539930bf6bff45, 0x84db8346b786151c},
+    {0x865b86925b9bc5c2, 0x0b8a2392ba45a9b2},
+    {0xd910f7ff28069da4, 0x1b2ba1518094da04},
+    {0xaf58416654a6babb, 0x387ac8d1970027b2},
+    {0x8da471a9de737e24, 0x5ceaecfed289e5d2},
+    {0xe4d5e82392a40515, 0x0fabaf3feaa5334a},
+    {0xb8da1662e7b00a17, 0x3d6a751f3b936243},
+    {0x95527a5202df0ccb, 0x0f37801e0c43ebc8}
+#endif
+};
+
+#if !FMT_USE_FULL_CACHE_DRAGONBOX
+template <typename T>  // compiled only when FMT_USE_FULL_CACHE_DRAGONBOX is false
+const uint64_t basic_data<T>::powers_of_5_64[] = {  // [i] == 5^i, i = 0..26
+    0x0000000000000001, 0x0000000000000005, 0x0000000000000019,
+    0x000000000000007d, 0x0000000000000271, 0x0000000000000c35,
+    0x0000000000003d09, 0x000000000001312d, 0x000000000005f5e1,
+    0x00000000001dcd65, 0x00000000009502f9, 0x0000000002e90edd,
+    0x000000000e8d4a51, 0x0000000048c27395, 0x000000016bcc41e9,
+    0x000000071afd498d, 0x0000002386f26fc1, 0x000000b1a2bc2ec5,
+    0x000003782dace9d9, 0x00001158e460913d, 0x000056bc75e2d631,
+    0x0001b1ae4d6e2ef5, 0x000878678326eac9, 0x002a5a058fc295ed,
+    0x00d3c21bcecceda1, 0x0422ca8b0a00a425, 0x14adf4b7320334b9};
+
+template <typename T>  // compiled only when FMT_USE_FULL_CACHE_DRAGONBOX is false
+const uint32_t basic_data<T>::dragonbox_pow10_recovery_errors[] = {
+    0x50001400, 0x54044100, 0x54014555, 0x55954415, 0x54115555, 0x00000001,
+    0x50000000, 0x00104000, 0x54010004, 0x05004001, 0x55555544, 0x41545555,
+    0x54040551, 0x15445545, 0x51555514, 0x10000015, 0x00101100, 0x01100015,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x04450514, 0x45414110,
+    0x55555145, 0x50544050, 0x15040155, 0x11054140, 0x50111514, 0x11451454,
+    0x00400541, 0x00000000, 0x55555450, 0x10056551, 0x10054011, 0x55551014,
+    0x69514555, 0x05151109, 0x00155555};  // 39 words; NOTE(review): presumably packed per-entry corrections for the reduced pow10 cache - confirm at the use site
+#endif
+
 template <typename T>
 const char basic_data<T>::foreground_color[] = "\x1b[38;2;";
 template <typename T>
@@ -366,6 +1088,10 @@ class fp {
  private:
   using significand_type = uint64_t;
 
+  template <typename Float>  // true iff Float is as wide as uint64_t or uint32_t
+  using is_supported_float = bool_constant<sizeof(Float) == sizeof(uint64_t) ||
+                                           sizeof(Float) == sizeof(uint32_t)>;
+
  public:
   significand_type f;
   int e;
@@ -388,63 +1114,38 @@ class fp {
   template <typename Double> explicit fp(Double d) { assign(d); }
 
   // Assigns d to this and return true iff predecessor is closer than successor.
-  template <typename Double, FMT_ENABLE_IF(sizeof(Double) == sizeof(uint64_t))>
-  bool assign(Double d) {
-    // Assume double is in the format [sign][exponent][significand].
-    using limits = std::numeric_limits<Double>;
+  template <typename Float, FMT_ENABLE_IF(is_supported_float<Float>::value)>
+  bool assign(Float d) {
+    // Assume Float (32- or 64-bit) is laid out as [sign][exponent][significand].
+    using limits = std::numeric_limits<Float>;
+    const int float_significand_size = limits::digits - 1;  // stored bits, without the implicit one
     const int exponent_size =
-        bits<Double>::value - double_significand_size - 1;  // -1 for sign
-    const uint64_t significand_mask = implicit_bit - 1;
+        bits<Float>::value - float_significand_size - 1;  // -1 for sign
+    const uint64_t float_implicit_bit = 1ULL << float_significand_size;
+    const uint64_t significand_mask = float_implicit_bit - 1;
     const uint64_t exponent_mask = (~0ULL >> 1) & ~significand_mask;
     const int exponent_bias = (1 << exponent_size) - limits::max_exponent - 1;
-    auto u = bit_cast<uint64_t>(d);
+    constexpr bool is_double = sizeof(Float) == sizeof(uint64_t);
+    auto u = bit_cast<conditional_t<is_double, uint64_t, uint32_t>>(d);
     f = u & significand_mask;
     int biased_e =
-        static_cast<int>((u & exponent_mask) >> double_significand_size);
+        static_cast<int>((u & exponent_mask) >> float_significand_size);  // NOTE(review): for a 32-bit Float exponent_mask also covers bit 31 (sign) - presumably d >= 0 here; confirm at call sites
     // Predecessor is closer if d is a normalized power of 2 (f == 0) other than
     // the smallest normalized number (biased_e > 1).
     bool is_predecessor_closer = f == 0 && biased_e > 1;
     if (biased_e != 0)
-      f += implicit_bit;
+      f += float_implicit_bit;  // normal number: restore the implicit leading 1
     else
       biased_e = 1;  // Subnormals use biased exponent 1 (min exponent).
-    e = biased_e - exponent_bias - double_significand_size;
+    e = biased_e - exponent_bias - float_significand_size;  // f is used as an integer significand
     return is_predecessor_closer;
   }
 
-  template <typename Double, FMT_ENABLE_IF(sizeof(Double) != sizeof(uint64_t))>
-  bool assign(Double) {
+  template <typename Float, FMT_ENABLE_IF(!is_supported_float<Float>::value)>
+  bool assign(Float) {  // unsupported size: ignore the value, reset to fp(), return false
     *this = fp();
     return false;
   }
-
-  // Assigns d to this together with computing lower and upper boundaries,
-  // where a boundary is a value half way between the number and its predecessor
-  // (lower) or successor (upper). The upper boundary is normalized and lower
-  // has the same exponent but may be not normalized.
-  template <typename Double> boundaries assign_with_boundaries(Double d) {
-    bool is_lower_closer = assign(d);
-    fp lower =
-        is_lower_closer ? fp((f << 2) - 1, e - 2) : fp((f << 1) - 1, e - 1);
-    // 1 in normalize accounts for the exponent shift above.
-    fp upper = normalize<1>(fp((f << 1) + 1, e - 1));
-    lower.f <<= lower.e - upper.e;
-    return boundaries{lower.f, upper.f};
-  }
-
-  template <typename Double> boundaries assign_float_with_boundaries(Double d) {
-    assign(d);
-    constexpr int min_normal_e = std::numeric_limits<float>::min_exponent -
-                                 std::numeric_limits<double>::digits;
-    significand_type half_ulp = 1 << (std::numeric_limits<double>::digits -
-                                      std::numeric_limits<float>::digits - 1);
-    if (min_normal_e > e) half_ulp <<= min_normal_e - e;
-    fp upper = normalize<0>(fp(f + half_ulp, e));
-    fp lower = fp(
-        f - (half_ulp >> ((f == implicit_bit && e > min_normal_e) ? 1 : 0)), e);
-    lower.f <<= lower.e - upper.e;
-    return boundaries{lower.f, upper.f};
-  }
 };
 
 // Normalizes the value converted from double and multiplied by (1 << SHIFT).
@@ -488,11 +1189,12 @@ inline fp operator*(fp x, fp y) { return {multiply(x.f, y.f), x.e + y.e + 64}; }
 // Returns a cached power of 10 `c_k = c_k.f * pow(2, c_k.e)` such that its
 // (binary) exponent satisfies `min_exponent <= c_k.e <= min_exponent + 28`.
 inline fp get_cached_power(int min_exponent, int& pow10_exponent) {
-  const int64_t one_over_log2_10 = 0x4d104d42;  // round(pow(2, 32) / log2(10))
+  const int shift = 32;  // fraction bits of the fixed-point multiplier below
+  const auto significand = static_cast<int64_t>(data::log10_2_significand);  // NOTE(review): presumably 64-bit, top 32 bits matching the old 0x4d104d42 - confirm
   int index = static_cast<int>(
-      ((min_exponent + fp::significand_size - 1) * one_over_log2_10 +
-       ((int64_t(1) << 32) - 1))  // ceil
-      >> 32                       // arithmetic shift
+      ((min_exponent + fp::significand_size - 1) * (significand >> shift) +
+       ((int64_t(1) << shift) - 1))  // ceil
+      >> 32                          // arithmetic shift; the literal equals `shift`
   );
   // Decimal exponent of the first (smallest) cached power of 10.
   const int first_dec_exp = -348;
@@ -500,7 +1202,8 @@ inline fp get_cached_power(int min_exponent, int& pow10_exponent) {
   const int dec_exp_step = 8;
   index = (index - first_dec_exp - 1) / dec_exp_step + 1;
   pow10_exponent = first_dec_exp + index * dec_exp_step;
-  return {data::pow10_significands[index], data::pow10_exponents[index]};
+  return {data::grisu_pow10_significands[index],
+          data::grisu_pow10_exponents[index]};
 }
 
 // A simple accumulator to hold the sums of terms in bigint::square if uint128_t
@@ -559,9 +1262,8 @@ class bigint {
     FMT_ASSERT(compare(*this, other) >= 0, "");
     bigit borrow = 0;
     int i = other.exp_ - exp_;
-    for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j) {
+    for (size_t j = 0, n = other.bigits_.size(); j != n; ++i, ++j)
       subtract_bigits(i, other.bigits_[j], borrow);
-    }
     while (borrow > 0) subtract_bigits(i, 0, borrow);
     remove_leading_zeros();
   }
@@ -733,22 +1435,26 @@ class bigint {
     exp_ *= 2;
   }
 
+  // If this bigint has a bigger exponent than other, adds trailing zeros to
+  // make exponents equal. This simplifies some operations such as subtraction.
+  void align(const bigint& other) {
+    int exp_difference = exp_ - other.exp_;
+    if (exp_difference <= 0) return;
+    int num_bigits = static_cast<int>(bigits_.size());
+    bigits_.resize(to_unsigned(num_bigits + exp_difference));
+    for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
+      bigits_[j] = bigits_[i];  // Move bigits up, highest index first.
+    std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
+    exp_ -= exp_difference;  // exp_ now equals other.exp_.
+  }
+
   // Divides this bignum by divisor, assigning the remainder to this and
   // returning the quotient.
   int divmod_assign(const bigint& divisor) {
     FMT_ASSERT(this != &divisor, "");
     if (compare(*this, divisor) < 0) return 0;
-    int num_bigits = static_cast<int>(bigits_.size());
     FMT_ASSERT(divisor.bigits_[divisor.bigits_.size() - 1u] != 0, "");
-    int exp_difference = exp_ - divisor.exp_;
-    if (exp_difference > 0) {
-      // Align bigints by adding trailing zeros to simplify subtraction.
-      bigits_.resize(to_unsigned(num_bigits + exp_difference));
-      for (int i = num_bigits - 1, j = i + exp_difference; i >= 0; --i, --j)
-        bigits_[j] = bigits_[i];
-      std::uninitialized_fill_n(bigits_.data(), exp_difference, 0);
-      exp_ -= exp_difference;
-    }
+    align(divisor);
     int quotient = 0;
     do {
       subtract_aligned(divisor);
@@ -788,20 +1494,6 @@ enum result {
 };
 }
 
-// A version of count_digits optimized for grisu_gen_digits.
-inline int grisu_count_digits(uint32_t n) {
-  if (n < 10) return 1;
-  if (n < 100) return 2;
-  if (n < 1000) return 3;
-  if (n < 10000) return 4;
-  if (n < 100000) return 5;
-  if (n < 1000000) return 6;
-  if (n < 10000000) return 7;
-  if (n < 100000000) return 8;
-  if (n < 1000000000) return 9;
-  return 10;
-}
-
 // Generates output using the Grisu digit-gen algorithm.
 // error: the size of the region (lower, upper) outside of which numbers
 // definitely do not round to value (Delta in Grisu3).
@@ -817,7 +1509,7 @@ FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
   FMT_ASSERT(integral == value.f >> -one.e, "");
   // The fractional part of scaled value (p2 in Grisu) c = value % one.
   uint64_t fractional = value.f & (one.f - 1);
-  exp = grisu_count_digits(integral);  // kappa in Grisu.
+  exp = count_digits(integral);  // kappa in Grisu.
   // Divide by 10 to prevent overflow.
   auto result = handler.on_start(data::powers_of_10_64[exp - 1] << -one.e,
                                  value.f / 10, error * 10, exp);
@@ -867,8 +1559,7 @@ FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
       FMT_ASSERT(false, "invalid number of digits");
     }
     --exp;
-    uint64_t remainder =
-        (static_cast<uint64_t>(integral) << -one.e) + fractional;
+    auto remainder = (static_cast<uint64_t>(integral) << -one.e) + fractional;
     result = handler.on_digit(static_cast<char>('0' + digit),
                               data::powers_of_10_64[exp] << -one.e, remainder,
                               error, exp, true);
@@ -878,8 +1569,7 @@ FMT_ALWAYS_INLINE digits::result grisu_gen_digits(fp value, uint64_t error,
   for (;;) {
     fractional *= 10;
     error *= 10;
-    char digit =
-        static_cast<char>('0' + static_cast<char>(fractional >> -one.e));
+    char digit = static_cast<char>('0' + (fractional >> -one.e));
     fractional &= one.f - 1;
     --exp;
     result = handler.on_digit(digit, one.f, fractional, error, exp, false);
@@ -916,6 +1606,7 @@ struct fixed_handler {
                           uint64_t error, int, bool integral) {
     FMT_ASSERT(remainder < divisor, "");
     buf[size++] = digit;
+    if (!integral && error >= remainder) return digits::error;
     if (size < precision) return digits::more;
     if (!integral) {
       // Check if error * 2 < divisor with overflow prevention.
@@ -935,59 +1626,684 @@ struct fixed_handler {
     }
     if (buf[0] > '9') {
       buf[0] = '1';
-      buf[size++] = '0';
+      if (fixed)
+        buf[size++] = '0';
+      else
+        ++exp10;
     }
     return digits::done;
   }
 };
 
-// The shortest representation digit handler.
-struct grisu_shortest_handler {
-  char* buf;
-  int size;
-  // Distance between scaled value and upper bound (wp_W in Grisu3).
-  uint64_t diff;
+// Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
+namespace dragonbox {
+// Computes 128-bit result of multiplication of two 64-bit unsigned integers.
+FMT_SAFEBUFFERS inline uint128_wrapper umul128(uint64_t x,
+                                               uint64_t y) FMT_NOEXCEPT {
+#if FMT_USE_INT128
+  return static_cast<uint128_t>(x) * static_cast<uint128_t>(y);
+#elif defined(_MSC_VER) && defined(_M_X64)
+  uint128_wrapper result;
+  result.low_ = _umul128(x, y, &result.high_);
+  return result;
+#else
+  const uint64_t mask = (uint64_t(1) << 32) - uint64_t(1);
 
-  digits::result on_start(uint64_t, uint64_t, uint64_t, int&) {
-    return digits::more;
+  uint64_t a = x >> 32;
+  uint64_t b = x & mask;
+  uint64_t c = y >> 32;
+  uint64_t d = y & mask;
+
+  uint64_t ac = a * c;
+  uint64_t bc = b * c;
+  uint64_t ad = a * d;
+  uint64_t bd = b * d;
+
+  uint64_t intermediate = (bd >> 32) + (ad & mask) + (bc & mask);
+
+  return {ac + (intermediate >> 32) + (ad >> 32) + (bc >> 32),
+          (intermediate << 32) + (bd & mask)};
+#endif
+}
+
+// Computes upper 64 bits of multiplication of two 64-bit unsigned integers.
+FMT_SAFEBUFFERS inline uint64_t umul128_upper64(uint64_t x,
+                                                uint64_t y) FMT_NOEXCEPT {
+#if FMT_USE_INT128
+  auto p = static_cast<uint128_t>(x) * static_cast<uint128_t>(y);
+  return static_cast<uint64_t>(p >> 64);
+#elif defined(_MSC_VER) && defined(_M_X64)
+  return __umulh(x, y);
+#else
+  return umul128(x, y).high();
+#endif
+}
+
+// Computes upper 64 bits of multiplication of a 64-bit unsigned integer and a
+// 128-bit unsigned integer.
+FMT_SAFEBUFFERS inline uint64_t umul192_upper64(uint64_t x, uint128_wrapper y)
+    FMT_NOEXCEPT {
+  uint128_wrapper g0 = umul128(x, y.high());
+  g0 += umul128_upper64(x, y.low());
+  return g0.high();
+}
+
+// Computes upper 32 bits of multiplication of a 32-bit unsigned integer and a
+// 64-bit unsigned integer.
+inline uint32_t umul96_upper32(uint32_t x, uint64_t y) FMT_NOEXCEPT {
+  return static_cast<uint32_t>(umul128_upper64(x, y));
+}
+
+// Computes middle 64 bits of multiplication of a 64-bit unsigned integer and a
+// 128-bit unsigned integer.
+FMT_SAFEBUFFERS inline uint64_t umul192_middle64(uint64_t x, uint128_wrapper y)
+    FMT_NOEXCEPT {
+  uint64_t g01 = x * y.high();
+  uint64_t g10 = umul128_upper64(x, y.low());
+  return g01 + g10;
+}
+
+// Computes lower 64 bits of multiplication of a 32-bit unsigned integer and a
+// 64-bit unsigned integer.
+inline uint64_t umul96_lower64(uint32_t x, uint64_t y) FMT_NOEXCEPT {
+  return x * y;  // x converts to uint64_t; the product wraps modulo 2^64.
+}
+
+// Computes floor(log10(pow(2, e))) for e in [-1700, 1700] using the method from
+// https://fmt.dev/papers/Grisu-Exact.pdf#page=5, section 3.4.
+inline int floor_log10_pow2(int e) FMT_NOEXCEPT {
+  FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
+  const int shift = 22;
+  return (e * static_cast<int>(data::log10_2_significand >> (64 - shift))) >>
+         shift;
+}
+
+// Various fast log computations.
+inline int floor_log2_pow10(int e) FMT_NOEXCEPT {
+  FMT_ASSERT(e <= 1233 && e >= -1233, "too large exponent");
+  const uint64_t log2_10_integer_part = 3;
+  const uint64_t log2_10_fractional_digits = 0x5269e12f346e2bf9;
+  const int shift_amount = 19;
+  return (e * static_cast<int>(
+                  (log2_10_integer_part << shift_amount) |
+                  (log2_10_fractional_digits >> (64 - shift_amount)))) >>
+         shift_amount;
+}
+inline int floor_log10_pow2_minus_log10_4_over_3(int e) FMT_NOEXCEPT {
+  FMT_ASSERT(e <= 1700 && e >= -1700, "too large exponent");
+  const uint64_t log10_4_over_3_fractional_digits = 0x1ffbfc2bbc780375;
+  const int shift_amount = 22;
+  return (e * static_cast<int>(data::log10_2_significand >>
+                               (64 - shift_amount)) -
+          static_cast<int>(log10_4_over_3_fractional_digits >>
+                           (64 - shift_amount))) >>
+         shift_amount;
+}
+
+// Returns true iff x is divisible by pow(2, exp).
+inline bool divisible_by_power_of_2(uint32_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp >= 1, "");
+  FMT_ASSERT(x != 0, "");
+#ifdef FMT_BUILTIN_CTZ
+  return FMT_BUILTIN_CTZ(x) >= exp;
+#else
+  return exp < num_bits<uint32_t>() && x == ((x >> exp) << exp);
+#endif
+}
+inline bool divisible_by_power_of_2(uint64_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp >= 1, "");
+  FMT_ASSERT(x != 0, "");
+#ifdef FMT_BUILTIN_CTZLL
+  return FMT_BUILTIN_CTZLL(x) >= exp;  // At least exp trailing zero bits.
+#else  // No ctz builtin: bound the shift, then check the low exp bits are 0.
+  return exp < num_bits<uint64_t>() && x == ((x >> exp) << exp);
+#endif
+}
+
+// Returns true iff x is divisible by pow(5, exp).
+inline bool divisible_by_power_of_5(uint32_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp <= 10, "too large exponent");
+  return x * data::divtest_table_for_pow5_32[exp].mod_inv <=
+         data::divtest_table_for_pow5_32[exp].max_quotient;
+}
+inline bool divisible_by_power_of_5(uint64_t x, int exp) FMT_NOEXCEPT {
+  FMT_ASSERT(exp <= 23, "too large exponent");
+  return x * data::divtest_table_for_pow5_64[exp].mod_inv <=
+         data::divtest_table_for_pow5_64[exp].max_quotient;
+}
+
+// Replaces n by floor(n / pow(5, N)) returning true if and only if n is
+// divisible by pow(5, N).
+// Precondition: n <= 2 * pow(5, N + 1).
+template <int N>
+bool check_divisibility_and_divide_by_pow5(uint32_t& n) FMT_NOEXCEPT {
+  static constexpr struct {
+    uint32_t magic_number;
+    int bits_for_comparison;
+    uint32_t threshold;
+    int shift_amount;
+  } infos[] = {{0xcccd, 16, 0x3333, 18}, {0xa429, 8, 0x0a, 20}};
+  constexpr auto info = infos[N - 1];
+  n *= info.magic_number;
+  const uint32_t comparison_mask = (1u << info.bits_for_comparison) - 1;
+  bool result = (n & comparison_mask) <= info.threshold;
+  n >>= info.shift_amount;
+  return result;
+}
+
+// Computes floor(n / pow(10, N)) for small n and N.
+// Precondition: n <= pow(10, N + 1).
+template <int N> uint32_t small_division_by_pow10(uint32_t n) FMT_NOEXCEPT {
+  static constexpr struct {
+    uint32_t magic_number;
+    int shift_amount;
+    uint32_t divisor_times_10;
+  } infos[] = {{0xcccd, 19, 100}, {0xa3d8, 22, 1000}};
+  constexpr auto info = infos[N - 1];
+  FMT_ASSERT(n <= info.divisor_times_10, "n is too large");
+  return n * info.magic_number >> info.shift_amount;
+}
+
+// Computes floor(n / 10^(kappa + 1)) (float)
+inline uint32_t divide_by_10_to_kappa_plus_1(uint32_t n) FMT_NOEXCEPT {
+  return n / float_info<float>::big_divisor;
+}
+// Computes floor(n / 10^(kappa + 1)) (double); the constant is ~2^73 / 1000.
+inline uint64_t divide_by_10_to_kappa_plus_1(uint64_t n) FMT_NOEXCEPT {
+  return umul128_upper64(n, 0x83126e978d4fdf3c) >> 9;
+}
+
+// Various subroutines using pow10 cache
+template <class T> struct cache_accessor;
+
+template <> struct cache_accessor<float> {
+  using carrier_uint = float_info<float>::carrier_uint;
+  using cache_entry_type = uint64_t;
+
+  static uint64_t get_cached_power(int k) FMT_NOEXCEPT {
+    FMT_ASSERT(k >= float_info<float>::min_k && k <= float_info<float>::max_k,
+               "k is out of range");
+    return data::dragonbox_pow10_significands_64[k - float_info<float>::min_k];
   }
 
-  // Decrement the generated number approaching value from above.
-  void round(uint64_t d, uint64_t divisor, uint64_t& remainder,
-             uint64_t error) {
-    while (
-        remainder < d && error - remainder >= divisor &&
-        (remainder + divisor < d || d - remainder >= remainder + divisor - d)) {
-      --buf[size - 1];
-      remainder += divisor;
-    }
+  static carrier_uint compute_mul(carrier_uint u,
+                                  const cache_entry_type& cache) FMT_NOEXCEPT {
+    return umul96_upper32(u, cache);
   }
 
-  // Implements Grisu's round_weed.
-  digits::result on_digit(char digit, uint64_t divisor, uint64_t remainder,
-                          uint64_t error, int exp, bool integral) {
-    buf[size++] = digit;
-    if (remainder >= error) return digits::more;
-    uint64_t unit = integral ? 1 : data::powers_of_10_64[-exp];
-    uint64_t up = (diff - 1) * unit;  // wp_Wup
-    round(up, divisor, remainder, error);
-    uint64_t down = (diff + 1) * unit;  // wp_Wdown
-    if (remainder < down && error - remainder >= divisor &&
-        (remainder + divisor < down ||
-         down - remainder > remainder + divisor - down)) {
-      return digits::error;
-    }
-    return 2 * unit <= remainder && remainder <= error - 4 * unit
-               ? digits::done
-               : digits::error;
+  static uint32_t compute_delta(const cache_entry_type& cache,
+                                int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<uint32_t>(cache >> (64 - 1 - beta_minus_1));
+  }
+
+  static bool compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta_minus_1) FMT_NOEXCEPT {
+    FMT_ASSERT(beta_minus_1 >= 1, "");
+    FMT_ASSERT(beta_minus_1 < 64, "");
+
+    return ((umul96_lower64(two_f, cache) >> (64 - beta_minus_1)) & 1) != 0;
+  }
+
+  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<carrier_uint>(
+        (cache - (cache >> (float_info<float>::significand_bits + 2))) >>
+        (64 - float_info<float>::significand_bits - 1 - beta_minus_1));
+  }
+
+  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<carrier_uint>(
+        (cache + (cache >> (float_info<float>::significand_bits + 1))) >>
+        (64 - float_info<float>::significand_bits - 1 - beta_minus_1));
+  }
+
+  static carrier_uint compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return (static_cast<carrier_uint>(
+                cache >>
+                (64 - float_info<float>::significand_bits - 2 - beta_minus_1)) +
+            1) /
+           2;
   }
 };
 
+template <> struct cache_accessor<double> {
+  using carrier_uint = float_info<double>::carrier_uint;
+  using cache_entry_type = uint128_wrapper;
+
+  static uint128_wrapper get_cached_power(int k) FMT_NOEXCEPT {
+    FMT_ASSERT(k >= float_info<double>::min_k && k <= float_info<double>::max_k,
+               "k is out of range");
+
+#if FMT_USE_FULL_CACHE_DRAGONBOX
+    return data::dragonbox_pow10_significands_128[k -
+                                                  float_info<double>::min_k];
+#else
+    static const int compression_ratio = 27;
+
+    // Compute base index.
+    int cache_index = (k - float_info<double>::min_k) / compression_ratio;
+    int kb = cache_index * compression_ratio + float_info<double>::min_k;
+    int offset = k - kb;
+
+    // Get base cache.
+    uint128_wrapper base_cache =
+        data::dragonbox_pow10_significands_128[cache_index];
+    if (offset == 0) return base_cache;
+
+    // Compute the required amount of bit-shift.
+    int alpha = floor_log2_pow10(kb + offset) - floor_log2_pow10(kb) - offset;
+    FMT_ASSERT(alpha > 0 && alpha < 64, "shifting error detected");
+
+    // Try to recover the real cache.
+    uint64_t pow5 = data::powers_of_5_64[offset];
+    uint128_wrapper recovered_cache = umul128(base_cache.high(), pow5);
+    uint128_wrapper middle_low =
+        umul128(base_cache.low() - (kb < 0 ? 1 : 0), pow5);
+
+    recovered_cache += middle_low.high();
+
+    uint64_t high_to_middle = recovered_cache.high() << (64 - alpha);
+    uint64_t middle_to_low = recovered_cache.low() << (64 - alpha);
+
+    recovered_cache =
+        uint128_wrapper{(recovered_cache.low() >> alpha) | high_to_middle,
+                        ((middle_low.low() >> alpha) | middle_to_low)};
+
+    if (kb < 0) recovered_cache += 1;
+
+    // Get error.
+    int error_idx = (k - float_info<double>::min_k) / 16;
+    uint32_t error = (data::dragonbox_pow10_recovery_errors[error_idx] >>
+                      ((k - float_info<double>::min_k) % 16) * 2) &
+                     0x3;
+
+    // Add the error back.
+    FMT_ASSERT(recovered_cache.low() + error >= recovered_cache.low(), "");
+    return {recovered_cache.high(), recovered_cache.low() + error};
+#endif
+  }
+
+  static carrier_uint compute_mul(carrier_uint u,
+                                  const cache_entry_type& cache) FMT_NOEXCEPT {
+    return umul192_upper64(u, cache);
+  }
+
+  static uint32_t compute_delta(cache_entry_type const& cache,
+                                int beta_minus_1) FMT_NOEXCEPT {
+    return static_cast<uint32_t>(cache.high() >> (64 - 1 - beta_minus_1));
+  }
+
+  static bool compute_mul_parity(carrier_uint two_f,
+                                 const cache_entry_type& cache,
+                                 int beta_minus_1) FMT_NOEXCEPT {
+    FMT_ASSERT(beta_minus_1 >= 1, "");
+    FMT_ASSERT(beta_minus_1 < 64, "");
+
+    return ((umul192_middle64(two_f, cache) >> (64 - beta_minus_1)) & 1) != 0;
+  }
+
+  static carrier_uint compute_left_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return (cache.high() -
+            (cache.high() >> (float_info<double>::significand_bits + 2))) >>
+           (64 - float_info<double>::significand_bits - 1 - beta_minus_1);
+  }
+
+  static carrier_uint compute_right_endpoint_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return (cache.high() +
+            (cache.high() >> (float_info<double>::significand_bits + 1))) >>
+           (64 - float_info<double>::significand_bits - 1 - beta_minus_1);
+  }
+
+  static carrier_uint compute_round_up_for_shorter_interval_case(
+      const cache_entry_type& cache, int beta_minus_1) FMT_NOEXCEPT {
+    return ((cache.high() >>
+             (64 - float_info<double>::significand_bits - 2 - beta_minus_1)) +
+            1) /
+           2;
+  }
+};
+
+// Various integer checks
+template <class T>
+bool is_left_endpoint_integer_shorter_interval(int exponent) FMT_NOEXCEPT {
+  return exponent >=
+             float_info<
+                 T>::case_shorter_interval_left_endpoint_lower_threshold &&
+         exponent <=
+             float_info<T>::case_shorter_interval_left_endpoint_upper_threshold;
+}
+template <class T>
+bool is_endpoint_integer(typename float_info<T>::carrier_uint two_f,
+                         int exponent, int minus_k) FMT_NOEXCEPT {
+  if (exponent < float_info<T>::case_fc_pm_half_lower_threshold) return false;
+  // For k >= 0.
+  if (exponent <= float_info<T>::case_fc_pm_half_upper_threshold) return true;
+  // For k < 0.
+  if (exponent > float_info<T>::divisibility_check_by_5_threshold) return false;
+  return divisible_by_power_of_5(two_f, minus_k);
+}
+
+template <class T>
+bool is_center_integer(typename float_info<T>::carrier_uint two_f, int exponent,
+                       int minus_k) FMT_NOEXCEPT {
+  // Exponent for 5 is negative.
+  if (exponent > float_info<T>::divisibility_check_by_5_threshold) return false;
+  if (exponent > float_info<T>::case_fc_upper_threshold)
+    return divisible_by_power_of_5(two_f, minus_k);
+  // Both exponents are nonnegative.
+  if (exponent >= float_info<T>::case_fc_lower_threshold) return true;
+  // Exponent for 2 is negative.
+  return divisible_by_power_of_2(two_f, minus_k - exponent + 1);
+}
+
+// Remove trailing zeros from n and return the number of zeros removed (float)
+FMT_ALWAYS_INLINE int remove_trailing_zeros(uint32_t& n) FMT_NOEXCEPT {
+#ifdef FMT_BUILTIN_CTZ
+  int t = FMT_BUILTIN_CTZ(n);
+#else
+  int t = ctz(n);
+#endif
+  if (t > float_info<float>::max_trailing_zeros)
+    t = float_info<float>::max_trailing_zeros;
+
+  const uint32_t mod_inv1 = 0xcccccccd;
+  const uint32_t max_quotient1 = 0x33333333;
+  const uint32_t mod_inv2 = 0xc28f5c29;
+  const uint32_t max_quotient2 = 0x0a3d70a3;
+
+  int s = 0;
+  for (; s < t - 1; s += 2) {
+    if (n * mod_inv2 > max_quotient2) break;
+    n *= mod_inv2;
+  }
+  if (s < t && n * mod_inv1 <= max_quotient1) {
+    n *= mod_inv1;
+    ++s;
+  }
+  n >>= s;
+  return s;
+}
+
+// Removes trailing zeros and returns the number of zeros removed (double)
+FMT_ALWAYS_INLINE int remove_trailing_zeros(uint64_t& n) FMT_NOEXCEPT {
+#ifdef FMT_BUILTIN_CTZLL
+  int t = FMT_BUILTIN_CTZLL(n);
+#else
+  int t = ctzll(n);
+#endif
+  if (t > float_info<double>::max_trailing_zeros)
+    t = float_info<double>::max_trailing_zeros;
+  // Divide by 10^8 to reduce the remaining work to 32 bits. Assuming, as the
+  // original note states for the to_decimal caller, n <= (2^64 - 1) / 1000 <
+  // 10^17, both the quotient and the remainder fit in 32 bits.
+
+  const uint32_t mod_inv1 = 0xcccccccd;
+  const uint32_t max_quotient1 = 0x33333333;
+  const uint64_t mod_inv8 = 0xc767074b22e90e21;
+  const uint64_t max_quotient8 = 0x00002af31dc46118;
+
+  // If the number is divisible by 1'0000'0000, work with the quotient
+  if (t >= 8) {
+    auto quotient_candidate = n * mod_inv8;
+
+    if (quotient_candidate <= max_quotient8) {
+      auto quotient = static_cast<uint32_t>(quotient_candidate >> 8);
+
+      int s = 8;
+      for (; s < t; ++s) {
+        if (quotient * mod_inv1 > max_quotient1) break;
+        quotient *= mod_inv1;
+      }
+      quotient >>= (s - 8);
+      n = quotient;
+      return s;
+    }
+  }
+
+  // Otherwise, work with the remainder
+  auto quotient = static_cast<uint32_t>(n / 100000000);
+  auto remainder = static_cast<uint32_t>(n - 100000000 * quotient);
+
+  if (t == 0 || remainder * mod_inv1 > max_quotient1) {
+    return 0;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 1 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 1) + quotient * 10000000ull;
+    return 1;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 2 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 2) + quotient * 1000000ull;
+    return 2;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 3 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 3) + quotient * 100000ull;
+    return 3;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 4 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 4) + quotient * 10000ull;
+    return 4;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 5 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 5) + quotient * 1000ull;
+    return 5;
+  }
+  remainder *= mod_inv1;
+
+  if (t == 6 || remainder * mod_inv1 > max_quotient1) {
+    n = (remainder >> 6) + quotient * 100ull;
+    return 6;
+  }
+  remainder *= mod_inv1;
+
+  n = (remainder >> 7) + quotient * 10ull;
+  return 7;
+}
+
+// The main algorithm for shorter interval case
+template <class T>
+FMT_ALWAYS_INLINE FMT_SAFEBUFFERS decimal_fp<T> shorter_interval_case(
+    int exponent) FMT_NOEXCEPT {
+  decimal_fp<T> ret_value;
+  // Compute k and beta
+  const int minus_k = floor_log10_pow2_minus_log10_4_over_3(exponent);
+  const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k);
+
+  // Compute xi and zi
+  using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
+  const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
+
+  auto xi = cache_accessor<T>::compute_left_endpoint_for_shorter_interval_case(
+      cache, beta_minus_1);
+  auto zi = cache_accessor<T>::compute_right_endpoint_for_shorter_interval_case(
+      cache, beta_minus_1);
+
+  // If the left endpoint is not an integer, increase it
+  if (!is_left_endpoint_integer_shorter_interval<T>(exponent)) ++xi;
+
+  // Try bigger divisor
+  ret_value.significand = zi / 10;
+
+  // If succeed, remove trailing zeros if necessary and return
+  if (ret_value.significand * 10 >= xi) {
+    ret_value.exponent = minus_k + 1;
+    ret_value.exponent += remove_trailing_zeros(ret_value.significand);
+    return ret_value;
+  }
+
+  // Otherwise, compute the round-up of y
+  ret_value.significand =
+      cache_accessor<T>::compute_round_up_for_shorter_interval_case(
+          cache, beta_minus_1);
+  ret_value.exponent = minus_k;
+
+  // When tie occurs, choose one of them according to the rule
+  if (exponent >= float_info<T>::shorter_interval_tie_lower_threshold &&
+      exponent <= float_info<T>::shorter_interval_tie_upper_threshold) {
+    ret_value.significand = ret_value.significand % 2 == 0
+                                ? ret_value.significand
+                                : ret_value.significand - 1;
+  } else if (ret_value.significand < xi) {
+    ++ret_value.significand;
+  }
+  return ret_value;
+}
+
+template <typename T>
+FMT_SAFEBUFFERS decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT {
+  // Step 1: integer promotion & Schubfach multiplier calculation.
+
+  using carrier_uint = typename float_info<T>::carrier_uint;
+  using cache_entry_type = typename cache_accessor<T>::cache_entry_type;
+  auto br = bit_cast<carrier_uint>(x);
+
+  // Extract significand bits and exponent bits.
+  const carrier_uint significand_mask =
+      (static_cast<carrier_uint>(1) << float_info<T>::significand_bits) - 1;
+  carrier_uint significand = (br & significand_mask);
+  int exponent = static_cast<int>((br & exponent_mask<T>()) >>
+                                  float_info<T>::significand_bits);
+
+  if (exponent != 0) {  // Check if normal.
+    exponent += float_info<T>::exponent_bias - float_info<T>::significand_bits;
+
+    // Shorter interval case; proceed like Schubfach.
+    if (significand == 0) return shorter_interval_case<T>(exponent);
+
+    significand |=
+        (static_cast<carrier_uint>(1) << float_info<T>::significand_bits);
+  } else {
+    // Subnormal case; the interval is always regular.
+    if (significand == 0) return {0, 0};
+    exponent = float_info<T>::min_exponent - float_info<T>::significand_bits;
+  }
+
+  const bool include_left_endpoint = (significand % 2 == 0);
+  const bool include_right_endpoint = include_left_endpoint;
+
+  // Compute k and beta.
+  const int minus_k = floor_log10_pow2(exponent) - float_info<T>::kappa;
+  const cache_entry_type cache = cache_accessor<T>::get_cached_power(-minus_k);
+  const int beta_minus_1 = exponent + floor_log2_pow10(-minus_k);
+
+  // Compute zi and deltai
+  // 10^kappa <= deltai < 10^(kappa + 1)
+  const uint32_t deltai = cache_accessor<T>::compute_delta(cache, beta_minus_1);
+  const carrier_uint two_fc = significand << 1;
+  const carrier_uint two_fr = two_fc | 1;
+  const carrier_uint zi =
+      cache_accessor<T>::compute_mul(two_fr << beta_minus_1, cache);
+
+  // Step 2: Try larger divisor; remove trailing zeros if necessary
+
+  // Using an upper bound on zi, we might be able to optimize the division
+  // better than the compiler; we are computing zi / big_divisor here
+  decimal_fp<T> ret_value;
+  ret_value.significand = divide_by_10_to_kappa_plus_1(zi);
+  uint32_t r = static_cast<uint32_t>(zi - float_info<T>::big_divisor *
+                                              ret_value.significand);
+
+  if (r > deltai) {
+    goto small_divisor_case_label;
+  } else if (r < deltai) {
+    // Exclude the right endpoint if necessary
+    if (r == 0 && !include_right_endpoint &&
+        is_endpoint_integer<T>(two_fr, exponent, minus_k)) {
+      --ret_value.significand;
+      r = float_info<T>::big_divisor;
+      goto small_divisor_case_label;
+    }
+  } else {
+    // r == deltai; compare fractional parts
+    // Check conditions in the order different from the paper
+    // to take advantage of short-circuiting
+    const carrier_uint two_fl = two_fc - 1;
+    if ((!include_left_endpoint ||
+         !is_endpoint_integer<T>(two_fl, exponent, minus_k)) &&
+        !cache_accessor<T>::compute_mul_parity(two_fl, cache, beta_minus_1)) {
+      goto small_divisor_case_label;
+    }
+  }
+  ret_value.exponent = minus_k + float_info<T>::kappa + 1;
+
+  // We may need to remove trailing zeros
+  ret_value.exponent += remove_trailing_zeros(ret_value.significand);
+  return ret_value;
+
+  // Step 3: Find the significand with the smaller divisor
+
+small_divisor_case_label:
+  ret_value.significand *= 10;
+  ret_value.exponent = minus_k + float_info<T>::kappa;
+
+  const uint32_t mask = (1u << float_info<T>::kappa) - 1;
+  auto dist = r - (deltai / 2) + (float_info<T>::small_divisor / 2);
+
+  // Is dist divisible by 2^kappa?
+  if ((dist & mask) == 0) {
+    const bool approx_y_parity =
+        ((dist ^ (float_info<T>::small_divisor / 2)) & 1) != 0;
+    dist >>= float_info<T>::kappa;
+
+    // Is dist divisible by 5^kappa?
+    if (check_divisibility_and_divide_by_pow5<float_info<T>::kappa>(dist)) {
+      ret_value.significand += dist;
+
+      // Check z^(f) >= epsilon^(f)
+      // We have either yi == zi - epsiloni or yi == (zi - epsiloni) - 1,
+      // where yi == zi - epsiloni if and only if z^(f) >= epsilon^(f)
+      // Since there are only 2 possibilities, we only need to care about the
+      // parity. Also, zi and r should have the same parity since the divisor
+      // is an even number
+      if (cache_accessor<T>::compute_mul_parity(two_fc, cache, beta_minus_1) !=
+          approx_y_parity) {
+        --ret_value.significand;
+      } else {
+        // If z^(f) >= epsilon^(f), we might have a tie
+        // when z^(f) == epsilon^(f), or equivalently, when y is an integer
+        if (is_center_integer<T>(two_fc, exponent, minus_k)) {
+          ret_value.significand = ret_value.significand % 2 == 0
+                                      ? ret_value.significand
+                                      : ret_value.significand - 1;
+        }
+      }
+    }
+    // Is dist not divisible by 5^kappa?
+    else {
+      ret_value.significand += dist;
+    }
+  }
+  // Is dist not divisible by 2^kappa?
+  else {
+    // Since we know dist is small, we might be able to optimize the division
+    // better than the compiler; we are computing dist / small_divisor here
+    ret_value.significand +=
+        small_division_by_pow10<float_info<T>::kappa>(dist);
+  }
+  return ret_value;
+}
+}  // namespace dragonbox
+
 // Formats value using a variation of the Fixed-Precision Positive
 // Floating-Point Printout ((FPP)^2) algorithm by Steele & White:
 // https://fmt.dev/p372-steele.pdf.
 template <typename Double>
-void fallback_format(Double d, buffer<char>& buf, int& exp10) {
+void fallback_format(Double d, int num_digits, bool binary32, buffer<char>& buf,
+                     int& exp10) {
   bigint numerator;    // 2 * R in (FPP)^2.
   bigint denominator;  // 2 * S in (FPP)^2.
   // lower and upper are differences between value and corresponding boundaries.
@@ -998,8 +2314,9 @@ void fallback_format(Double d, buffer<char>& buf, int& exp10) {
   // Shift numerator and denominator by an extra bit or two (if lower boundary
   // is closer) to make lower and upper integers. This eliminates multiplication
   // by 2 during later computations.
-  // TODO: handle float
-  int shift = value.assign(d) ? 2 : 1;
+  const bool is_predecessor_closer =
+      binary32 ? value.assign(static_cast<float>(d)) : value.assign(d);
+  int shift = is_predecessor_closer ? 2 : 1;
   uint64_t significand = value.f << shift;
   if (value.e >= 0) {
     numerator.assign(significand);
@@ -1034,39 +2351,73 @@ void fallback_format(Double d, buffer<char>& buf, int& exp10) {
       upper = &upper_store;
     }
   }
-  if (!upper) upper = &lower;
   // Invariant: value == (numerator / denominator) * pow(10, exp10).
-  bool even = (value.f & 1) == 0;
-  int num_digits = 0;
-  char* data = buf.data();
-  for (;;) {
-    int digit = numerator.divmod_assign(denominator);
-    bool low = compare(numerator, lower) - even < 0;  // numerator <[=] lower.
-    // numerator + upper >[=] pow10:
-    bool high = add_compare(numerator, *upper, denominator) + even > 0;
-    data[num_digits++] = static_cast<char>('0' + digit);
-    if (low || high) {
-      if (!low) {
-        ++data[num_digits - 1];
-      } else if (high) {
-        int result = add_compare(numerator, numerator, denominator);
-        // Round half to even.
-        if (result > 0 || (result == 0 && (digit % 2) != 0))
+  if (num_digits < 0) {
+    // Generate the shortest representation.
+    if (!upper) upper = &lower;
+    bool even = (value.f & 1) == 0;
+    num_digits = 0;
+    char* data = buf.data();
+    for (;;) {
+      int digit = numerator.divmod_assign(denominator);
+      bool low = compare(numerator, lower) - even < 0;  // numerator <[=] lower.
+      // numerator + upper >[=] pow10:
+      bool high = add_compare(numerator, *upper, denominator) + even > 0;
+      data[num_digits++] = static_cast<char>('0' + digit);
+      if (low || high) {
+        if (!low) {
           ++data[num_digits - 1];
+        } else if (high) {
+          int result = add_compare(numerator, numerator, denominator);
+          // Round half to even.
+          if (result > 0 || (result == 0 && (digit % 2) != 0))
+            ++data[num_digits - 1];
+        }
+        buf.try_resize(to_unsigned(num_digits));
+        exp10 -= num_digits - 1;
+        return;
+      }
+      numerator *= 10;
+      lower *= 10;
+      if (upper != &lower) *upper *= 10;
+    }
+  }
+  // Generate the given number of digits.
+  exp10 -= num_digits - 1;
+  if (num_digits == 0) {
+    buf.try_resize(1);
+    denominator *= 10;
+    buf[0] = add_compare(numerator, numerator, denominator) > 0 ? '1' : '0';
+    return;
+  }
+  buf.try_resize(to_unsigned(num_digits));
+  for (int i = 0; i < num_digits - 1; ++i) {
+    int digit = numerator.divmod_assign(denominator);
+    buf[i] = static_cast<char>('0' + digit);
+    numerator *= 10;
+  }
+  int digit = numerator.divmod_assign(denominator);
+  auto result = add_compare(numerator, numerator, denominator);
+  if (result > 0 || (result == 0 && (digit % 2) != 0)) {
+    if (digit == 9) {
+      const auto overflow = '0' + 10;
+      buf[num_digits - 1] = overflow;
+      // Propagate the carry.
+      for (int i = num_digits - 1; i > 0 && buf[i] == overflow; --i) {
+        buf[i] = '0';
+        ++buf[i - 1];
+      }
+      if (buf[0] == overflow) {
+        buf[0] = '1';
+        ++exp10;
       }
-      buf.resize(to_unsigned(num_digits));
-      exp10 -= num_digits - 1;
       return;
     }
-    numerator *= 10;
-    lower *= 10;
-    if (upper != &lower) *upper *= 10;
+    ++digit;
   }
+  buf[num_digits - 1] = static_cast<char>('0' + digit);
 }
 
-// Formats value using the Grisu algorithm
-// (https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf)
-// if T is a IEEE754 binary32 or binary64 and snprintf otherwise.
 template <typename T>
 int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
   static_assert(!std::is_same<T, float>::value, "");
@@ -1078,66 +2429,57 @@ int format_float(T value, int precision, float_specs specs, buffer<char>& buf) {
       buf.push_back('0');
       return 0;
     }
-    buf.resize(to_unsigned(precision));
+    buf.try_resize(to_unsigned(precision));
     std::uninitialized_fill_n(buf.data(), precision, '0');
     return -precision;
   }
 
   if (!specs.use_grisu) return snprintf_float(value, precision, specs, buf);
 
+  if (precision < 0) {
+    // Use Dragonbox for the shortest format.
+    if (specs.binary32) {
+      auto dec = dragonbox::to_decimal(static_cast<float>(value));
+      write<char>(buffer_appender<char>(buf), dec.significand);
+      return dec.exponent;
+    }
+    auto dec = dragonbox::to_decimal(static_cast<double>(value));
+    write<char>(buffer_appender<char>(buf), dec.significand);
+    return dec.exponent;
+  }
+
+  // Use Grisu + Dragon4 for the given precision:
+  // https://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf.
   int exp = 0;
   const int min_exp = -60;  // alpha in Grisu.
   int cached_exp10 = 0;     // K in Grisu.
-  if (precision < 0) {
-    fp fp_value;
-    auto boundaries = specs.binary32
-                          ? fp_value.assign_float_with_boundaries(value)
-                          : fp_value.assign_with_boundaries(value);
-    fp_value = normalize(fp_value);
-    // Find a cached power of 10 such that multiplying value by it will bring
-    // the exponent in the range [min_exp, -32].
-    const fp cached_pow = get_cached_power(
-        min_exp - (fp_value.e + fp::significand_size), cached_exp10);
-    // Multiply value and boundaries by the cached power of 10.
-    fp_value = fp_value * cached_pow;
-    boundaries.lower = multiply(boundaries.lower, cached_pow.f);
-    boundaries.upper = multiply(boundaries.upper, cached_pow.f);
-    assert(min_exp <= fp_value.e && fp_value.e <= -32);
-    --boundaries.lower;  // \tilde{M}^- - 1 ulp -> M^-_{\downarrow}.
-    ++boundaries.upper;  // \tilde{M}^+ + 1 ulp -> M^+_{\uparrow}.
-    // Numbers outside of (lower, upper) definitely do not round to value.
-    grisu_shortest_handler handler{buf.data(), 0,
-                                   boundaries.upper - fp_value.f};
-    auto result =
-        grisu_gen_digits(fp(boundaries.upper, fp_value.e),
-                         boundaries.upper - boundaries.lower, exp, handler);
-    if (result == digits::error) {
-      exp += handler.size - cached_exp10 - 1;
-      fallback_format(value, buf, exp);
-      return exp;
-    }
-    buf.resize(to_unsigned(handler.size));
+  fp normalized = normalize(fp(value));
+  const auto cached_pow = get_cached_power(
+      min_exp - (normalized.e + fp::significand_size), cached_exp10);
+  normalized = normalized * cached_pow;
+  // Limit precision to the maximum possible number of significant digits in an
+  // IEEE754 double because we don't need to generate zeros.
+  const int max_double_digits = 767;
+  if (precision > max_double_digits) precision = max_double_digits;
+  fixed_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
+  if (grisu_gen_digits(normalized, 1, exp, handler) == digits::error) {
+    exp += handler.size - cached_exp10 - 1;
+    fallback_format(value, handler.precision, specs.binary32, buf, exp);
   } else {
-    if (precision > 17) return snprintf_float(value, precision, specs, buf);
-    fp normalized = normalize(fp(value));
-    const auto cached_pow = get_cached_power(
-        min_exp - (normalized.e + fp::significand_size), cached_exp10);
-    normalized = normalized * cached_pow;
-    fixed_handler handler{buf.data(), 0, precision, -cached_exp10, fixed};
-    if (grisu_gen_digits(normalized, 1, exp, handler) == digits::error)
-      return snprintf_float(value, precision, specs, buf);
-    int num_digits = handler.size;
-    if (!fixed) {
-      // Remove trailing zeros.
-      while (num_digits > 0 && buf[num_digits - 1] == '0') {
-        --num_digits;
-        ++exp;
-      }
-    }
-    buf.resize(to_unsigned(num_digits));
+    exp += handler.exp10;
+    buf.try_resize(to_unsigned(handler.size));
   }
-  return exp - cached_exp10;
-}
+  if (!fixed && !specs.showpoint) {
+    // Remove trailing zeros.
+    auto num_digits = buf.size();
+    while (num_digits > 0 && buf[num_digits - 1] == '0') {
+      --num_digits;
+      ++exp;
+    }
+    buf.try_resize(num_digits);
+  }
+  return exp;
+}
 
 template <typename T>
 int snprintf_float(T value, int precision, float_specs specs,
@@ -1185,19 +2527,20 @@ int snprintf_float(T value, int precision, float_specs specs,
                      ? snprintf_ptr(begin, capacity, format, precision, value)
                      : snprintf_ptr(begin, capacity, format, value);
     if (result < 0) {
-      buf.reserve(buf.capacity() + 1);  // The buffer will grow exponentially.
+      // The buffer will grow exponentially.
+      buf.try_reserve(buf.capacity() + 1);
       continue;
     }
     auto size = to_unsigned(result);
     // Size equal to capacity means that the last character was truncated.
     if (size >= capacity) {
-      buf.reserve(size + offset + 1);  // Add 1 for the terminating '\0'.
+      buf.try_reserve(size + offset + 1);  // Add 1 for the terminating '\0'.
       continue;
     }
     auto is_digit = [](char c) { return c >= '0' && c <= '9'; };
     if (specs.format == float_format::fixed) {
       if (precision == 0) {
-        buf.resize(size);
+        buf.try_resize(size);
         return 0;
       }
       // Find and remove the decimal point.
@@ -1207,11 +2550,11 @@ int snprintf_float(T value, int precision, float_specs specs,
       } while (is_digit(*p));
       int fraction_size = static_cast<int>(end - p - 1);
       std::memmove(p, p + 1, to_unsigned(fraction_size));
-      buf.resize(size - 1);
+      buf.try_resize(size - 1);
       return -fraction_size;
     }
     if (specs.format == float_format::hex) {
-      buf.resize(size + offset);
+      buf.try_resize(size + offset);
       return 0;
     }
     // Find and parse the exponent.
@@ -1237,7 +2580,7 @@ int snprintf_float(T value, int precision, float_specs specs,
       fraction_size = static_cast<int>(fraction_end - begin - 1);
       std::memmove(begin + 1, begin + 2, to_unsigned(fraction_size));
     }
-    buf.resize(to_unsigned(fraction_size) + offset + 1);
+    buf.try_resize(to_unsigned(fraction_size) + offset + 1);
     return exp - fraction_size;
   }
 }
@@ -1259,25 +2602,18 @@ int snprintf_float(T value, int precision, float_specs specs,
  * occurs, this pointer will be a guess that depends on the particular
  * error, but it will always advance at least one byte.
  */
-FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
-  static const char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-                                 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
-                                 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
+inline const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
   static const int masks[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
   static const uint32_t mins[] = {4194304, 0, 128, 2048, 65536};
   static const int shiftc[] = {0, 18, 12, 6, 0};
   static const int shifte[] = {0, 6, 4, 2, 0};
 
-  auto s = reinterpret_cast<const unsigned char*>(buf);
-  int len = lengths[s[0] >> 3];
-
-  // Compute the pointer to the next character early so that the next
-  // iteration can start working on the next character. Neither Clang
-  // nor GCC figure out this reordering on their own.
-  const char* next = buf + len + !len;
+  int len = code_point_length(buf);
+  const char* next = buf + len;
 
   // Assume a four-byte character and load four bytes. Unused bits are
   // shifted out.
+  auto s = reinterpret_cast<const unsigned char*>(buf);
   *c = uint32_t(s[0] & masks[len]) << 18;
   *c |= uint32_t(s[1] & 0x3f) << 12;
   *c |= uint32_t(s[2] & 0x3f) << 6;
@@ -1296,6 +2632,19 @@ FMT_FUNC const char* utf8_decode(const char* buf, uint32_t* c, int* e) {
 
   return next;
 }
+
+struct stringifier {
+  template <typename T> FMT_INLINE std::string operator()(T value) const {
+    return to_string(value);
+  }
+  std::string operator()(basic_format_arg<format_context>::handle h) const {
+    memory_buffer buf;
+    format_parse_context parse_ctx({});
+    format_context format_ctx(buffer_appender<char>(buf), {}, {});
+    h.format(parse_ctx, format_ctx);
+    return to_string(buf);
+  }
+};
 }  // namespace detail
 
 template <> struct formatter<detail::bigint> {
@@ -1363,7 +2712,8 @@ FMT_FUNC void format_system_error(detail::buffer<char>& out, int error_code,
       int result =
           detail::safe_strerror(error_code, system_message, buf.size());
       if (result == 0) {
-        format_to(std::back_inserter(out), "{}: {}", message, system_message);
+        format_to(detail::buffer_appender<char>(out), "{}: {}", message,
+                  system_message);
         return;
       }
       if (result != ERANGE)
@@ -1384,20 +2734,6 @@ FMT_FUNC void report_system_error(int error_code,
   report_error(format_system_error, error_code, message);
 }
 
-struct stringifier {
-  template <typename T> FMT_INLINE std::string operator()(T value) const {
-    return to_string(value);
-  }
-  std::string operator()(basic_format_arg<format_context>::handle h) const {
-    memory_buffer buf;
-    detail::buffer<char>& base = buf;
-    format_parse_context parse_ctx({});
-    format_context format_ctx(std::back_inserter(base), {}, {});
-    h.format(parse_ctx, format_ctx);
-    return to_string(buf);
-  }
-};
-
 FMT_FUNC std::string detail::vformat(string_view format_str, format_args args) {
   if (format_str.size() == 2 && equal2(format_str.data(), "{}")) {
     auto arg = args.get(0);
@@ -1409,6 +2745,14 @@ FMT_FUNC std::string detail::vformat(string_view format_str, format_args args) {
   return to_string(buffer);
 }
 
+#ifdef _WIN32
+namespace detail {
+using dword = conditional_t<sizeof(long) == 4, unsigned long, unsigned>;
+extern "C" __declspec(dllimport) int __stdcall WriteConsoleW(  //
+    void*, const void*, dword, dword*, void*);
+}  // namespace detail
+#endif
+
 FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
   memory_buffer buffer;
   detail::vformat_to(buffer, format_str,
@@ -1417,10 +2761,10 @@ FMT_FUNC void vprint(std::FILE* f, string_view format_str, format_args args) {
   auto fd = _fileno(f);
   if (_isatty(fd)) {
     detail::utf8_to_utf16 u16(string_view(buffer.data(), buffer.size()));
-    auto written = DWORD();
-    if (!WriteConsoleW(reinterpret_cast<HANDLE>(_get_osfhandle(fd)),
-                       u16.c_str(), static_cast<DWORD>(u16.size()), &written,
-                       nullptr)) {
+    auto written = detail::dword();
+    if (!detail::WriteConsoleW(reinterpret_cast<void*>(_get_osfhandle(fd)),
+                               u16.c_str(), static_cast<uint32_t>(u16.size()),
+                               &written, nullptr)) {
       FMT_THROW(format_error("failed to write to console"));
     }
     return;
@@ -1446,8 +2790,4 @@ FMT_FUNC void vprint(string_view format_str, format_args args) {
 
 FMT_END_NAMESPACE
 
-#ifdef _MSC_VER
-#  pragma warning(pop)
-#endif
-
 #endif  // FMT_FORMAT_INL_H_
diff --git a/src/fmt/format.h b/src/fmt/format.h
index a4911b9fdb..fbe5045068 100644
--- a/src/fmt/format.h
+++ b/src/fmt/format.h
@@ -69,16 +69,12 @@
 #  define FMT_NOINLINE
 #endif
 
-// LAMMPS customizations:
-// 1) Intel compilers on MacOS have __clang__ defined
-//    but fail to recognize [[clang::fallthrough]]
-// 2) Intel compilers on Linux identify as GCC compatible
-//    but fail to recognize [[gnu::fallthrough]]
-
 #if __cplusplus == 201103L || __cplusplus == 201402L
-#  if defined(__clang__) && !defined(__INTEL_COMPILER)
+#  if defined(__INTEL_COMPILER) || defined(__PGI)
+#    define FMT_FALLTHROUGH
+#  elif defined(__clang__)
 #    define FMT_FALLTHROUGH [[clang::fallthrough]]
-#  elif FMT_GCC_VERSION >= 700 && !defined(__PGI) && !defined(__INTEL_COMPILER) && \
+#  elif FMT_GCC_VERSION >= 700 && \
       (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 520)
 #    define FMT_FALLTHROUGH [[gnu::fallthrough]]
 #  else
@@ -145,12 +141,13 @@ FMT_END_NAMESPACE
 #endif
 
 #ifndef FMT_USE_UDL_TEMPLATE
-// EDG frontend based compilers (icc, nvcc, etc) and GCC < 6.4 do not properly
-// support UDL templates and GCC >= 9 warns about them.
+// EDG frontend based compilers (icc, nvcc, PGI, etc) and GCC < 6.4 do not
+// properly support UDL templates and GCC >= 9 warns about them.
 #  if FMT_USE_USER_DEFINED_LITERALS &&                         \
       (!defined(__EDG_VERSION__) || __EDG_VERSION__ >= 501) && \
       ((FMT_GCC_VERSION >= 604 && __cplusplus >= 201402L) ||   \
-       FMT_CLANG_VERSION >= 304)
+       FMT_CLANG_VERSION >= 304) &&                            \
+      !defined(__PGI) && !defined(__NVCC__)
 #    define FMT_USE_UDL_TEMPLATE 1
 #  else
 #    define FMT_USE_UDL_TEMPLATE 0
@@ -169,6 +166,14 @@ FMT_END_NAMESPACE
 #  define FMT_USE_LONG_DOUBLE 1
 #endif
 
+// Defining FMT_REDUCE_INT_INSTANTIATIONS to 1 reduces the number of
+// int_writer template instances to just one by using only the largest
+// integer type. This reduces binary size at the cost of slower integer
+// formatting.
+#if !defined(FMT_REDUCE_INT_INSTANTIATIONS)
+#  define FMT_REDUCE_INT_INSTANTIATIONS 0
+#endif
+
 // __builtin_clz is broken in clang with Microsoft CodeGen:
 // https://github.com/fmtlib/fmt/issues/519
 #if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clz)) && !FMT_MSC_VER
@@ -177,56 +182,87 @@ FMT_END_NAMESPACE
 #if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_clzll)) && !FMT_MSC_VER
 #  define FMT_BUILTIN_CLZLL(n) __builtin_clzll(n)
 #endif
+#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_ctz))
+#  define FMT_BUILTIN_CTZ(n) __builtin_ctz(n)
+#endif
+#if (FMT_GCC_VERSION || FMT_HAS_BUILTIN(__builtin_ctzll))
+#  define FMT_BUILTIN_CTZLL(n) __builtin_ctzll(n)
+#endif
+
+#if FMT_MSC_VER
+#  include <intrin.h>  // _BitScanReverse[64], _BitScanForward[64], _umul128
+#endif
 
 // Some compilers masquerade as both MSVC and GCC-likes or otherwise support
 // __builtin_clz and __builtin_clzll, so only define FMT_BUILTIN_CLZ using the
 // MSVC intrinsics if the clz and clzll builtins are not available.
-#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && !defined(_MANAGED)
-#  include <intrin.h>  // _BitScanReverse, _BitScanReverse64
-
+#if FMT_MSC_VER && !defined(FMT_BUILTIN_CLZLL) && \
+    !defined(FMT_BUILTIN_CTZLL) && !defined(_MANAGED)
 FMT_BEGIN_NAMESPACE
 namespace detail {
 // Avoid Clang with Microsoft CodeGen's -Wunknown-pragmas warning.
 #  ifndef __clang__
+#    pragma intrinsic(_BitScanForward)
 #    pragma intrinsic(_BitScanReverse)
 #  endif
-inline uint32_t clz(uint32_t x) {
+#  if defined(_WIN64) && !defined(__clang__)
+#    pragma intrinsic(_BitScanForward64)
+#    pragma intrinsic(_BitScanReverse64)
+#  endif
+
+inline int clz(uint32_t x) {
   unsigned long r = 0;
   _BitScanReverse(&r, x);
-
   FMT_ASSERT(x != 0, "");
   // Static analysis complains about using uninitialized data
   // "r", but the only way that can happen is if "x" is 0,
   // which the callers guarantee to not happen.
   FMT_SUPPRESS_MSC_WARNING(6102)
-  return 31 - r;
+  return 31 ^ static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CLZ(n) detail::clz(n)
 
-#  if defined(_WIN64) && !defined(__clang__)
-#    pragma intrinsic(_BitScanReverse64)
-#  endif
-
-inline uint32_t clzll(uint64_t x) {
+inline int clzll(uint64_t x) {
   unsigned long r = 0;
 #  ifdef _WIN64
   _BitScanReverse64(&r, x);
 #  else
   // Scan the high 32 bits.
-  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32))) return 63 - (r + 32);
-
+  if (_BitScanReverse(&r, static_cast<uint32_t>(x >> 32))) return 63 ^ (r + 32);
   // Scan the low 32 bits.
   _BitScanReverse(&r, static_cast<uint32_t>(x));
 #  endif
-
   FMT_ASSERT(x != 0, "");
-  // Static analysis complains about using uninitialized data
-  // "r", but the only way that can happen is if "x" is 0,
-  // which the callers guarantee to not happen.
-  FMT_SUPPRESS_MSC_WARNING(6102)
-  return 63 - r;
+  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  return 63 ^ static_cast<int>(r);
 }
 #  define FMT_BUILTIN_CLZLL(n) detail::clzll(n)
+
+inline int ctz(uint32_t x) {
+  unsigned long r = 0;
+  _BitScanForward(&r, x);
+  FMT_ASSERT(x != 0, "");
+  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZ(n) detail::ctz(n)
+
+inline int ctzll(uint64_t x) {
+  unsigned long r = 0;
+  FMT_ASSERT(x != 0, "");
+  FMT_SUPPRESS_MSC_WARNING(6102)  // Suppress a bogus static analysis warning.
+#  ifdef _WIN64
+  _BitScanForward64(&r, x);
+#  else
+  // Scan the low 32 bits.
+  if (_BitScanForward(&r, static_cast<uint32_t>(x))) return static_cast<int>(r);
+  // Scan the high 32 bits.
+  _BitScanForward(&r, static_cast<uint32_t>(x >> 32));
+  r += 32;
+#  endif
+  return static_cast<int>(r);
+}
+#  define FMT_BUILTIN_CTZLL(n) detail::ctzll(n)
 }  // namespace detail
 FMT_END_NAMESPACE
 #endif
@@ -304,50 +340,11 @@ FMT_INLINE void assume(bool condition) {
 #endif
 }
 
-// A workaround for gcc 4.8 to make void_t work in a SFINAE context.
-template <typename... Ts> struct void_t_impl { using type = void; };
-
-template <typename... Ts>
-using void_t = typename detail::void_t_impl<Ts...>::type;
-
 // An approximation of iterator_t for pre-C++20 systems.
 template <typename T>
 using iterator_t = decltype(std::begin(std::declval<T&>()));
 template <typename T> using sentinel_t = decltype(std::end(std::declval<T&>()));
 
-// Detect the iterator category of *any* given type in a SFINAE-friendly way.
-// Unfortunately, older implementations of std::iterator_traits are not safe
-// for use in a SFINAE-context.
-template <typename It, typename Enable = void>
-struct iterator_category : std::false_type {};
-
-template <typename T> struct iterator_category<T*> {
-  using type = std::random_access_iterator_tag;
-};
-
-template <typename It>
-struct iterator_category<It, void_t<typename It::iterator_category>> {
-  using type = typename It::iterator_category;
-};
-
-// Detect if *any* given type models the OutputIterator concept.
-template <typename It> class is_output_iterator {
-  // Check for mutability because all iterator categories derived from
-  // std::input_iterator_tag *may* also meet the requirements of an
-  // OutputIterator, thereby falling into the category of 'mutable iterators'
-  // [iterator.requirements.general] clause 4. The compiler reveals this
-  // property only at the point of *actually dereferencing* the iterator!
-  template <typename U>
-  static decltype(*(std::declval<U>())) test(std::input_iterator_tag);
-  template <typename U> static char& test(std::output_iterator_tag);
-  template <typename U> static const char& test(...);
-
-  using type = decltype(test<It>(typename iterator_category<It>::type{}));
-
- public:
-  enum { value = !std::is_const<remove_reference_t<type>>::value };
-};
-
 // A workaround for std::string not having mutable data() until C++17.
 template <typename Char> inline Char* get_data(std::basic_string<Char>& s) {
   return &s[0];
@@ -380,10 +377,29 @@ reserve(std::back_insert_iterator<Container> it, size_t n) {
   return make_checked(get_data(c) + size, n);
 }
 
+template <typename T>
+inline buffer_appender<T> reserve(buffer_appender<T> it, size_t n) {
+  buffer<T>& buf = get_container(it);
+  buf.try_reserve(buf.size() + n);
+  return it;
+}
+
 template <typename Iterator> inline Iterator& reserve(Iterator& it, size_t) {
   return it;
 }
 
+template <typename T, typename OutputIt>
+constexpr T* to_pointer(OutputIt, size_t) {
+  return nullptr;
+}
+template <typename T> T* to_pointer(buffer_appender<T> it, size_t n) {
+  buffer<T>& buf = get_container(it);
+  auto size = buf.size();
+  if (buf.capacity() < size + n) return nullptr;
+  buf.try_resize(size + n);
+  return buf.data() + size;
+}
+
 template <typename Container, FMT_ENABLE_IF(is_contiguous<Container>::value)>
 inline std::back_insert_iterator<Container> base_iterator(
     std::back_insert_iterator<Container>& it,
@@ -421,13 +437,17 @@ class counting_iterator {
     ++count_;
     return *this;
   }
-
   counting_iterator operator++(int) {
     auto it = *this;
     ++*this;
     return it;
   }
 
+  friend counting_iterator operator+(counting_iterator it, difference_type n) {
+    it.count_ += static_cast<size_t>(n);
+    return it;
+  }
+
   value_type operator*() const { return {}; }
 };
 
@@ -561,23 +581,38 @@ OutputIt copy_str(InputIt begin, InputIt end, OutputIt it) {
                         [](char c) { return static_cast<char8_type>(c); });
 }
 
-#ifndef FMT_USE_GRISU
-#  define FMT_USE_GRISU 1
-#endif
-
-template <typename T> constexpr bool use_grisu() {
-  return FMT_USE_GRISU && std::numeric_limits<double>::is_iec559 &&
-         sizeof(T) <= sizeof(double);
+template <typename Char, typename InputIt>
+inline counting_iterator copy_str(InputIt begin, InputIt end,
+                                  counting_iterator it) {
+  return it + (end - begin);
 }
 
+template <typename T>
+using is_fast_float = bool_constant<std::numeric_limits<T>::is_iec559 &&
+                                    sizeof(T) <= sizeof(double)>;
+
+#ifndef FMT_USE_FULL_CACHE_DRAGONBOX
+#  define FMT_USE_FULL_CACHE_DRAGONBOX 0
+#endif
+
 template <typename T>
 template <typename U>
 void buffer<T>::append(const U* begin, const U* end) {
-  size_t new_size = size_ + to_unsigned(end - begin);
-  reserve(new_size);
-  std::uninitialized_copy(begin, end,
-                          make_checked(ptr_ + size_, capacity_ - size_));
-  size_ = new_size;
+  do {
+    auto count = to_unsigned(end - begin);
+    try_reserve(size_ + count);
+    auto free_cap = capacity_ - size_;
+    if (free_cap < count) count = free_cap;
+    std::uninitialized_copy_n(begin, count, make_checked(ptr_ + size_, count));
+    size_ += count;
+    begin += count;
+  } while (begin != end);
+}
+
+template <typename OutputIt, typename T, typename Traits>
+void iterator_buffer<OutputIt, T, Traits>::flush() {
+  out_ = std::copy_n(data_, this->limit(this->size()), out_);
+  this->clear();
 }
 }  // namespace detail
 
@@ -616,7 +651,7 @@ enum { inline_buffer_size = 500 };
  */
 template <typename T, size_t SIZE = inline_buffer_size,
           typename Allocator = std::allocator<T>>
-class basic_memory_buffer : public detail::buffer<T> {
+class basic_memory_buffer final : public detail::buffer<T> {
  private:
   T store_[SIZE];
 
@@ -630,7 +665,7 @@ class basic_memory_buffer : public detail::buffer<T> {
   }
 
  protected:
-  void grow(size_t size) FMT_OVERRIDE;
+  void grow(size_t size) final FMT_OVERRIDE;
 
  public:
   using value_type = T;
@@ -640,7 +675,7 @@ class basic_memory_buffer : public detail::buffer<T> {
       : alloc_(alloc) {
     this->set(store_, SIZE);
   }
-  ~basic_memory_buffer() FMT_OVERRIDE { deallocate(); }
+  ~basic_memory_buffer() { deallocate(); }
 
  private:
   // Move data from other to this buffer.
@@ -684,6 +719,22 @@ class basic_memory_buffer : public detail::buffer<T> {
 
   // Returns a copy of the allocator associated with this buffer.
   Allocator get_allocator() const { return alloc_; }
+
+  /**
+    Resizes the buffer to contain *count* elements. If T is a POD type new
+    elements may not be initialized.
+   */
+  void resize(size_t count) { this->try_resize(count); }
+
+  /** Increases the buffer capacity to *new_capacity*. */
+  void reserve(size_t new_capacity) { this->try_reserve(new_capacity); }
+
+  // Directly append data into the buffer
+  using detail::buffer<T>::append;
+  template <typename ContiguousRange>
+  void append(const ContiguousRange& range) {
+    append(range.data(), range.data() + range.size());
+  }
 };
 
 template <typename T, size_t SIZE, typename Allocator>
@@ -754,19 +805,81 @@ FMT_CONSTEXPR bool is_supported_floating_point(T) {
 }
 
 // Smallest of uint32_t, uint64_t, uint128_t that is large enough to
-// represent all values of T.
+// represent all values of an integral type T.
 template <typename T>
 using uint32_or_64_or_128_t =
-    conditional_t<num_bits<T>() <= 32, uint32_t,
+    conditional_t<num_bits<T>() <= 32 && !FMT_REDUCE_INT_INSTANTIATIONS,
+                  uint32_t,
                   conditional_t<num_bits<T>() <= 64, uint64_t, uint128_t>>;
 
+// 128-bit integer type used internally
+struct FMT_EXTERN_TEMPLATE_API uint128_wrapper {
+  uint128_wrapper() = default;
+
+#if FMT_USE_INT128
+  uint128_t internal_;
+
+  uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT
+      : internal_{static_cast<uint128_t>(low) |
+                  (static_cast<uint128_t>(high) << 64)} {}
+
+  uint128_wrapper(uint128_t u) : internal_{u} {}
+
+  uint64_t high() const FMT_NOEXCEPT { return uint64_t(internal_ >> 64); }
+  uint64_t low() const FMT_NOEXCEPT { return uint64_t(internal_); }
+
+  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
+    internal_ += n;
+    return *this;
+  }
+#else
+  uint64_t high_;
+  uint64_t low_;
+
+  uint128_wrapper(uint64_t high, uint64_t low) FMT_NOEXCEPT : high_{high},
+                                                              low_{low} {}
+
+  uint64_t high() const FMT_NOEXCEPT { return high_; }
+  uint64_t low() const FMT_NOEXCEPT { return low_; }
+
+  uint128_wrapper& operator+=(uint64_t n) FMT_NOEXCEPT {
+#  if defined(_MSC_VER) && defined(_M_X64)
+    unsigned char carry = _addcarry_u64(0, low_, n, &low_);
+    _addcarry_u64(carry, high_, 0, &high_);
+    return *this;
+#  else
+    uint64_t sum = low_ + n;
+    high_ += (sum < low_ ? 1 : 0);
+    low_ = sum;
+    return *this;
+#  endif
+  }
+#endif
+};
+
+// Table entry type for divisibility test used internally
+template <typename T> struct FMT_EXTERN_TEMPLATE_API divtest_table_entry {
+  T mod_inv;
+  T max_quotient;
+};
+
 // Static data is placed in this class template for the header-only config.
 template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
   static const uint64_t powers_of_10_64[];
   static const uint32_t zero_or_powers_of_10_32[];
   static const uint64_t zero_or_powers_of_10_64[];
-  static const uint64_t pow10_significands[];
-  static const int16_t pow10_exponents[];
+  static const uint64_t grisu_pow10_significands[];
+  static const int16_t grisu_pow10_exponents[];
+  static const divtest_table_entry<uint32_t> divtest_table_for_pow5_32[];
+  static const divtest_table_entry<uint64_t> divtest_table_for_pow5_64[];
+  static const uint64_t dragonbox_pow10_significands_64[];
+  static const uint128_wrapper dragonbox_pow10_significands_128[];
+  // log10(2) = 0x0.4d104d427de7fbcc...
+  static const uint64_t log10_2_significand = 0x4d104d427de7fbcc;
+#if !FMT_USE_FULL_CACHE_DRAGONBOX
+  static const uint64_t powers_of_5_64[];
+  static const uint32_t dragonbox_pow10_recovery_errors[];
+#endif
   // GCC generates slightly better code for pairs than chars.
   using digit_pair = char[2];
   static const digit_pair digits[];
@@ -780,6 +893,17 @@ template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
   static const char right_padding_shifts[5];
 };
 
+// Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)).
+// This is a function instead of an array to work around a bug in GCC10 (#1810).
+FMT_INLINE uint16_t bsr2log10(int bsr) {
+  static constexpr uint16_t data[] = {
+      1,  1,  1,  2,  2,  2,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,
+      6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  9,  9,  9,  10, 10, 10,
+      10, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 15, 15,
+      15, 16, 16, 16, 16, 17, 17, 17, 18, 18, 18, 19, 19, 19, 19, 20};
+  return data[bsr];
+}
+
 #ifndef FMT_EXPORTED
 FMT_EXTERN template struct basic_data<void>;
 #endif
@@ -791,10 +915,9 @@ struct data : basic_data<> {};
 // Returns the number of decimal digits in n. Leading zeros are not counted
 // except for n == 0 in which case count_digits returns 1.
 inline int count_digits(uint64_t n) {
-  // Based on http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10
-  // and the benchmark https://github.com/localvoid/cxx-benchmark-count-digits.
-  int t = (64 - FMT_BUILTIN_CLZLL(n | 1)) * 1233 >> 12;
-  return t - (n < data::zero_or_powers_of_10_64[t]) + 1;
+  // https://github.com/fmtlib/format-benchmark/blob/master/digits10
+  auto t = bsr2log10(FMT_BUILTIN_CLZLL(n | 1) ^ 63);
+  return t - (n < data::zero_or_powers_of_10_64[t]);
 }
 #else
 // Fallback version of count_digits used when __builtin_clz is not available.
@@ -844,15 +967,24 @@ template <> int count_digits<4>(detail::fallback_uintptr n);
 
 #if FMT_GCC_VERSION || FMT_CLANG_VERSION
 #  define FMT_ALWAYS_INLINE inline __attribute__((always_inline))
+#elif FMT_MSC_VER
+#  define FMT_ALWAYS_INLINE __forceinline
 #else
-#  define FMT_ALWAYS_INLINE
+#  define FMT_ALWAYS_INLINE inline
+#endif
+
+// To suppress unnecessary security cookie checks
+#if FMT_MSC_VER && !FMT_CLANG_VERSION
+#  define FMT_SAFEBUFFERS __declspec(safebuffers)
+#else
+#  define FMT_SAFEBUFFERS
 #endif
 
 #ifdef FMT_BUILTIN_CLZ
 // Optional version of count_digits for better performance on 32-bit platforms.
 inline int count_digits(uint32_t n) {
-  int t = (32 - FMT_BUILTIN_CLZ(n | 1)) * 1233 >> 12;
-  return t - (n < data::zero_or_powers_of_10_32[t]) + 1;
+  auto t = bsr2log10(FMT_BUILTIN_CLZ(n | 1) ^ 31);
+  return t - (n < data::zero_or_powers_of_10_32[t]);
 }
 #endif
 
@@ -899,7 +1031,7 @@ template <typename Char> void copy2(Char* dst, const char* src) {
   *dst++ = static_cast<Char>(*src++);
   *dst = static_cast<Char>(*src);
 }
-inline void copy2(char* dst, const char* src) { memcpy(dst, src, 2); }
+FMT_INLINE void copy2(char* dst, const char* src) { memcpy(dst, src, 2); }
 
 template <typename Iterator> struct format_decimal_result {
   Iterator begin;
@@ -935,11 +1067,10 @@ inline format_decimal_result<Char*> format_decimal(Char* out, UInt value,
 template <typename Char, typename UInt, typename Iterator,
           FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<Iterator>>::value)>
 inline format_decimal_result<Iterator> format_decimal(Iterator out, UInt value,
-                                                      int num_digits) {
-  // Buffer should be large enough to hold all digits (<= digits10 + 1).
-  enum { max_size = digits10<UInt>() + 1 };
-  Char buffer[2 * max_size];
-  auto end = format_decimal(buffer, value, num_digits).end;
+                                                      int size) {
+  // Buffer is large enough to hold all digits (digits10 + 1).
+  Char buffer[digits10<UInt>() + 1];
+  auto end = format_decimal(buffer, value, size).end;
   return {out, detail::copy_str<Char>(buffer, end, out)};
 }
 
@@ -981,6 +1112,10 @@ Char* format_uint(Char* buffer, detail::fallback_uintptr n, int num_digits,
 
 template <unsigned BASE_BITS, typename Char, typename It, typename UInt>
 inline It format_uint(It out, UInt value, int num_digits, bool upper = false) {
+  if (auto ptr = to_pointer<Char>(out, to_unsigned(num_digits))) {
+    format_uint<BASE_BITS>(ptr, value, num_digits, upper);
+    return out;
+  }
   // Buffer should be large enough to hold all digits (digits / BASE_BITS + 1).
   char buffer[num_bits<UInt>() / BASE_BITS + 1];
   format_uint<BASE_BITS>(buffer, value, num_digits, upper);
@@ -1072,6 +1207,78 @@ template <typename Char> struct basic_format_specs {
 using format_specs = basic_format_specs<char>;
 
 namespace detail {
+namespace dragonbox {
+
+// Type-specific information that Dragonbox uses.
+template <class T> struct float_info;
+
+template <> struct float_info<float> {
+  using carrier_uint = uint32_t;
+  static const int significand_bits = 23;
+  static const int exponent_bits = 8;
+  static const int min_exponent = -126;
+  static const int max_exponent = 127;
+  static const int exponent_bias = -127;
+  static const int decimal_digits = 9;
+  static const int kappa = 1;
+  static const int big_divisor = 100;
+  static const int small_divisor = 10;
+  static const int min_k = -31;
+  static const int max_k = 46;
+  static const int cache_bits = 64;
+  static const int divisibility_check_by_5_threshold = 39;
+  static const int case_fc_pm_half_lower_threshold = -1;
+  static const int case_fc_pm_half_upper_threshold = 6;
+  static const int case_fc_lower_threshold = -2;
+  static const int case_fc_upper_threshold = 6;
+  static const int case_shorter_interval_left_endpoint_lower_threshold = 2;
+  static const int case_shorter_interval_left_endpoint_upper_threshold = 3;
+  static const int shorter_interval_tie_lower_threshold = -35;
+  static const int shorter_interval_tie_upper_threshold = -35;
+  static const int max_trailing_zeros = 7;
+};
+
+template <> struct float_info<double> {
+  using carrier_uint = uint64_t;
+  static const int significand_bits = 52;
+  static const int exponent_bits = 11;
+  static const int min_exponent = -1022;
+  static const int max_exponent = 1023;
+  static const int exponent_bias = -1023;
+  static const int decimal_digits = 17;
+  static const int kappa = 2;
+  static const int big_divisor = 1000;
+  static const int small_divisor = 100;
+  static const int min_k = -292;
+  static const int max_k = 326;
+  static const int cache_bits = 128;
+  static const int divisibility_check_by_5_threshold = 86;
+  static const int case_fc_pm_half_lower_threshold = -2;
+  static const int case_fc_pm_half_upper_threshold = 9;
+  static const int case_fc_lower_threshold = -4;
+  static const int case_fc_upper_threshold = 9;
+  static const int case_shorter_interval_left_endpoint_lower_threshold = 2;
+  static const int case_shorter_interval_left_endpoint_upper_threshold = 3;
+  static const int shorter_interval_tie_lower_threshold = -77;
+  static const int shorter_interval_tie_upper_threshold = -77;
+  static const int max_trailing_zeros = 16;
+};
+
+template <typename T> struct decimal_fp {
+  using significand_type = typename float_info<T>::carrier_uint;
+  significand_type significand;
+  int exponent;
+};
+
+template <typename T> decimal_fp<T> to_decimal(T x) FMT_NOEXCEPT;
+}  // namespace dragonbox
+
+template <typename T>
+constexpr typename dragonbox::float_info<T>::carrier_uint exponent_mask() {
+  using uint = typename dragonbox::float_info<T>::carrier_uint;
+  return ((uint(1) << dragonbox::float_info<T>::exponent_bits) - 1)
+         << dragonbox::float_info<T>::significand_bits;
+}
 
 // A floating-point presentation format.
 enum class float_format : unsigned char {
@@ -1113,113 +1320,6 @@ template <typename Char, typename It> It write_exponent(int exp, It it) {
   return it;
 }
 
-template <typename Char> class float_writer {
- private:
-  // The number is given as v = digits_ * pow(10, exp_).
-  const char* digits_;
-  int num_digits_;
-  int exp_;
-  size_t size_;
-  float_specs specs_;
-  Char decimal_point_;
-
-  template <typename It> It prettify(It it) const {
-    // pow(10, full_exp - 1) <= v <= pow(10, full_exp).
-    int full_exp = num_digits_ + exp_;
-    if (specs_.format == float_format::exp) {
-      // Insert a decimal point after the first digit and add an exponent.
-      *it++ = static_cast<Char>(*digits_);
-      int num_zeros = specs_.precision - num_digits_;
-      if (num_digits_ > 1 || specs_.showpoint) *it++ = decimal_point_;
-      it = copy_str<Char>(digits_ + 1, digits_ + num_digits_, it);
-      if (num_zeros > 0 && specs_.showpoint)
-        it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-      *it++ = static_cast<Char>(specs_.upper ? 'E' : 'e');
-      return write_exponent<Char>(full_exp - 1, it);
-    }
-    if (num_digits_ <= full_exp) {
-      // 1234e7 -> 12340000000[.0+]
-      it = copy_str<Char>(digits_, digits_ + num_digits_, it);
-      it = std::fill_n(it, full_exp - num_digits_, static_cast<Char>('0'));
-      if (specs_.showpoint || specs_.precision < 0) {
-        *it++ = decimal_point_;
-        int num_zeros = specs_.precision - full_exp;
-        if (num_zeros <= 0) {
-          if (specs_.format != float_format::fixed)
-            *it++ = static_cast<Char>('0');
-          return it;
-        }
-#ifdef FMT_FUZZ
-        if (num_zeros > 5000)
-          throw std::runtime_error("fuzz mode - avoiding excessive cpu use");
-#endif
-        it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-      }
-    } else if (full_exp > 0) {
-      // 1234e-2 -> 12.34[0+]
-      it = copy_str<Char>(digits_, digits_ + full_exp, it);
-      if (!specs_.showpoint) {
-        // Remove trailing zeros.
-        int num_digits = num_digits_;
-        while (num_digits > full_exp && digits_[num_digits - 1] == '0')
-          --num_digits;
-        if (num_digits != full_exp) *it++ = decimal_point_;
-        return copy_str<Char>(digits_ + full_exp, digits_ + num_digits, it);
-      }
-      *it++ = decimal_point_;
-      it = copy_str<Char>(digits_ + full_exp, digits_ + num_digits_, it);
-      if (specs_.precision > num_digits_) {
-        // Add trailing zeros.
-        int num_zeros = specs_.precision - num_digits_;
-        it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-      }
-    } else {
-      // 1234e-6 -> 0.001234
-      *it++ = static_cast<Char>('0');
-      int num_zeros = -full_exp;
-      int num_digits = num_digits_;
-      if (num_digits == 0 && specs_.precision >= 0 &&
-          specs_.precision < num_zeros) {
-        num_zeros = specs_.precision;
-      }
-      // Remove trailing zeros.
-      if (!specs_.showpoint)
-        while (num_digits > 0 && digits_[num_digits - 1] == '0') --num_digits;
-      if (num_zeros != 0 || num_digits != 0 || specs_.showpoint) {
-        *it++ = decimal_point_;
-        it = std::fill_n(it, num_zeros, static_cast<Char>('0'));
-        it = copy_str<Char>(digits_, digits_ + num_digits, it);
-      }
-    }
-    return it;
-  }
-
- public:
-  float_writer(const char* digits, int num_digits, int exp, float_specs specs,
-               Char decimal_point)
-      : digits_(digits),
-        num_digits_(num_digits),
-        exp_(exp),
-        specs_(specs),
-        decimal_point_(decimal_point) {
-    int full_exp = num_digits + exp - 1;
-    int precision = specs.precision > 0 ? specs.precision : 16;
-    if (specs_.format == float_format::general &&
-        !(full_exp >= -4 && full_exp < precision)) {
-      specs_.format = float_format::exp;
-    }
-    size_ = prettify(counting_iterator()).count();
-    size_ += specs.sign ? 1 : 0;
-  }
-
-  size_t size() const { return size_; }
-
-  template <typename It> It operator()(It it) const {
-    if (specs_.sign) *it++ = static_cast<Char>(data::signs[specs_.sign]);
-    return prettify(it);
-  }
-};
-
 template <typename T>
 int format_float(T value, int precision, float_specs specs, buffer<char>& buf);
 
@@ -1398,7 +1498,7 @@ template <align::type align = align::left, typename OutputIt, typename Char,
           typename F>
 inline OutputIt write_padded(OutputIt out,
                              const basic_format_specs<Char>& specs, size_t size,
-                             size_t width, const F& f) {
+                             size_t width, F&& f) {
   static_assert(align == align::left || align == align::right, "");
   unsigned spec_width = to_unsigned(specs.width);
   size_t padding = spec_width > width ? spec_width - width : 0;
@@ -1416,7 +1516,7 @@ template <align::type align = align::left, typename OutputIt, typename Char,
           typename F>
 inline OutputIt write_padded(OutputIt out,
                              const basic_format_specs<Char>& specs, size_t size,
-                             const F& f) {
+                             F&& f) {
   return write_padded<align>(out, specs, size, size, f);
 }
 
@@ -1583,15 +1683,16 @@ template <typename OutputIt, typename Char, typename UInt> struct int_writer {
     char digits[40];
     format_decimal(digits, abs_value, num_digits);
     basic_memory_buffer<Char> buffer;
-    size += prefix_size;
-    buffer.resize(size);
+    size += static_cast<int>(prefix_size);
+    const auto usize = to_unsigned(size);
+    buffer.resize(usize);
     basic_string_view<Char> s(&sep, sep_size);
     // Index of a decimal digit with the least significant digit having index 0.
     int digit_index = 0;
     group = groups.cbegin();
-    auto p = buffer.data() + size;
-    for (int i = num_digits - 1; i >= 0; --i) {
-      *--p = static_cast<Char>(digits[i]);
+    auto p = buffer.data() + size - 1;
+    for (int i = num_digits - 1; i > 0; --i) {
+      *p-- = static_cast<Char>(digits[i]);
       if (*group <= 0 || ++digit_index % *group != 0 ||
           *group == max_value<char>())
         continue;
@@ -1599,16 +1700,16 @@ template <typename OutputIt, typename Char, typename UInt> struct int_writer {
         digit_index = 0;
         ++group;
       }
-      p -= s.size();
       std::uninitialized_copy(s.data(), s.data() + s.size(),
                               make_checked(p, s.size()));
+      p -= s.size();
     }
-    if (prefix_size != 0) p[-1] = static_cast<Char>('-');
-    using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+    *p-- = static_cast<Char>(*digits);
+    if (prefix_size != 0) *p = static_cast<Char>('-');
     auto data = buffer.data();
-    out = write_padded<align::right>(out, specs, size, size, [=](iterator it) {
-      return copy_str<Char>(data, data + size, it);
-    });
+    out = write_padded<align::right>(
+        out, specs, usize, usize,
+        [=](iterator it) { return copy_str<Char>(data, data + size, it); });
   }
 
   void on_chr() { *out++ = static_cast<Char>(abs_value); }
@@ -1634,6 +1735,168 @@ OutputIt write_nonfinite(OutputIt out, bool isinf,
   });
 }
 
+// A decimal floating-point number significand * pow(10, exp).
+struct big_decimal_fp {
+  const char* significand;
+  int significand_size;
+  int exponent;
+};
+
+inline int get_significand_size(const big_decimal_fp& fp) {
+  return fp.significand_size;
+}
+template <typename T>
+inline int get_significand_size(const dragonbox::decimal_fp<T>& fp) {
+  return count_digits(fp.significand);
+}
+
+template <typename Char, typename OutputIt>
+inline OutputIt write_significand(OutputIt out, const char* significand,
+                                  int& significand_size) {
+  return copy_str<Char>(significand, significand + significand_size, out);
+}
+template <typename Char, typename OutputIt, typename UInt>
+inline OutputIt write_significand(OutputIt out, UInt significand,
+                                  int significand_size) {
+  return format_decimal<Char>(out, significand, significand_size).end;
+}
+
+template <typename Char, typename UInt,
+          FMT_ENABLE_IF(std::is_integral<UInt>::value)>
+inline Char* write_significand(Char* out, UInt significand,
+                               int significand_size, int integral_size,
+                               Char decimal_point) {
+  if (!decimal_point)
+    return format_decimal(out, significand, significand_size).end;
+  auto end = format_decimal(out + 1, significand, significand_size).end;
+  if (integral_size == 1)
+    out[0] = out[1];
+  else
+    std::copy_n(out + 1, integral_size, out);
+  out[integral_size] = decimal_point;
+  return end;
+}
+
+template <typename OutputIt, typename UInt, typename Char,
+          FMT_ENABLE_IF(!std::is_pointer<remove_cvref_t<OutputIt>>::value)>
+inline OutputIt write_significand(OutputIt out, UInt significand,
+                                  int significand_size, int integral_size,
+                                  Char decimal_point) {
+  // Buffer is large enough to hold digits (digits10 + 1) and a decimal point.
+  Char buffer[digits10<UInt>() + 2];
+  auto end = write_significand(buffer, significand, significand_size,
+                               integral_size, decimal_point);
+  return detail::copy_str<Char>(buffer, end, out);
+}
+
+template <typename OutputIt, typename Char>
+inline OutputIt write_significand(OutputIt out, const char* significand,
+                                  int significand_size, int integral_size,
+                                  Char decimal_point) {
+  out = detail::copy_str<Char>(significand, significand + integral_size, out);
+  if (!decimal_point) return out;
+  *out++ = decimal_point;
+  return detail::copy_str<Char>(significand + integral_size,
+                                significand + significand_size, out);
+}
+
+template <typename OutputIt, typename DecimalFP, typename Char>
+OutputIt write_float(OutputIt out, const DecimalFP& fp,
+                     const basic_format_specs<Char>& specs, float_specs fspecs,
+                     Char decimal_point) {
+  auto significand = fp.significand;
+  int significand_size = get_significand_size(fp);
+  static const Char zero = static_cast<Char>('0');
+  auto sign = fspecs.sign;
+  size_t size = to_unsigned(significand_size) + (sign ? 1 : 0);
+  using iterator = remove_reference_t<decltype(reserve(out, 0))>;
+
+  int output_exp = fp.exponent + significand_size - 1;
+  auto use_exp_format = [=]() {
+    if (fspecs.format == float_format::exp) return true;
+    if (fspecs.format != float_format::general) return false;
+    // Use the fixed notation if the exponent is in [exp_lower, exp_upper),
+    // e.g. 0.0001 instead of 1e-04. Otherwise use the exponent notation.
+    const int exp_lower = -4, exp_upper = 16;
+    return output_exp < exp_lower ||
+           output_exp >= (fspecs.precision > 0 ? fspecs.precision : exp_upper);
+  };
+  if (use_exp_format()) {
+    int num_zeros = 0;
+    if (fspecs.showpoint) {
+      num_zeros = (std::max)(fspecs.precision - significand_size, 0);
+      size += to_unsigned(num_zeros);
+    } else if (significand_size == 1) {
+      decimal_point = Char();
+    }
+    auto abs_output_exp = output_exp >= 0 ? output_exp : -output_exp;
+    int exp_digits = 2;
+    if (abs_output_exp >= 100) exp_digits = abs_output_exp >= 1000 ? 4 : 3;
+
+    size += to_unsigned((decimal_point ? 1 : 0) + 2 + exp_digits);
+    char exp_char = fspecs.upper ? 'E' : 'e';
+    auto write = [=](iterator it) {
+      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      // Insert a decimal point after the first digit and add an exponent.
+      it = write_significand(it, significand, significand_size, 1,
+                             decimal_point);
+      if (num_zeros > 0) it = std::fill_n(it, num_zeros, zero);
+      *it++ = static_cast<Char>(exp_char);
+      return write_exponent<Char>(output_exp, it);
+    };
+    return specs.width > 0 ? write_padded<align::right>(out, specs, size, write)
+                           : base_iterator(out, write(reserve(out, size)));
+  }
+
+  int exp = fp.exponent + significand_size;
+  if (fp.exponent >= 0) {
+    // 1234e5 -> 123400000[.0+]
+    size += to_unsigned(fp.exponent);
+    int num_zeros = fspecs.precision - exp;
+#ifdef FMT_FUZZ
+    if (num_zeros > 5000)
+      throw std::runtime_error("fuzz mode - avoiding excessive cpu use");
+#endif
+    if (fspecs.showpoint) {
+      if (num_zeros <= 0 && fspecs.format != float_format::fixed) num_zeros = 1;
+      if (num_zeros > 0) size += to_unsigned(num_zeros);
+    }
+    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      it = write_significand<Char>(it, significand, significand_size);
+      it = std::fill_n(it, fp.exponent, zero);
+      if (!fspecs.showpoint) return it;
+      *it++ = decimal_point;
+      return num_zeros > 0 ? std::fill_n(it, num_zeros, zero) : it;
+    });
+  } else if (exp > 0) {
+    // 1234e-2 -> 12.34[0+]
+    int num_zeros = fspecs.showpoint ? fspecs.precision - significand_size : 0;
+    size += 1 + to_unsigned(num_zeros > 0 ? num_zeros : 0);
+    return write_padded<align::right>(out, specs, size, [&](iterator it) {
+      if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+      it = write_significand(it, significand, significand_size, exp,
+                             decimal_point);
+      return num_zeros > 0 ? std::fill_n(it, num_zeros, zero) : it;
+    });
+  }
+  // 1234e-6 -> 0.001234
+  int num_zeros = -exp;
+  if (significand_size == 0 && fspecs.precision >= 0 &&
+      fspecs.precision < num_zeros) {
+    num_zeros = fspecs.precision;
+  }
+  size += 2 + to_unsigned(num_zeros);
+  return write_padded<align::right>(out, specs, size, [&](iterator it) {
+    if (sign) *it++ = static_cast<Char>(data::signs[sign]);
+    *it++ = zero;
+    if (num_zeros == 0 && significand_size == 0 && !fspecs.showpoint) return it;
+    *it++ = decimal_point;
+    it = std::fill_n(it, num_zeros, zero);
+    return write_significand<Char>(it, significand, significand_size);
+  });
+}
+
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(std::is_floating_point<T>::value)>
 OutputIt write(OutputIt out, T value, basic_format_specs<Char> specs,
@@ -1673,39 +1936,45 @@ OutputIt write(OutputIt out, T value, basic_format_specs<Char> specs,
       ++precision;
   }
   if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
-  fspecs.use_grisu = use_grisu<T>();
+  fspecs.use_grisu = is_fast_float<T>();
   int exp = format_float(promote_float(value), precision, fspecs, buffer);
   fspecs.precision = precision;
   Char point =
       fspecs.locale ? decimal_point<Char>(loc) : static_cast<Char>('.');
-  float_writer<Char> w(buffer.data(), static_cast<int>(buffer.size()), exp,
-                       fspecs, point);
-  return write_padded<align::right>(out, specs, w.size(), w);
+  auto fp = big_decimal_fp{buffer.data(), static_cast<int>(buffer.size()), exp};
+  return write_float(out, fp, specs, fspecs, point);
 }
 
 template <typename Char, typename OutputIt, typename T,
-          FMT_ENABLE_IF(std::is_floating_point<T>::value)>
+          FMT_ENABLE_IF(is_fast_float<T>::value)>
 OutputIt write(OutputIt out, T value) {
   if (const_check(!is_supported_floating_point(value))) return out;
+
+  using floaty = conditional_t<std::is_same<T, long double>::value, double, T>;
+  using uint = typename dragonbox::float_info<floaty>::carrier_uint;
+  auto bits = bit_cast<uint>(value);
+
   auto fspecs = float_specs();
-  if (std::signbit(value)) {  // value < 0 is false for NaN so use signbit.
+  auto sign_bit = bits & (uint(1) << (num_bits<uint>() - 1));
+  if (sign_bit != 0) {
     fspecs.sign = sign::minus;
     value = -value;
   }
 
-  auto specs = basic_format_specs<Char>();
-  if (!std::isfinite(value))
+  static const auto specs = basic_format_specs<Char>();
+  uint mask = exponent_mask<floaty>();
+  if ((bits & mask) == mask)
     return write_nonfinite(out, std::isinf(value), specs, fspecs);
 
-  memory_buffer buffer;
-  int precision = -1;
-  if (const_check(std::is_same<T, float>())) fspecs.binary32 = true;
-  fspecs.use_grisu = use_grisu<T>();
-  int exp = format_float(promote_float(value), precision, fspecs, buffer);
-  fspecs.precision = precision;
-  float_writer<Char> w(buffer.data(), static_cast<int>(buffer.size()), exp,
-                       fspecs, static_cast<Char>('.'));
-  return base_iterator(out, w(reserve(out, w.size())));
+  auto dec = dragonbox::to_decimal(static_cast<floaty>(value));
+  return write_float(out, dec, specs, fspecs, static_cast<Char>('.'));
+}
+
+template <typename Char, typename OutputIt, typename T,
+          FMT_ENABLE_IF(std::is_floating_point<T>::value &&
+                        !is_fast_float<T>::value)>
+inline OutputIt write(OutputIt out, T value) {
+  return write(out, value, basic_format_specs<Char>());
 }
 
 template <typename Char, typename OutputIt>
@@ -1758,6 +2027,13 @@ OutputIt write(OutputIt out, basic_string_view<Char> value) {
   return base_iterator(out, it);
 }
 
+template <typename Char>
+buffer_appender<Char> write(buffer_appender<Char> out,
+                            basic_string_view<Char> value) {
+  get_container(out).append(value.begin(), value.end());
+  return out;
+}
+
 template <typename Char, typename OutputIt, typename T,
           FMT_ENABLE_IF(is_integral<T>::value &&
                         !std::is_same<T, bool>::value &&
@@ -1768,7 +2044,13 @@ OutputIt write(OutputIt out, T value) {
   // Don't do -abs_value since it trips unsigned-integer-overflow sanitizer.
   if (negative) abs_value = ~abs_value + 1;
   int num_digits = count_digits(abs_value);
-  auto it = reserve(out, (negative ? 1 : 0) + static_cast<size_t>(num_digits));
+  auto size = (negative ? 1 : 0) + static_cast<size_t>(num_digits);
+  auto it = reserve(out, size);
+  if (auto ptr = to_pointer<Char>(it, size)) {
+    if (negative) *ptr++ = static_cast<Char>('-');
+    format_decimal<Char>(ptr, abs_value, num_digits);
+    return out;
+  }
   if (negative) *it++ = static_cast<Char>('-');
   it = format_decimal<Char>(it, abs_value, num_digits).end;
   return base_iterator(out, it);
@@ -1807,8 +2089,13 @@ auto write(OutputIt out, const T& value) -> typename std::enable_if<
     mapped_type_constant<T, basic_format_context<OutputIt, Char>>::value ==
         type::custom_type,
     OutputIt>::type {
-  basic_format_context<OutputIt, Char> ctx(out, {}, {});
-  return formatter<T>().format(value, ctx);
+  using context_type = basic_format_context<OutputIt, Char>;
+  using formatter_type =
+      conditional_t<has_formatter<T, context_type>::value,
+                    typename context_type::template formatter_type<T>,
+                    fallback_formatter<T, Char>>;
+  context_type ctx(out, {}, {});
+  return formatter_type().format(value, ctx);
 }
 
 // An argument visitor that formats the argument and writes it via the output
@@ -2014,6 +2301,48 @@ class arg_formatter_base {
   }
 };
 
+/** The default argument formatter. */
+template <typename OutputIt, typename Char>
+class arg_formatter : public arg_formatter_base<OutputIt, Char> {
+ private:
+  using char_type = Char;
+  using base = arg_formatter_base<OutputIt, Char>;
+  using context_type = basic_format_context<OutputIt, Char>;
+
+  context_type& ctx_;
+  basic_format_parse_context<char_type>* parse_ctx_;
+  const Char* ptr_;
+
+ public:
+  using iterator = typename base::iterator;
+  using format_specs = typename base::format_specs;
+
+  /**
+    \rst
+    Constructs an argument formatter object.
+    *ctx* is a reference to the formatting context,
+    *specs* contains format specifier information for standard argument types.
+    \endrst
+   */
+  explicit arg_formatter(
+      context_type& ctx,
+      basic_format_parse_context<char_type>* parse_ctx = nullptr,
+      format_specs* specs = nullptr, const Char* ptr = nullptr)
+      : base(ctx.out(), specs, ctx.locale()),
+        ctx_(ctx),
+        parse_ctx_(parse_ctx),
+        ptr_(ptr) {}
+
+  using base::operator();
+
+  /** Formats an argument of a user-defined type. */
+  iterator operator()(typename basic_format_arg<context_type>::handle handle) {
+    if (ptr_) advance_to(*parse_ctx_, ptr_);
+    handle.format(*parse_ctx_, ctx_);
+    return ctx_.out();
+  }
+};
+
 template <typename Char> FMT_CONSTEXPR bool is_name_start(Char c) {
   return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || '_' == c;
 }
@@ -2053,12 +2382,11 @@ template <typename Context> class custom_formatter {
                             Context& ctx)
       : parse_ctx_(parse_ctx), ctx_(ctx) {}
 
-  bool operator()(typename basic_format_arg<Context>::handle h) const {
+  void operator()(typename basic_format_arg<Context>::handle h) const {
     h.format(parse_ctx_, ctx_);
-    return true;
   }
 
-  template <typename T> bool operator()(T) const { return false; }
+  template <typename T> void operator()(T) const {}
 };
 
 template <typename T>
@@ -2440,12 +2768,30 @@ template <typename SpecHandler, typename Char> struct precision_adapter {
 };
 
 template <typename Char>
-FMT_CONSTEXPR const Char* next_code_point(const Char* begin, const Char* end) {
-  if (const_check(sizeof(Char) != 1) || (*begin & 0x80) == 0) return begin + 1;
-  do {
-    ++begin;
-  } while (begin != end && (*begin & 0xc0) == 0x80);
-  return begin;
+FMT_CONSTEXPR int code_point_length(const Char* begin) {
+  if (const_check(sizeof(Char) != 1)) return 1;
+  constexpr char lengths[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+                              0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 3, 3, 4, 0};
+  int len = lengths[static_cast<unsigned char>(*begin) >> 3];
+
+  // Lead bytes with no valid length (continuation bytes 0x80-0xbf and
+  // 0xf8-0xff) map to 0 in the table above. len + !len turns that 0 into 1,
+  // so the returned length is always at least 1.
+  return len + !len;
+}
+
+template <typename Char> constexpr bool is_ascii_letter(Char c) {
+  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+}
+
+// Converts a character to ASCII. Returns a number > 127 on conversion failure.
+template <typename Char, FMT_ENABLE_IF(std::is_integral<Char>::value)>
+constexpr Char to_ascii(Char value) {
+  return value;
+}
+template <typename Char, FMT_ENABLE_IF(std::is_enum<Char>::value)>
+constexpr typename std::underlying_type<Char>::type to_ascii(Char value) {
+  return value;
 }
 
 // Parses fill and alignment.
@@ -2454,10 +2800,10 @@ FMT_CONSTEXPR const Char* parse_align(const Char* begin, const Char* end,
                                       Handler&& handler) {
   FMT_ASSERT(begin != end, "");
   auto align = align::none;
-  auto p = next_code_point(begin, end);
-  if (p == end) p = begin;
+  auto p = begin + code_point_length(begin);
+  if (p >= end) p = begin;
   for (;;) {
-    switch (static_cast<char>(*p)) {
+    switch (to_ascii(*p)) {
     case '<':
       align = align::left;
       break;
@@ -2536,13 +2882,13 @@ FMT_CONSTEXPR const Char* parse_precision(const Char* begin, const Char* end,
 template <typename Char, typename SpecHandler>
 FMT_CONSTEXPR const Char* parse_format_specs(const Char* begin, const Char* end,
                                              SpecHandler&& handler) {
-  if (begin == end || *begin == '}') return begin;
+  if (begin == end) return begin;
 
   begin = parse_align(begin, end, handler);
   if (begin == end) return begin;
 
   // Parse sign.
-  switch (static_cast<char>(*begin)) {
+  switch (to_ascii(*begin)) {
   case '+':
     handler.on_plus();
     ++begin;
@@ -2619,7 +2965,7 @@ FMT_CONSTEXPR const Char* parse_replacement_field(const Char* begin,
                                                   Handler&& handler) {
   ++begin;
   if (begin == end) return handler.on_error("invalid format string"), end;
-  if (static_cast<char>(*begin) == '}') {
+  if (*begin == '}') {
     handler.on_replacement_field(handler.on_arg_id(), begin);
   } else if (*begin == '{') {
     handler.on_text(begin, begin + 1);
@@ -2664,17 +3010,17 @@ FMT_CONSTEXPR_DECL FMT_INLINE void parse_format_string(
     return;
   }
   struct writer {
-    FMT_CONSTEXPR void operator()(const Char* begin, const Char* end) {
-      if (begin == end) return;
+    FMT_CONSTEXPR void operator()(const Char* pbegin, const Char* pend) {
+      if (pbegin == pend) return;
       for (;;) {
         const Char* p = nullptr;
-        if (!find<IS_CONSTEXPR>(begin, end, '}', p))
-          return handler_.on_text(begin, end);
+        if (!find<IS_CONSTEXPR>(pbegin, pend, '}', p))
+          return handler_.on_text(pbegin, pend);
         ++p;
-        if (p == end || *p != '}')
+        if (p == pend || *p != '}')
           return handler_.on_error("unmatched '}' in format string");
-        handler_.on_text(begin, p);
-        begin = p + 1;
+        handler_.on_text(pbegin, p);
+        pbegin = p + 1;
       }
     }
     Handler& handler_;
@@ -2705,12 +3051,12 @@ FMT_CONSTEXPR const typename ParseContext::char_type* parse_format_specs(
   return f.parse(ctx);
 }
 
-template <typename ArgFormatter, typename Char, typename Context>
+template <typename OutputIt, typename Char, typename Context>
 struct format_handler : detail::error_handler {
   basic_format_parse_context<Char> parse_context;
   Context context;
 
-  format_handler(typename ArgFormatter::iterator out,
+  format_handler(OutputIt out,
                  basic_string_view<Char> str,
                  basic_format_args<Context> format_args, detail::locale_ref loc)
       : parse_context(str), context(out, format_args, loc) {}
@@ -2734,26 +3080,33 @@ struct format_handler : detail::error_handler {
   FMT_INLINE void on_replacement_field(int id, const Char*) {
     auto arg = get_arg(context, id);
     context.advance_to(visit_format_arg(
-        default_arg_formatter<typename ArgFormatter::iterator, Char>{
+        default_arg_formatter<OutputIt, Char>{
             context.out(), context.args(), context.locale()},
         arg));
   }
 
   const Char* on_format_specs(int id, const Char* begin, const Char* end) {
-    advance_to(parse_context, begin);
     auto arg = get_arg(context, id);
-    custom_formatter<Context> f(parse_context, context);
-    if (visit_format_arg(f, arg)) return parse_context.begin();
-    basic_format_specs<Char> specs;
-    using parse_context_t = basic_format_parse_context<Char>;
-    specs_checker<specs_handler<parse_context_t, Context>> handler(
-        specs_handler<parse_context_t, Context>(specs, parse_context, context),
-        arg.type());
-    begin = parse_format_specs(begin, end, handler);
-    if (begin == end || *begin != '}') on_error("missing '}' in format string");
-    advance_to(parse_context, begin);
+    if (arg.type() == type::custom_type) {
+      advance_to(parse_context, begin);
+      visit_format_arg(custom_formatter<Context>(parse_context, context), arg);
+      return parse_context.begin();
+    }
+    auto specs = basic_format_specs<Char>();
+    if (begin + 1 < end && begin[1] == '}' && is_ascii_letter(*begin)) {
+      specs.type = static_cast<char>(*begin++);
+    } else {
+      using parse_context_t = basic_format_parse_context<Char>;
+      specs_checker<specs_handler<parse_context_t, Context>> handler(
+          specs_handler<parse_context_t, Context>(specs, parse_context,
+                                                  context),
+          arg.type());
+      begin = parse_format_specs(begin, end, handler);
+      if (begin == end || *begin != '}')
+        on_error("missing '}' in format string");
+    }
     context.advance_to(
-        visit_format_arg(ArgFormatter(context, &parse_context, &specs), arg));
+        visit_format_arg(arg_formatter<OutputIt, Char>(context, &parse_context, &specs), arg));
     return begin;
   }
 };
@@ -2905,53 +3258,11 @@ FMT_API void format_error_code(buffer<char>& out, int error_code,
 
 FMT_API void report_error(format_func func, int error_code,
                           string_view message) FMT_NOEXCEPT;
-
-/** The default argument formatter. */
-template <typename OutputIt, typename Char>
-class arg_formatter : public arg_formatter_base<OutputIt, Char> {
- private:
-  using char_type = Char;
-  using base = arg_formatter_base<OutputIt, Char>;
-  using context_type = basic_format_context<OutputIt, Char>;
-
-  context_type& ctx_;
-  basic_format_parse_context<char_type>* parse_ctx_;
-  const Char* ptr_;
-
- public:
-  using iterator = typename base::iterator;
-  using format_specs = typename base::format_specs;
-
-  /**
-    \rst
-    Constructs an argument formatter object.
-    *ctx* is a reference to the formatting context,
-    *specs* contains format specifier information for standard argument types.
-    \endrst
-   */
-  explicit arg_formatter(
-      context_type& ctx,
-      basic_format_parse_context<char_type>* parse_ctx = nullptr,
-      format_specs* specs = nullptr, const Char* ptr = nullptr)
-      : base(ctx.out(), specs, ctx.locale()),
-        ctx_(ctx),
-        parse_ctx_(parse_ctx),
-        ptr_(ptr) {}
-
-  using base::operator();
-
-  /** Formats an argument of a user-defined type. */
-  iterator operator()(typename basic_format_arg<context_type>::handle handle) {
-    if (ptr_) advance_to(*parse_ctx_, ptr_);
-    handle.format(*parse_ctx_, ctx_);
-    return ctx_.out();
-  }
-};
 }  // namespace detail
 
 template <typename OutputIt, typename Char>
 using arg_formatter FMT_DEPRECATED_ALIAS =
-  detail::arg_formatter<OutputIt, Char>;
+    detail::arg_formatter<OutputIt, Char>;
 
 /**
  An error returned by an operating system or a language runtime,
@@ -3214,8 +3525,10 @@ struct formatter<Char[N], Char> : formatter<basic_string_view<Char>, Char> {
 //   using variant = std::variant<int, std::string>;
 //   template <>
 //   struct formatter<variant>: dynamic_formatter<> {
-//     void format(buffer &buf, const variant &v, context &ctx) {
-//       visit([&](const auto &val) { format(buf, val, ctx); }, v);
+//     auto format(const variant& v, format_context& ctx) {
+//       return visit([&](const auto& val) {
+//           return dynamic_formatter<>::format(val, ctx);
+//       }, v);
 //     }
 //   };
 template <typename Char = char> class dynamic_formatter {
@@ -3283,28 +3596,15 @@ FMT_CONSTEXPR void advance_to(
   ctx.advance_to(ctx.begin() + (p - &*ctx.begin()));
 }
 
-/** Formats arguments and writes the output to the range. */
-template <typename ArgFormatter, typename Char, typename Context>
-typename Context::iterator vformat_to(
-    typename ArgFormatter::iterator out, basic_string_view<Char> format_str,
-    basic_format_args<Context> args,
-    detail::locale_ref loc = detail::locale_ref()) {
-  if (format_str.size() == 2 && detail::equal2(format_str.data(), "{}")) {
-    auto arg = args.get(0);
-    if (!arg) detail::error_handler().on_error("argument not found");
-    using iterator = typename ArgFormatter::iterator;
-    return visit_format_arg(
-        detail::default_arg_formatter<iterator, Char>{out, args, loc}, arg);
-  }
-  detail::format_handler<ArgFormatter, Char, Context> h(out, format_str, args,
-                                                        loc);
-  detail::parse_format_string<false>(format_str, h);
-  return h.context.out();
-}
+/**
+  \rst
+  Converts ``p`` to ``const void*`` for pointer formatting.
 
-// Casts ``p`` to ``const void*`` for pointer formatting.
-// Example:
-//   auto s = format("{}", ptr(p));
+  **Example**::
+
+    auto s = fmt::format("{}", fmt::ptr(p));
+  \endrst
+ */
 template <typename T> inline const void* ptr(const T* p) { return p; }
 template <typename T> inline const void* ptr(const std::unique_ptr<T>& p) {
   return p.get();
@@ -3323,6 +3623,10 @@ class bytes {
 };
 
 template <> struct formatter<bytes> {
+ private:
+  detail::dynamic_format_specs<char> specs_;
+
+ public:
   template <typename ParseContext>
   FMT_CONSTEXPR auto parse(ParseContext& ctx) -> decltype(ctx.begin()) {
     using handler_type = detail::dynamic_specs_handler<ParseContext>;
@@ -3341,9 +3645,6 @@ template <> struct formatter<bytes> {
         specs_.precision, specs_.precision_ref, ctx);
     return detail::write_bytes(ctx.out(), b.data_, specs_);
   }
-
- private:
-  detail::dynamic_format_specs<char> specs_;
 };
 
 template <typename It, typename Sentinel, typename Char>
@@ -3408,15 +3709,14 @@ arg_join<It, Sentinel, wchar_t> join(It begin, Sentinel end, wstring_view sep) {
   \endrst
  */
 template <typename Range>
-arg_join<detail::iterator_t<const Range>, detail::sentinel_t<const Range>, char>
-join(const Range& range, string_view sep) {
+arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, char> join(
+    Range&& range, string_view sep) {
   return join(std::begin(range), std::end(range), sep);
 }
 
 template <typename Range>
-arg_join<detail::iterator_t<const Range>, detail::sentinel_t<const Range>,
-         wchar_t>
-join(const Range& range, wstring_view sep) {
+arg_join<detail::iterator_t<Range>, detail::sentinel_t<Range>, wchar_t> join(
+    Range&& range, wstring_view sep) {
   return join(std::begin(range), std::end(range), sep);
 }
 
@@ -3443,7 +3743,7 @@ inline std::string to_string(T value) {
   // The buffer should be large enough to store the number including the sign or
   // "false" for bool.
   constexpr int max_size = detail::digits10<T>() + 2;
-  char buffer[max_size > 5 ? max_size : 5];
+  char buffer[max_size > 5 ? static_cast<unsigned>(max_size) : 5];
   char* begin = buffer;
   return std::string(begin, detail::write<char>(begin, value));
 }
@@ -3463,17 +3763,28 @@ std::basic_string<Char> to_string(const basic_memory_buffer<Char, SIZE>& buf) {
 }
 
 template <typename Char>
-typename buffer_context<Char>::iterator detail::vformat_to(
+void detail::vformat_to(
     detail::buffer<Char>& buf, basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  using af = arg_formatter<typename buffer_context<Char>::iterator, Char>;
-  return vformat_to<af>(std::back_inserter(buf), to_string_view(format_str),
-                        args);
+    basic_format_args<buffer_context<type_identity_t<Char>>> args,
+    detail::locale_ref loc) {
+  using iterator = typename buffer_context<Char>::iterator;
+  auto out = buffer_appender<Char>(buf);
+  if (format_str.size() == 2 && equal2(format_str.data(), "{}")) {
+    auto arg = args.get(0);
+    if (!arg) error_handler().on_error("argument not found");
+    visit_format_arg(default_arg_formatter<iterator, Char>{out, args, loc},
+                     arg);
+    return;
+  }
+  format_handler<iterator, Char, buffer_context<Char>> h(
+      out, format_str, args, loc);
+  parse_format_string<false>(format_str, h);
 }
 
 #ifndef FMT_HEADER_ONLY
-extern template format_context::iterator detail::vformat_to(
-    detail::buffer<char>&, string_view, basic_format_args<format_context>);
+extern template void detail::vformat_to(detail::buffer<char>&, string_view,
+                                        basic_format_args<format_context>,
+                                        detail::locale_ref);
 namespace detail {
 extern template FMT_API std::string grouping_impl<char>(locale_ref loc);
 extern template FMT_API std::string grouping_impl<wchar_t>(locale_ref loc);
@@ -3500,7 +3811,7 @@ extern template int snprintf_float<long double>(long double value,
 
 template <typename S, typename Char = char_t<S>,
           FMT_ENABLE_IF(detail::is_string<S>::value)>
-inline typename FMT_BUFFER_CONTEXT(Char)::iterator vformat_to(
+inline void vformat_to(
     detail::buffer<Char>& buf, const S& format_str,
     basic_format_args<FMT_BUFFER_CONTEXT(type_identity_t<Char>)> args) {
   return detail::vformat_to(buf, to_string_view(format_str), args);
@@ -3510,10 +3821,9 @@ template <typename S, typename... Args, size_t SIZE = inline_buffer_size,
           typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 inline typename buffer_context<Char>::iterator format_to(
     basic_memory_buffer<Char, SIZE>& buf, const S& format_str, Args&&... args) {
-  detail::check_format_string<Args...>(format_str);
-  using context = buffer_context<Char>;
-  return detail::vformat_to(buf, to_string_view(format_str),
-                            make_format_args<context>(args...));
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  detail::vformat_to(buf, to_string_view(format_str), vargs);
+  return detail::buffer_appender<Char>(buf);
 }
 
 template <typename OutputIt, typename Char = char>
@@ -3522,88 +3832,17 @@ using format_context_t = basic_format_context<OutputIt, Char>;
 template <typename OutputIt, typename Char = char>
 using format_args_t = basic_format_args<format_context_t<OutputIt, Char>>;
 
-template <
-    typename S, typename OutputIt, typename... Args,
-    FMT_ENABLE_IF(detail::is_output_iterator<OutputIt>::value &&
-                  !detail::is_contiguous_back_insert_iterator<OutputIt>::value)>
-inline OutputIt vformat_to(
-    OutputIt out, const S& format_str,
-    format_args_t<type_identity_t<OutputIt>, char_t<S>> args) {
-  using af = detail::arg_formatter<OutputIt, char_t<S>>;
-  return vformat_to<af>(out, to_string_view(format_str), args);
-}
-
-/**
- \rst
- Formats arguments, writes the result to the output iterator ``out`` and returns
- the iterator past the end of the output range.
-
- **Example**::
-
-   std::vector<char> out;
-   fmt::format_to(std::back_inserter(out), "{}", 42);
- \endrst
- */
-template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(
-              detail::is_output_iterator<OutputIt>::value &&
-              !detail::is_contiguous_back_insert_iterator<OutputIt>::value &&
-              detail::is_string<S>::value)>
-inline OutputIt format_to(OutputIt out, const S& format_str, Args&&... args) {
-  detail::check_format_string<Args...>(format_str);
-  using context = format_context_t<OutputIt, char_t<S>>;
-  return vformat_to(out, to_string_view(format_str),
-                    make_format_args<context>(args...));
-}
-
-template <typename OutputIt> struct format_to_n_result {
-  /** Iterator past the end of the output range. */
-  OutputIt out;
-  /** Total (not truncated) output size. */
-  size_t size;
-};
+template <typename OutputIt, typename Char = typename OutputIt::value_type>
+using format_to_n_context FMT_DEPRECATED_ALIAS = buffer_context<Char>;
 
 template <typename OutputIt, typename Char = typename OutputIt::value_type>
-using format_to_n_context =
-    format_context_t<detail::truncating_iterator<OutputIt>, Char>;
-
-template <typename OutputIt, typename Char = typename OutputIt::value_type>
-using format_to_n_args = basic_format_args<format_to_n_context<OutputIt, Char>>;
+using format_to_n_args FMT_DEPRECATED_ALIAS =
+    basic_format_args<buffer_context<Char>>;
 
 template <typename OutputIt, typename Char, typename... Args>
-inline format_arg_store<format_to_n_context<OutputIt, Char>, Args...>
+FMT_DEPRECATED format_arg_store<buffer_context<Char>, Args...>
 make_format_to_n_args(const Args&... args) {
-  return format_arg_store<format_to_n_context<OutputIt, Char>, Args...>(
-      args...);
-}
-
-template <typename OutputIt, typename Char, typename... Args,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt>::value)>
-inline format_to_n_result<OutputIt> vformat_to_n(
-    OutputIt out, size_t n, basic_string_view<Char> format_str,
-    format_to_n_args<type_identity_t<OutputIt>, type_identity_t<Char>> args) {
-  auto it = vformat_to(detail::truncating_iterator<OutputIt>(out, n),
-                       format_str, args);
-  return {it.base(), it.count()};
-}
-
-/**
- \rst
- Formats arguments, writes up to ``n`` characters of the result to the output
- iterator ``out`` and returns the total output size and the iterator past the
- end of the output range.
- \endrst
- */
-template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_string<S>::value&&
-                            detail::is_output_iterator<OutputIt>::value)>
-inline format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
-                                                const S& format_str,
-                                                const Args&... args) {
-  detail::check_format_string<Args...>(format_str);
-  using context = format_to_n_context<OutputIt, char_t<S>>;
-  return vformat_to_n(out, n, to_string_view(format_str),
-                      make_format_args<context>(args...));
+  return format_arg_store<buffer_context<Char>, Args...>(args...);
 }
 
 template <typename Char, enable_if_t<(!std::is_same<Char, char>::value), int>>
@@ -3615,15 +3854,6 @@ std::basic_string<Char> detail::vformat(
   return to_string(buffer);
 }
 
-/**
-  Returns the number of characters in the output of
-  ``format(format_str, args...)``.
- */
-template <typename... Args>
-inline size_t formatted_size(string_view format_str, const Args&... args) {
-  return format_to(detail::counting_iterator(), format_str, args...).count();
-}
-
 template <typename Char, FMT_ENABLE_IF(std::is_same<Char, wchar_t>::value)>
 void vprint(std::FILE* f, basic_string_view<Char> format_str,
             wformat_args args) {
@@ -3648,8 +3878,7 @@ template <typename Char, Char... CHARS> class udl_formatter {
   template <typename... Args>
   std::basic_string<Char> operator()(Args&&... args) const {
     static FMT_CONSTEXPR_DECL Char s[] = {CHARS..., '\0'};
-    check_format_string<remove_cvref_t<Args>...>(FMT_STRING(s));
-    return format(s, std::forward<Args>(args)...);
+    return format(FMT_STRING(s), std::forward<Args>(args)...);
   }
 };
 #  else
diff --git a/src/fmt/locale.h b/src/fmt/locale.h
index 988d15cdf8..517f65054f 100644
--- a/src/fmt/locale.h
+++ b/src/fmt/locale.h
@@ -15,22 +15,12 @@
 FMT_BEGIN_NAMESPACE
 
 namespace detail {
-template <typename Char>
-typename buffer_context<Char>::iterator vformat_to(
-    const std::locale& loc, buffer<Char>& buf,
-    basic_string_view<Char> format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
-  using af = arg_formatter<typename buffer_context<Char>::iterator, Char>;
-  return vformat_to<af>(std::back_inserter(buf), to_string_view(format_str),
-                        args, detail::locale_ref(loc));
-}
-
 template <typename Char>
 std::basic_string<Char> vformat(
     const std::locale& loc, basic_string_view<Char> format_str,
     basic_format_args<buffer_context<type_identity_t<Char>>> args) {
   basic_memory_buffer<Char> buffer;
-  detail::vformat_to(loc, buffer, format_str, args);
+  detail::vformat_to(buffer, format_str, args, detail::locale_ref(loc));
   return fmt::to_string(buffer);
 }
 }  // namespace detail
@@ -45,32 +35,28 @@ inline std::basic_string<Char> vformat(
 template <typename S, typename... Args, typename Char = char_t<S>>
 inline std::basic_string<Char> format(const std::locale& loc,
                                       const S& format_str, Args&&... args) {
-  return detail::vformat(
-      loc, to_string_view(format_str),
-      detail::make_args_checked<Args...>(format_str, args...));
+  return detail::vformat(loc, to_string_view(format_str),
+                         fmt::make_args_checked<Args...>(format_str, args...));
 }
 
 template <typename S, typename OutputIt, typename... Args,
-          typename Char = enable_if_t<
-              detail::is_output_iterator<OutputIt>::value, char_t<S>>>
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
 inline OutputIt vformat_to(
     OutputIt out, const std::locale& loc, const S& format_str,
-    format_args_t<type_identity_t<OutputIt>, Char> args) {
-  using af = detail::arg_formatter<OutputIt, Char>;
-  return vformat_to<af>(out, to_string_view(format_str), args,
-                        detail::locale_ref(loc));
+    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+  decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
+  vformat_to(buf, to_string_view(format_str), args, detail::locale_ref(loc));
+  return detail::get_iterator(buf);
 }
 
 template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt>::value&&
-                            detail::is_string<S>::value)>
+          typename Char = char_t<S>,
+          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
 inline OutputIt format_to(OutputIt out, const std::locale& loc,
                           const S& format_str, Args&&... args) {
-  detail::check_format_string<Args...>(format_str);
-  using context = format_context_t<OutputIt, char_t<S>>;
-  format_arg_store<context, Args...> as{args...};
-  return vformat_to(out, loc, to_string_view(format_str),
-                    basic_format_args<context>(as));
+  const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
+  return vformat_to(out, loc, to_string_view(format_str), vargs);
 }
 
 FMT_END_NAMESPACE
diff --git a/src/fmt/os.h b/src/fmt/os.h
index a9517ef800..881510065a 100644
--- a/src/fmt/os.h
+++ b/src/fmt/os.h
@@ -29,7 +29,8 @@
 #if FMT_HAS_INCLUDE("winapifamily.h")
 #  include <winapifamily.h>
 #endif
-#if FMT_HAS_INCLUDE("fcntl.h") && \
+#if (FMT_HAS_INCLUDE(<fcntl.h>) || defined(__APPLE__) || \
+     defined(__linux__)) &&                              \
     (!defined(WINAPI_FAMILY) || (WINAPI_FAMILY == WINAPI_FAMILY_DESKTOP_APP))
 #  include <fcntl.h>  // for O_RDONLY
 #  define FMT_USE_FCNTL 1
@@ -278,7 +279,8 @@ class file {
     RDONLY = FMT_POSIX(O_RDONLY),  // Open for reading only.
     WRONLY = FMT_POSIX(O_WRONLY),  // Open for writing only.
     RDWR = FMT_POSIX(O_RDWR),      // Open for reading and writing.
-    CREATE = FMT_POSIX(O_CREAT)    // Create if the file doesn't exist.
+    CREATE = FMT_POSIX(O_CREAT),   // Create if the file doesn't exist.
+    APPEND = FMT_POSIX(O_APPEND)   // Open in append mode.
   };
 
   // Constructs a file object which doesn't represent any file.
@@ -343,36 +345,69 @@ class file {
 // Returns the memory page size.
 long getpagesize();
 
-class direct_buffered_file;
+namespace detail {
 
-template <typename S, typename... Args>
-void print(direct_buffered_file& f, const S& format_str,
-           const Args&... args);
+struct buffer_size {
+  size_t value = 0;
+  buffer_size operator=(size_t val) const {
+    auto bs = buffer_size();
+    bs.value = val;
+    return bs;
+  }
+};
 
-// A buffered file with a direct buffer access and no synchronization.
-class direct_buffered_file {
+struct ostream_params {
+  int oflag = file::WRONLY | file::CREATE;
+  size_t buffer_size = BUFSIZ > 32768 ? BUFSIZ : 32768;
+
+  ostream_params() {}
+
+  template <typename... T>
+  ostream_params(T... params, int oflag) : ostream_params(params...) {
+    this->oflag = oflag;
+  }
+
+  template <typename... T>
+  ostream_params(T... params, detail::buffer_size bs)
+      : ostream_params(params...) {
+    this->buffer_size = bs.value;
+  }
+};
+}  // namespace detail
+
+static constexpr detail::buffer_size buffer_size;
+
+// A fast output stream which is not thread-safe.
+class ostream final : private detail::buffer<char> {
  private:
   file file_;
 
-  enum { buffer_size = 4096 };
-  char buffer_[buffer_size];
-  int pos_;
-
   void flush() {
-    if (pos_ == 0) return;
-    file_.write(buffer_, pos_);
-    pos_ = 0;
+    if (size() == 0) return;
+    file_.write(data(), size());
+    clear();
   }
 
-  int free_capacity() const { return buffer_size - pos_; }
+  void grow(size_t) final;
+
+  ostream(cstring_view path, const detail::ostream_params& params)
+      : file_(path, params.oflag) {
+    set(new char[params.buffer_size], params.buffer_size);
+  }
 
  public:
-  direct_buffered_file(cstring_view path, int oflag)
-    : file_(path, oflag), pos_(0) {}
-
-  ~direct_buffered_file() {
-    flush();
+  ostream(ostream&& other)
+      : detail::buffer<char>(other.data(), other.size(), other.capacity()),
+        file_(std::move(other.file_)) {
+    other.set(nullptr, 0);
   }
+  ~ostream() {
+    flush();
+    delete[] data();
+  }
+
+  template <typename... T>
+  friend ostream output_file(cstring_view path, T... params);
 
   void close() {
     flush();
@@ -380,25 +415,20 @@ class direct_buffered_file {
   }
 
   template <typename S, typename... Args>
-  friend void print(direct_buffered_file& f, const S& format_str,
-                    const Args&... args) {
-    // We could avoid double buffering.
-    auto buf = fmt::memory_buffer();
-    fmt::format_to(std::back_inserter(buf), format_str, args...);
-    auto remaining_pos = 0;
-    auto remaining_size = buf.size();
-    while (remaining_size > detail::to_unsigned(f.free_capacity())) {
-      auto size = f.free_capacity();
-      memcpy(f.buffer_ + f.pos_, buf.data() + remaining_pos, size);
-      f.pos_ += size;
-      f.flush();
-      remaining_pos += size;
-      remaining_size -= size;
-    }
-    memcpy(f.buffer_ + f.pos_, buf.data() + remaining_pos, remaining_size);
-    f.pos_ += static_cast<int>(remaining_size);
+  void print(const S& format_str, const Args&... args) {
+    format_to(detail::buffer_appender<char>(*this), format_str, args...);
   }
 };
+
+/**
+  Opens a file for writing. Supported parameters passed in `params`:
+  * ``<integer>``: Output flags (``file::WRONLY | file::CREATE`` by default)
+  * ``buffer_size=<integer>``: Output buffer size
+ */
+template <typename... T>
+inline ostream output_file(cstring_view path, T... params) {
+  return {path, detail::ostream_params(params...)};
+}
 #endif  // FMT_USE_FCNTL
 
 #ifdef FMT_LOCALE
diff --git a/src/fmt/ostream.h b/src/fmt/ostream.h
index c16107f79b..29c58ec13b 100644
--- a/src/fmt/ostream.h
+++ b/src/fmt/ostream.h
@@ -49,17 +49,27 @@ template <class Char> class formatbuf : public std::basic_streambuf<Char> {
   }
 };
 
+struct converter {
+  template <typename T, FMT_ENABLE_IF(is_integral<T>::value)> converter(T);
+};
+
 template <typename Char> struct test_stream : std::basic_ostream<Char> {
  private:
-  // Hide all operator<< from std::basic_ostream<Char>.
-  void_t<> operator<<(null<>);
-  void_t<> operator<<(const Char*);
-
-  template <typename T, FMT_ENABLE_IF(std::is_convertible<T, int>::value &&
-                                      !std::is_enum<T>::value)>
-  void_t<> operator<<(T);
+  void_t<> operator<<(converter);
 };
 
+// Hide insertion operators for built-in types.
+template <typename Char, typename Traits>
+void_t<> operator<<(std::basic_ostream<Char, Traits>&, Char);
+template <typename Char, typename Traits>
+void_t<> operator<<(std::basic_ostream<Char, Traits>&, char);
+template <typename Traits>
+void_t<> operator<<(std::basic_ostream<char, Traits>&, char);
+template <typename Traits>
+void_t<> operator<<(std::basic_ostream<char, Traits>&, signed char);
+template <typename Traits>
+void_t<> operator<<(std::basic_ostream<char, Traits>&, unsigned char);
+
 // Checks if T has a user-defined operator<< (e.g. not a member of
 // std::ostream).
 template <typename T, typename Char> class is_streamable {
@@ -103,7 +113,7 @@ void format_value(buffer<Char>& buf, const T& value,
 #endif
   output << value;
   output.exceptions(std::ios_base::failbit | std::ios_base::badbit);
-  buf.resize(buf.size());
+  buf.try_resize(buf.size());
 }
 
 // Formats an object of type T that has an overloaded ostream operator<<.
@@ -160,7 +170,7 @@ template <typename S, typename... Args,
           typename Char = enable_if_t<detail::is_string<S>::value, char_t<S>>>
 void print(std::basic_ostream<Char>& os, const S& format_str, Args&&... args) {
   vprint(os, to_string_view(format_str),
-         detail::make_args_checked<Args...>(format_str, args...));
+         fmt::make_args_checked<Args...>(format_str, args...));
 }
 FMT_END_NAMESPACE
 
diff --git a/src/fmt/printf.h b/src/fmt/printf.h
index d4440ed168..8c28ac2327 100644
--- a/src/fmt/printf.h
+++ b/src/fmt/printf.h
@@ -181,7 +181,7 @@ template <typename Char> class printf_width_handler {
 template <typename Char, typename Context>
 void vprintf(buffer<Char>& buf, basic_string_view<Char> format,
              basic_format_args<Context> args) {
-  Context(std::back_inserter(buf), format, args).format();
+  Context(buffer_appender<Char>(buf), format, args).format();
 }
 }  // namespace detail
 
@@ -598,7 +598,7 @@ OutputIt basic_printf_context<OutputIt, Char>::format() {
 
 template <typename Char>
 using basic_printf_context_t =
-    basic_printf_context<std::back_insert_iterator<detail::buffer<Char>>, Char>;
+    basic_printf_context<detail::buffer_appender<Char>, Char>;
 
 using printf_context = basic_printf_context_t<char>;
 using wprintf_context = basic_printf_context_t<wchar_t>;
diff --git a/src/fmt/ranges.h b/src/fmt/ranges.h
index c48f1727d5..b603d637d7 100644
--- a/src/fmt/ranges.h
+++ b/src/fmt/ranges.h
@@ -157,6 +157,9 @@ template <class Tuple, class F> void for_each(Tuple&& tup, F&& f) {
   for_each(indexes, std::forward<Tuple>(tup), std::forward<F>(f));
 }
 
+template <typename Range>
+using value_type = remove_cvref_t<decltype(*std::declval<Range>().begin())>;
+
 template <typename Arg, FMT_ENABLE_IF(!is_like_std_string<
                                       typename std::decay<Arg>::type>::value)>
 FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const Arg&) {
@@ -182,7 +185,6 @@ FMT_CONSTEXPR const char* format_str_quoted(bool add_space, const char) {
 FMT_CONSTEXPR const wchar_t* format_str_quoted(bool add_space, const wchar_t) {
   return add_space ? L" '{}'" : L"'{}'";
 }
-
 }  // namespace detail
 
 template <typename T> struct is_tuple_like {
@@ -246,9 +248,15 @@ template <typename T, typename Char> struct is_range {
       !std::is_constructible<detail::std_string_view<Char>, T>::value;
 };
 
-template <typename RangeT, typename Char>
-struct formatter<RangeT, Char,
-                 enable_if_t<fmt::is_range<RangeT, Char>::value>> {
+template <typename T, typename Char>
+struct formatter<
+    T, Char,
+    enable_if_t<fmt::is_range<T, Char>::value
+// Workaround a bug in MSVC 2017 and earlier.
+#if !FMT_MSC_VER || FMT_MSC_VER >= 1927
+                && has_formatter<detail::value_type<T>, format_context>::value
+#endif
+                >> {
   formatting_range<Char> formatting;
 
   template <typename ParseContext>
@@ -257,8 +265,7 @@ struct formatter<RangeT, Char,
   }
 
   template <typename FormatContext>
-  typename FormatContext::iterator format(const RangeT& values,
-                                          FormatContext& ctx) {
+  typename FormatContext::iterator format(const T& values, FormatContext& ctx) {
     auto out = detail::copy(formatting.prefix, ctx.out());
     size_t i = 0;
     auto it = values.begin();
diff --git a/src/fmtlib_format.cpp b/src/fmtlib_format.cpp
index a64a1f3893..7271341664 100644
--- a/src/fmtlib_format.cpp
+++ b/src/fmtlib_format.cpp
@@ -44,9 +44,9 @@ template FMT_API char detail::decimal_point_impl(locale_ref);
 
 template FMT_API void detail::buffer<char>::append(const char*, const char*);
 
-template FMT_API FMT_BUFFER_CONTEXT(char)::iterator detail::vformat_to(
+template FMT_API void detail::vformat_to(
     detail::buffer<char>&, string_view,
-    basic_format_args<FMT_BUFFER_CONTEXT(char)>);
+    basic_format_args<FMT_BUFFER_CONTEXT(char)>, detail::locale_ref);
 
 template FMT_API int detail::snprintf_float(double, int, detail::float_specs,
                                             detail::buffer<char>&);
diff --git a/src/fmtlib_os.cpp b/src/fmtlib_os.cpp
index 386119db17..a07e782441 100644
--- a/src/fmtlib_os.cpp
+++ b/src/fmtlib_os.cpp
@@ -62,7 +62,7 @@ using RWResult = int;
 inline unsigned convert_rwcount(std::size_t count) {
   return count <= UINT_MAX ? static_cast<unsigned>(count) : UINT_MAX;
 }
-#else
+#elif FMT_USE_FCNTL
 // Return type of read and write functions.
 using RWResult = ssize_t;
 
@@ -124,7 +124,8 @@ void detail::format_windows_error(detail::buffer<char>& out, int error_code,
       if (result != 0) {
         utf16_to_utf8 utf8_message;
         if (utf8_message.convert(system_message) == ERROR_SUCCESS) {
-          format_to(std::back_inserter(out), "{}: {}", message, utf8_message);
+          format_to(buffer_appender<char>(out), "{}: {}", message,
+                    utf8_message);
           return;
         }
         break;
@@ -288,12 +289,12 @@ void file::pipe(file& read_end, file& write_end) {
 }
 
 buffered_file file::fdopen(const char* mode) {
-  // Don't retry as fdopen doesn't return EINTR.
-  #if defined(__MINGW32__) && defined(_POSIX_)
+// Don't retry as fdopen doesn't return EINTR.
+#  if defined(__MINGW32__) && defined(_POSIX_)
   FILE* f = ::fdopen(fd_, mode);
-  #else
+#  else
   FILE* f = FMT_POSIX_CALL(fdopen(fd_, mode));
-  #endif
+#  endif
   if (!f)
     FMT_THROW(
         system_error(errno, "cannot associate stream with file descriptor"));
@@ -313,5 +314,9 @@ long getpagesize() {
   return size;
 #  endif
 }
+
+void ostream::grow(size_t) {
+  if (this->size() == this->capacity()) flush();
+}
 #endif  // FMT_USE_FCNTL
 FMT_END_NAMESPACE
diff --git a/unittest/force-styles/test_error_stats.cpp b/unittest/force-styles/test_error_stats.cpp
index 0873757acd..52ba3baae3 100644
--- a/unittest/force-styles/test_error_stats.cpp
+++ b/unittest/force-styles/test_error_stats.cpp
@@ -26,7 +26,7 @@ TEST(ErrorStats, test)
 
     std::stringstream out;
     out << stats;
-    ASSERT_EQ(out.str(), "Average:  5.800e-01 StdDev:  7.305e-01 MaxErr:  2.000e+00 @ item: 3.0");
+    ASSERT_EQ(out.str(), "Average:  5.800e-01 StdDev:  7.305e-01 MaxErr:  2.000e+00 @ item: 3");
 
     stats.reset();
     ASSERT_EQ(stats.has_data(), false);

From 756e97954542356417dac9e4a7cb71fefa4fc24e Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Thu, 29 Oct 2020 21:38:01 -0400
Subject: [PATCH 20/64] update DOIs for new stable release. fix URLs.

---
 doc/src/Intro_citing.rst | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/doc/src/Intro_citing.rst b/doc/src/Intro_citing.rst
index 4346e00bc0..a74d3134f3 100644
--- a/doc/src/Intro_citing.rst
+++ b/doc/src/Intro_citing.rst
@@ -24,13 +24,15 @@ DOI for the LAMMPS code
 LAMMPS developers use the `Zenodo service at CERN
 <https://zenodo.org/>`_ to create digital object identifies (DOI) for
 stable releases of the LAMMPS code. There are two types of DOIs for the
-LAMMPS source code: 1) the canonical DOI for **all** versions of LAMMPS,
-which will always point to the latest stable release version is:
+LAMMPS source code: the canonical DOI for **all** versions of LAMMPS,
+which will always point to the **latest** stable release version is:
 
-  `DOI: 10.5281/zenodo.3726416 <https://dx.doi/org/10.5281/zenodo.3726416>`_
+- DOI: `10.5281/zenodo.3726416 <https://dx.doi.org/10.5281/zenodo.3726416>`_
 
-In addition there are DOIs for individual stable releases starting with
-the `3 March 2020 version, DOI:10.5281/zenodo.3726417 <https://dx.doi/org/10.5281/zenodo.3726416>`_
+In addition there are DOIs for individual stable releases. Currently there are:
+
+- 3 March 2020 version: `DOI:10.5281/zenodo.3726417 <https://dx.doi.org/10.5281/zenodo.3726417>`_
+- 29 October 2020 version: `DOI:10.5281/zenodo.4157471 <https://dx.doi.org/10.5281/zenodo.4157471>`_
 
 
 Home page

From 980fce06de53ffb864e655a89f6d08887c6faad6 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 30 Oct 2020 13:21:01 -0400
Subject: [PATCH 21/64] pretty

---
 src/atom_vec_ellipsoid.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/atom_vec_ellipsoid.cpp b/src/atom_vec_ellipsoid.cpp
index 3bdeff67af..b4cb4616a7 100644
--- a/src/atom_vec_ellipsoid.cpp
+++ b/src/atom_vec_ellipsoid.cpp
@@ -533,8 +533,7 @@ void AtomVecEllipsoid::write_data_bonus(FILE *fp, int n, double *buf, int /*flag
    this may create or delete entry in bonus data
 ------------------------------------------------------------------------- */
 
-void AtomVecEllipsoid::
-set_shape(int i, double shapex, double shapey, double shapez)
+void AtomVecEllipsoid::set_shape(int i, double shapex, double shapey, double shapez)
 {
   if (ellipsoid[i] < 0) {
     if (shapex == 0.0 && shapey == 0.0 && shapez == 0.0) return;

From 66ed16760f93a928196d49e7d60697b93cb9e412 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sat, 31 Oct 2020 11:48:29 -0400
Subject: [PATCH 22/64] do not allow access to rigid body properties before
 they are fully set up

---
 src/RIGID/fix_rigid.cpp       | 4 ++++
 src/RIGID/fix_rigid_small.cpp | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/src/RIGID/fix_rigid.cpp b/src/RIGID/fix_rigid.cpp
index 68c44b3e26..9ce28438d3 100644
--- a/src/RIGID/fix_rigid.cpp
+++ b/src/RIGID/fix_rigid.cpp
@@ -2710,11 +2710,15 @@ double FixRigid::compute_scalar()
 
 void *FixRigid::extract(const char *str, int &dim)
 {
+  dim = 0;
+
   if (strcmp(str,"body") == 0) {
+    if (!setupflag) return nullptr;
     dim = 1;
     return body;
   }
   if (strcmp(str,"masstotal") == 0) {
+    if (!setupflag) return nullptr;
     dim = 1;
     return masstotal;
   }
diff --git a/src/RIGID/fix_rigid_small.cpp b/src/RIGID/fix_rigid_small.cpp
index e867e5bb68..1b022f35c4 100644
--- a/src/RIGID/fix_rigid_small.cpp
+++ b/src/RIGID/fix_rigid_small.cpp
@@ -3424,7 +3424,10 @@ int FixRigidSmall::modify_param(int narg, char **arg)
 
 void *FixRigidSmall::extract(const char *str, int &dim)
 {
+  dim = 0;
+
   if (strcmp(str,"body") == 0) {
+    if (!setupflag) return nullptr;
     dim = 1;
     return atom2body;
   }
@@ -3438,6 +3441,7 @@ void *FixRigidSmall::extract(const char *str, int &dim)
   // used by granular pair styles, indexed by atom2body
 
   if (strcmp(str,"masstotal") == 0) {
+    if (!setupflag) return nullptr;
     dim = 1;
 
     if (nmax_mass < nmax_body) {

From 4dac7625c565658413b4c0e2fb341e40bd1ef411 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 1 Nov 2020 14:53:33 -0500
Subject: [PATCH 23/64] error out instead of segfaulting when calling
 Pair::single() on pair style granular without active history

---
 src/GRANULAR/pair_granular.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/GRANULAR/pair_granular.cpp b/src/GRANULAR/pair_granular.cpp
index 270234821c..41dbee3394 100644
--- a/src/GRANULAR/pair_granular.cpp
+++ b/src/GRANULAR/pair_granular.cpp
@@ -1536,6 +1536,8 @@ double PairGranular::single(int i, int j, int itype, int jtype,
   jlist = list->firstneigh[i];
 
   if (use_history) {
+    if (fix_history == nullptr)
+      error->one(FLERR,"Pair::single() computation needs history");
     allhistory = fix_history->firstvalue[i];
     for (int jj = 0; jj < jnum; jj++) {
       neighprev++;

From 2dc80e9521e2d9b6001b5f38f3f0637f4d2974ff Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 1 Nov 2020 15:19:52 -0500
Subject: [PATCH 24/64] avoid crash in the case of checking an empty potential
 file

---
 src/utils.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utils.cpp b/src/utils.cpp
index 5ae76ed00d..8bd36a8065 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -862,6 +862,7 @@ std::string utils::get_potential_date(const std::string &path, const std::string
   reader.ignore_comments = false;
 
   char *line = reader.next_line();
+  if (line == nullptr) return "";
   Tokenizer words(line);
   while (words.has_next()) {
     if (words.next() == "DATE:") {
@@ -881,6 +882,7 @@ std::string utils::get_potential_units(const std::string &path, const std::strin
   reader.ignore_comments = false;
 
   char *line = reader.next_line();
+  if (line == nullptr) return "";
   Tokenizer words(line);
   while (words.has_next()) {
     if (words.next() == "UNITS:") {

From 6e64ce7228d29d51aaaf18cd1a32cb0a3aa8044e Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 1 Nov 2020 16:05:41 -0500
Subject: [PATCH 25/64] add Modify::replace_fix() convenience function

---
 src/modify.cpp | 17 +++++++++++++++++
 src/modify.h   |  1 +
 2 files changed, 18 insertions(+)

diff --git a/src/modify.cpp b/src/modify.cpp
index a88d6d54cc..2d0c23d125 100644
--- a/src/modify.cpp
+++ b/src/modify.cpp
@@ -998,6 +998,23 @@ void Modify::replace_fix(const char *replaceID,
   add_fix(narg,arg,trysuffix);
 }
 
+/* ----------------------------------------------------------------------
+   convenience function to allow replacing a fix from a single string
+   fixcmd is split into words with utils::split_words() to build argv
+------------------------------------------------------------------------- */
+
+void Modify::replace_fix(const std::string &oldfix,
+                         const std::string &fixcmd, int trysuffix)
+{
+  auto args = utils::split_words(fixcmd);
+  // pointers refer to storage owned by 'args'; keeping them in a vector
+  // frees the pointer array even if the call below exits via an exception
+  std::vector<char *> newarg;
+  for (auto &arg : args) newarg.push_back(&arg[0]);
+  replace_fix(oldfix.c_str(),static_cast<int>(newarg.size()),
+              newarg.data(),trysuffix);
+}
+
 /* ----------------------------------------------------------------------
    one instance per fix in style_fix.h
 ------------------------------------------------------------------------- */
diff --git a/src/modify.h b/src/modify.h
index ee14baf638..a347e8486d 100644
--- a/src/modify.h
+++ b/src/modify.h
@@ -101,6 +101,7 @@ class Modify : protected Pointers {
   void add_fix(int, char **, int trysuffix=1);
   void add_fix(const std::string &, int trysuffix=1);
   void replace_fix(const char *, int, char **, int trysuffix=1);
+  void replace_fix(const std::string &, const std::string &, int trysuffix=1);
   void modify_fix(int, char **);
   void delete_fix(const std::string &);
   void delete_fix(int);

From 4e147632bea1f2d28b6c36c252c2b92d754dcc47 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 1 Nov 2020 16:08:57 -0500
Subject: [PATCH 26/64] simplify call to replace_fix()

---
 src/GRANULAR/pair_granular.cpp | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/src/GRANULAR/pair_granular.cpp b/src/GRANULAR/pair_granular.cpp
index 41dbee3394..a11ba0bf3d 100644
--- a/src/GRANULAR/pair_granular.cpp
+++ b/src/GRANULAR/pair_granular.cpp
@@ -1102,15 +1102,8 @@ void PairGranular::init_style()
   // this is so its order in the fix list is preserved
 
   if (use_history && fix_history == nullptr) {
-    char dnumstr[16];
-    sprintf(dnumstr,"%d",size_history);
-    char **fixarg = new char*[4];
-    fixarg[0] = (char *) "NEIGH_HISTORY_GRANULAR";
-    fixarg[1] = (char *) "all";
-    fixarg[2] = (char *) "NEIGH_HISTORY";
-    fixarg[3] = dnumstr;
-    modify->replace_fix("NEIGH_HISTORY_GRANULAR_DUMMY",4,fixarg,1);
-    delete [] fixarg;
+    modify->replace_fix("NEIGH_HISTORY_GRANULAR_DUMMY","NEIGH_HISTORY_GRANULAR"
+                        " all NEIGH_HISTORY " + std::to_string(size_history),1);
     int ifix = modify->find_fix("NEIGH_HISTORY_GRANULAR");
     fix_history = (FixNeighHistory *) modify->fix[ifix];
     fix_history->pair = this;

From 0b51bba75ca1e58730bc2d7aa1f46fed9b090c47 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 1 Nov 2020 16:09:30 -0500
Subject: [PATCH 27/64] avoid division by zero

---
 src/GRANULAR/pair_granular.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/GRANULAR/pair_granular.cpp b/src/GRANULAR/pair_granular.cpp
index a11ba0bf3d..da1e7a12ad 100644
--- a/src/GRANULAR/pair_granular.cpp
+++ b/src/GRANULAR/pair_granular.cpp
@@ -1402,7 +1402,7 @@ double PairGranular::single(int i, int j, int itype, int jtype,
   radi = radius[i];
   radj = radius[j];
   radsum = radi + radj;
-  Reff = radi*radj/radsum;
+  Reff = (radsum > 0.0) ? radi*radj/radsum : 0.0;
 
   bool touchflag;
   E = normal_coeffs[itype][jtype][0];

From b1de97a3cdbdd81659b9671234e8ca1e9d6e86d1 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Sun, 1 Nov 2020 16:10:16 -0500
Subject: [PATCH 28/64] abort with an error if Pair::single() cannot work for
 granular pair style

---
 src/GRANULAR/pair_granular.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/GRANULAR/pair_granular.cpp b/src/GRANULAR/pair_granular.cpp
index da1e7a12ad..1d2c7c3627 100644
--- a/src/GRANULAR/pair_granular.cpp
+++ b/src/GRANULAR/pair_granular.cpp
@@ -1398,6 +1398,10 @@ double PairGranular::single(int i, int j, int itype, int jtype,
   int *jlist;
   double *history,*allhistory;
 
+  int nall = atom->nlocal + atom->nghost;
+  if ((i >= nall) || (j >= nall))
+    error->all(FLERR,"Not enough atoms for pair granular single function");
+
   double *radius = atom->radius;
   radi = radius[i];
   radj = radius[j];
@@ -1529,8 +1533,8 @@ double PairGranular::single(int i, int j, int itype, int jtype,
   jlist = list->firstneigh[i];
 
   if (use_history) {
-    if (fix_history == nullptr)
-      error->one(FLERR,"Pair::single() computation needs history");
+    if ((fix_history == nullptr) || (fix_history->firstvalue == nullptr))
+      error->one(FLERR,"Pair granular single computation needs history");
     allhistory = fix_history->firstvalue[i];
     for (int jj = 0; jj < jnum; jj++) {
       neighprev++;

From 702041858983b844289c34f53121ac324694f3f6 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Mon, 2 Nov 2020 10:31:46 -0500
Subject: [PATCH 29/64] update fmtlib version 7.1.0 to 7.1.1

---
 src/fmt/color.h       |  9 +++++----
 src/fmt/compile.h     | 17 +++++++++--------
 src/fmt/core.h        | 28 ++++++++++++++++++----------
 src/fmt/format-inl.h  | 16 ++++++++++++----
 src/fmt/format.h      | 28 ++++++++++++++++------------
 src/fmt/locale.h      |  8 ++++----
 src/fmtlib_format.cpp | 30 ++++++++++++++++++++++++++++++
 7 files changed, 94 insertions(+), 42 deletions(-)

diff --git a/src/fmt/color.h b/src/fmt/color.h
index 7891058950..94e3419d1d 100644
--- a/src/fmt/color.h
+++ b/src/fmt/color.h
@@ -589,10 +589,11 @@ OutputIt vformat_to(
   \endrst
 */
 template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char_t<S>>::value&&
-                            detail::is_string<S>::value)>
-inline OutputIt format_to(OutputIt out, const text_style& ts,
-                          const S& format_str, Args&&... args) {
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value&&
+              detail::is_string<S>::value>
+inline auto format_to(OutputIt out, const text_style& ts, const S& format_str,
+                      Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
   return vformat_to(out, ts, to_string_view(format_str),
                     fmt::make_args_checked<Args...>(format_str, args...));
 }
diff --git a/src/fmt/compile.h b/src/fmt/compile.h
index 7db610d90f..3a33b02014 100644
--- a/src/fmt/compile.h
+++ b/src/fmt/compile.h
@@ -667,14 +667,15 @@ OutputIt format_to(OutputIt out, const S&, const Args&... args) {
   return format_to(out, compiled, args...);
 }
 
-template <typename OutputIt, typename CompiledFormat, typename... Args,
-          FMT_ENABLE_IF(detail::is_output_iterator<
-                        OutputIt, typename CompiledFormat::char_type>::value&&
-                            std::is_base_of<detail::basic_compiled_format,
-                                            CompiledFormat>::value)>
-format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
-                                         const CompiledFormat& cf,
-                                         const Args&... args) {
+template <typename OutputIt, typename CompiledFormat, typename... Args>
+auto format_to_n(OutputIt out, size_t n, const CompiledFormat& cf,
+                 const Args&... args) ->
+    typename std::enable_if<
+        detail::is_output_iterator<OutputIt,
+                                   typename CompiledFormat::char_type>::value &&
+            std::is_base_of<detail::basic_compiled_format,
+                            CompiledFormat>::value,
+        format_to_n_result<OutputIt>>::type {
   auto it =
       format_to(detail::truncating_iterator<OutputIt>(out, n), cf, args...);
   return {it.base(), it.count()};
diff --git a/src/fmt/core.h b/src/fmt/core.h
index 317292288d..9bd2003b28 100644
--- a/src/fmt/core.h
+++ b/src/fmt/core.h
@@ -18,7 +18,7 @@
 #include <vector>
 
 // The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 70100
+#define FMT_VERSION 70101
 
 #ifdef __clang__
 #  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
@@ -1937,7 +1937,14 @@ template <typename Context> class basic_format_args {
   }
 };
 
-/** An alias to ``basic_format_args<context>``. */
+#ifdef FMT_ARM_ABI_COMPATIBILITY
+/** An alias to ``basic_format_args<format_context>``. */
+// Separate types would result in shorter symbols but break ABI compatibility
+// between clang and gcc on ARM (#1919).
+using format_args = basic_format_args<format_context>;
+using wformat_args = basic_format_args<wformat_context>;
+#else
+// DEPRECATED! These are kept for ABI compatibility.
 // It is a separate type rather than an alias to make symbols readable.
 struct format_args : basic_format_args<format_context> {
   template <typename... Args>
@@ -1946,6 +1953,7 @@ struct format_args : basic_format_args<format_context> {
 struct wformat_args : basic_format_args<wformat_context> {
   using basic_format_args::basic_format_args;
 };
+#endif
 
 namespace detail {
 
@@ -1976,10 +1984,10 @@ inline void vprint_mojibake(std::FILE*, string_view, format_args) {}
 // GCC 8 and earlier cannot handle std::back_insert_iterator<Container> with
 // vformat_to<ArgFormatter>(...) overload, so SFINAE on iterator type instead.
 template <typename OutputIt, typename S, typename Char = char_t<S>,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
-OutputIt vformat_to(
-    OutputIt out, const S& format_str,
-    basic_format_args<buffer_context<type_identity_t<Char>>> args) {
+          bool enable = detail::is_output_iterator<OutputIt, Char>::value>
+auto vformat_to(OutputIt out, const S& format_str,
+                basic_format_args<buffer_context<type_identity_t<Char>>> args)
+    -> typename std::enable_if<enable, OutputIt>::type {
   decltype(detail::get_buffer<Char>(out)) buf(detail::get_buffer_init(out));
   detail::vformat_to(buf, to_string_view(format_str), args);
   return detail::get_iterator(buf);
@@ -2031,10 +2039,10 @@ inline format_to_n_result<OutputIt> vformat_to_n(
  \endrst
  */
 template <typename OutputIt, typename S, typename... Args,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, char_t<S>>::value)>
-inline format_to_n_result<OutputIt> format_to_n(OutputIt out, size_t n,
-                                                const S& format_str,
-                                                const Args&... args) {
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
+inline auto format_to_n(OutputIt out, size_t n, const S& format_str,
+                        const Args&... args) ->
+    typename std::enable_if<enable, format_to_n_result<OutputIt>>::type {
   const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
   return vformat_to_n(out, n, to_string_view(format_str), vargs);
 }
diff --git a/src/fmt/format-inl.h b/src/fmt/format-inl.h
index b7cb3209c8..5d466eebbc 100644
--- a/src/fmt/format-inl.h
+++ b/src/fmt/format-inl.h
@@ -261,11 +261,19 @@ const uint64_t basic_data<T>::powers_of_10_64[] = {
     10000000000000000000ULL};
 
 template <typename T>
-const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0, 0,
+const uint32_t basic_data<T>::zero_or_powers_of_10_32[] = {0,
                                                            FMT_POWERS_OF_10(1)};
-
 template <typename T>
 const uint64_t basic_data<T>::zero_or_powers_of_10_64[] = {
+    0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
+    10000000000000000000ULL};
+
+template <typename T>
+const uint32_t basic_data<T>::zero_or_powers_of_10_32_new[] = {
+    0, 0, FMT_POWERS_OF_10(1)};
+
+template <typename T>
+const uint64_t basic_data<T>::zero_or_powers_of_10_64_new[] = {
     0, 0, FMT_POWERS_OF_10(1), FMT_POWERS_OF_10(1000000000ULL),
     10000000000000000000ULL};
 
@@ -1756,7 +1764,7 @@ inline bool divisible_by_power_of_2(uint64_t x, int exp) FMT_NOEXCEPT {
 #ifdef FMT_BUILTIN_CTZLL
   return FMT_BUILTIN_CTZLL(x) >= exp;
 #else
-  return exp < num_bits<uint64_t>()) && x == ((x >> exp) << exp);
+  return exp < num_bits<uint64_t>() && x == ((x >> exp) << exp);
 #endif
 }
 
@@ -1901,7 +1909,7 @@ template <> struct cache_accessor<double> {
     uint64_t pow5 = data::powers_of_5_64[offset];
     uint128_wrapper recovered_cache = umul128(base_cache.high(), pow5);
     uint128_wrapper middle_low =
-        umul128(base_cache.low() - (kb < 0 ? 1 : 0), pow5);
+        umul128(base_cache.low() - (kb < 0 ? 1u : 0u), pow5);
 
     recovered_cache += middle_low.high();
 
diff --git a/src/fmt/format.h b/src/fmt/format.h
index fbe5045068..13b8da3028 100644
--- a/src/fmt/format.h
+++ b/src/fmt/format.h
@@ -866,8 +866,8 @@ template <typename T> struct FMT_EXTERN_TEMPLATE_API divtest_table_entry {
 // Static data is placed in this class template for the header-only config.
 template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
   static const uint64_t powers_of_10_64[];
-  static const uint32_t zero_or_powers_of_10_32[];
-  static const uint64_t zero_or_powers_of_10_64[];
+  static const uint32_t zero_or_powers_of_10_32_new[];
+  static const uint64_t zero_or_powers_of_10_64_new[];
   static const uint64_t grisu_pow10_significands[];
   static const int16_t grisu_pow10_exponents[];
   static const divtest_table_entry<uint32_t> divtest_table_for_pow5_32[];
@@ -891,6 +891,10 @@ template <typename T = void> struct FMT_EXTERN_TEMPLATE_API basic_data {
   static const char signs[];
   static const char left_padding_shifts[5];
   static const char right_padding_shifts[5];
+
+  // DEPRECATED! These are for ABI compatibility.
+  static const uint32_t zero_or_powers_of_10_32[];
+  static const uint64_t zero_or_powers_of_10_64[];
 };
 
 // Maps bsr(n) to ceil(log10(pow(2, bsr(n) + 1) - 1)).
@@ -917,7 +921,7 @@ struct data : basic_data<> {};
 inline int count_digits(uint64_t n) {
   // https://github.com/fmtlib/format-benchmark/blob/master/digits10
   auto t = bsr2log10(FMT_BUILTIN_CLZLL(n | 1) ^ 63);
-  return t - (n < data::zero_or_powers_of_10_64[t]);
+  return t - (n < data::zero_or_powers_of_10_64_new[t]);
 }
 #else
 // Fallback version of count_digits used when __builtin_clz is not available.
@@ -984,7 +988,7 @@ template <> int count_digits<4>(detail::fallback_uintptr n);
 // Optional version of count_digits for better performance on 32-bit platforms.
 inline int count_digits(uint32_t n) {
   auto t = bsr2log10(FMT_BUILTIN_CLZ(n | 1) ^ 31);
-  return t - (n < data::zero_or_powers_of_10_32[t]);
+  return t - (n < data::zero_or_powers_of_10_32_new[t]);
 }
 #endif
 
@@ -3056,8 +3060,7 @@ struct format_handler : detail::error_handler {
   basic_format_parse_context<Char> parse_context;
   Context context;
 
-  format_handler(OutputIt out,
-                 basic_string_view<Char> str,
+  format_handler(OutputIt out, basic_string_view<Char> str,
                  basic_format_args<Context> format_args, detail::locale_ref loc)
       : parse_context(str), context(out, format_args, loc) {}
 
@@ -3080,8 +3083,8 @@ struct format_handler : detail::error_handler {
   FMT_INLINE void on_replacement_field(int id, const Char*) {
     auto arg = get_arg(context, id);
     context.advance_to(visit_format_arg(
-        default_arg_formatter<OutputIt, Char>{
-            context.out(), context.args(), context.locale()},
+        default_arg_formatter<OutputIt, Char>{context.out(), context.args(),
+                                              context.locale()},
         arg));
   }
 
@@ -3105,8 +3108,8 @@ struct format_handler : detail::error_handler {
       if (begin == end || *begin != '}')
         on_error("missing '}' in format string");
     }
-    context.advance_to(
-        visit_format_arg(arg_formatter<OutputIt, Char>(context, &parse_context, &specs), arg));
+    context.advance_to(visit_format_arg(
+        arg_formatter<OutputIt, Char>(context, &parse_context, &specs), arg));
     return begin;
   }
 };
@@ -3776,8 +3779,8 @@ void detail::vformat_to(
                      arg);
     return;
   }
-  format_handler<iterator, Char, buffer_context<Char>> h(
-      out, format_str, args, loc);
+  format_handler<iterator, Char, buffer_context<Char>> h(out, format_str, args,
+                                                         loc);
   parse_format_string<false>(format_str, h);
 }
 
@@ -3786,6 +3789,7 @@ extern template void detail::vformat_to(detail::buffer<char>&, string_view,
                                         basic_format_args<format_context>,
                                         detail::locale_ref);
 namespace detail {
+
 extern template FMT_API std::string grouping_impl<char>(locale_ref loc);
 extern template FMT_API std::string grouping_impl<wchar_t>(locale_ref loc);
 extern template FMT_API char thousands_sep_impl<char>(locale_ref loc);
diff --git a/src/fmt/locale.h b/src/fmt/locale.h
index 517f65054f..7301bf92a2 100644
--- a/src/fmt/locale.h
+++ b/src/fmt/locale.h
@@ -51,10 +51,10 @@ inline OutputIt vformat_to(
 }
 
 template <typename OutputIt, typename S, typename... Args,
-          typename Char = char_t<S>,
-          FMT_ENABLE_IF(detail::is_output_iterator<OutputIt, Char>::value)>
-inline OutputIt format_to(OutputIt out, const std::locale& loc,
-                          const S& format_str, Args&&... args) {
+          bool enable = detail::is_output_iterator<OutputIt, char_t<S>>::value>
+inline auto format_to(OutputIt out, const std::locale& loc,
+                      const S& format_str, Args&&... args) ->
+    typename std::enable_if<enable, OutputIt>::type {
   const auto& vargs = fmt::make_args_checked<Args...>(format_str, args...);
   return vformat_to(out, loc, to_string_view(format_str), vargs);
 }
diff --git a/src/fmtlib_format.cpp b/src/fmtlib_format.cpp
index 7271341664..bca87b033b 100644
--- a/src/fmtlib_format.cpp
+++ b/src/fmtlib_format.cpp
@@ -23,6 +23,36 @@ int format_float(char* buf, std::size_t size, const char* format, int precision,
   return precision < 0 ? snprintf_ptr(buf, size, format, value)
                        : snprintf_ptr(buf, size, format, precision, value);
 }
+
+template dragonbox::decimal_fp<float> dragonbox::to_decimal(float x)
+    FMT_NOEXCEPT;
+template dragonbox::decimal_fp<double> dragonbox::to_decimal(double x)
+    FMT_NOEXCEPT;
+
+// DEPRECATED! This function exists for ABI compatibility.
+template <typename Char>
+typename basic_format_context<std::back_insert_iterator<buffer<Char>>,
+                              Char>::iterator
+vformat_to(buffer<Char>& buf, basic_string_view<Char> format_str,
+           basic_format_args<basic_format_context<
+               std::back_insert_iterator<buffer<type_identity_t<Char>>>,
+               type_identity_t<Char>>>
+               args) {
+  using iterator = std::back_insert_iterator<buffer<char>>;
+  using context = basic_format_context<
+      std::back_insert_iterator<buffer<type_identity_t<Char>>>,
+      type_identity_t<Char>>;
+  auto out = iterator(buf);
+  format_handler<iterator, Char, context> h(out, format_str, args, {});
+  parse_format_string<false>(format_str, h);
+  return out;
+}
+template basic_format_context<std::back_insert_iterator<buffer<char>>,
+                              char>::iterator
+vformat_to(buffer<char>&, string_view,
+           basic_format_args<basic_format_context<
+               std::back_insert_iterator<buffer<type_identity_t<char>>>,
+               type_identity_t<char>>>);
 }  // namespace detail
 
 template struct FMT_INSTANTIATION_DEF_API detail::basic_data<void>;

From e8337fd128f61c8af971897e2b28b9a6a0e90ce6 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Mon, 2 Nov 2020 10:52:59 -0500
Subject: [PATCH 30/64] Safeguard against possible string overflow

---
 src/USER-NETCDF/dump_netcdf.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/USER-NETCDF/dump_netcdf.cpp b/src/USER-NETCDF/dump_netcdf.cpp
index a50e01ce95..e1a8dfab69 100644
--- a/src/USER-NETCDF/dump_netcdf.cpp
+++ b/src/USER-NETCDF/dump_netcdf.cpp
@@ -183,7 +183,7 @@ DumpNetCDF::DumpNetCDF(LAMMPS *lmp, int narg, char **arg) :
       for (int j = 0; j < DUMP_NC_MAX_DIMS; j++) {
         perat[inc].field[j] = -1;
       }
-      strcpy(perat[inc].name, mangled);
+      strncpy(perat[inc].name, mangled, NC_FIELD_NAME_MAX);
       n_perat++;
     }
 

From 0e8e93b2a0c9bbc1ef9e62cdf13fcc660970fd45 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Mon, 2 Nov 2020 10:59:33 -0500
Subject: [PATCH 31/64] Add space for readability

---
 src/integrate.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/integrate.cpp b/src/integrate.cpp
index 7fc74c5db2..7d4bf36929 100644
--- a/src/integrate.cpp
+++ b/src/integrate.cpp
@@ -23,7 +23,7 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-Integrate::Integrate(LAMMPS *lmp, int /*narg*/, char **/*arg*/) : Pointers(lmp)
+Integrate::Integrate(LAMMPS *lmp, int /*narg*/, char ** /*arg*/) : Pointers(lmp)
 {
   elist_global = elist_atom = nullptr;
   vlist_global = vlist_atom = cvlist_atom = nullptr;

From c24f7acdd0d031944e90187149faba0e54cfb9eb Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Mon, 2 Nov 2020 11:00:36 -0500
Subject: [PATCH 32/64] Avoid passing invalid pointer during integrator
 creation

---
 src/update.cpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/update.cpp b/src/update.cpp
index eebebb91be..a8a698a7f3 100644
--- a/src/update.cpp
+++ b/src/update.cpp
@@ -323,7 +323,12 @@ void Update::create_integrate(int narg, char **arg, int trysuffix)
   delete integrate;
 
   int sflag;
-  new_integrate(arg[0],narg-1,&arg[1],trysuffix,sflag);
+
+  if(narg-1 > 0) {
+    new_integrate(arg[0],narg-1,&arg[1],trysuffix,sflag);
+  } else {
+    new_integrate(arg[0],0,nullptr,trysuffix,sflag);
+  }
 
   std::string estyle = arg[0];
   if (sflag) {

From e86b4d3a78dbaaf454bd734491c4dcc4c4bfb449 Mon Sep 17 00:00:00 2001
From: iafoss <49990208+iafoss@users.noreply.github.com>
Date: Mon, 2 Nov 2020 11:54:11 -0500
Subject: [PATCH 33/64] bug fix in pair_mesont_tpm.cpp

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index b15a1e5b1f..9185786341 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -510,6 +510,12 @@ void PairMESONTTPM::compute(int eflag, int vflag){
 
   // set per atom values and accumulators
   // reallocate per-atom arrays if necessary
+  if (eatom_s == nullptr)
+   memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
+  if (eatom_b == nullptr)
+   memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
+  if (eatom_t == nullptr)
+   memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
   if (atom->nmax > maxeatom) {
     maxeatom = atom->nmax;
     memory->destroy(eatom);

From e6643979516195965c7261053878f4b88d9aaa2b Mon Sep 17 00:00:00 2001
From: iafoss <49990208+iafoss@users.noreply.github.com>
Date: Mon, 2 Nov 2020 16:12:57 -0500
Subject: [PATCH 34/64] Add files via upload

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 1611 ++++++++++++++-------------
 src/USER-MESONT/pair_mesont_tpm.h   |  197 ++--
 2 files changed, 907 insertions(+), 901 deletions(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index 9185786341..a58f9892ed 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -1,803 +1,808 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://lammps.sandia.gov/, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
-------------------------------------------------------------------------- */
-
-#include "pair_mesont_tpm.h"
-#include "export_mesont.h"
-
-
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "memory.h"
-#include "error.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-
-#include <cstring>
-#include <vector>
-#include <cmath>
-
-#include <fstream>
-#include <sstream>
-#include <algorithm>
-
-using namespace LAMMPS_NS;
-
-//since LAMMPS is compiled with C++ 2003, define a substitution for std::array
-template<typename T, int N>
-class array2003{
-public:
-  T& operator[] (int idx){ return data[idx];};
-  const T& operator[] (int idx) const{ return data[idx];};
-private:
-  T data[N];
-};
-
-
-class MESONTList {
-public:
-  MESONTList(const Atom* atom, const NeighList* nblist, double rc2);
-  ~MESONTList() {};
-  //list of segments
-  const std::vector<array2003<int,2> >& get_segments() const;
-  //list of triplets
-  const std::vector<array2003<int,3> >& get_triplets() const;
-  //list of neighbor chains [start,end] for segments
-  //(use idx() to get real indexes)
-  const std::vector<std::vector<array2003<int,2> > >& get_nbs() const;
-  //convert idx from sorted representation to real idx
-  int get_idx(int idx) const;
-  //return list of indexes for conversion from sorted representation
-  const std::vector<int>& get_idx_list() const;
-  //convert idx from real idx to sorted representation
-  int get_idxb(int idx) const;
-  //return list of indexes for conversion to sorted representation
-  const std::vector<int>& get_idxb_list() const;
-  //check if the node is the end of the tube
-  bool is_end(int idx) const;
-
-  array2003<int, 2> get_segment(int idx) const;
-  array2003<int, 3> get_triplet(int idx) const;
-
-  static const int cnt_end = -1;
-  static const int domain_end = -2;
-  static const int not_cnt = -3;
-private:
-  std::vector<array2003<int, 2> > chain_list, segments;
-  std::vector<array2003<int, 3> > triplets;
-  std::vector<std::vector<array2003<int, 2> > > nb_chains;
-  std::vector<int> index_list, index_list_b;
-};
-
-//=============================================================================
-
-inline const std::vector<std::vector<array2003<int, 2> > > &
- MESONTList::get_nbs() const {
-  return nb_chains;
-}
-
-inline int MESONTList::get_idx(int idx) const {
-  return index_list[idx];
-}
-
-inline const std::vector<int>& MESONTList::get_idx_list() const {
-  return index_list;
-};
-
-
-inline int MESONTList::get_idxb(int idx) const {
-  return index_list_b[idx];
-}
-
-inline const std::vector<int>& MESONTList::get_idxb_list() const {
-  return index_list_b;
-};
-
-inline const std::vector<array2003<int, 2> > & MESONTList::get_segments()
- const {
-  return segments;
-}
-
-inline const std::vector<array2003<int, 3> > & MESONTList::get_triplets()
- const {
-  return triplets;
-}
-
-inline array2003<int, 2> MESONTList::get_segment(int idx) const {
-  array2003<int, 2> result;
-  result[0] = chain_list[idx][0];
-  result[1] = idx;
-  return result;
-}
-
-inline array2003<int, 3> MESONTList::get_triplet(int idx) const {
-  array2003<int, 3> result;
-  result[0] = chain_list[idx][0];
-  result[1] = idx;
-  result[2] = chain_list[idx][1];
-  return result;
-}
-
-inline bool MESONTList::is_end(int idx) const {
-  return chain_list[idx][0] == cnt_end || chain_list[idx][1] == cnt_end;
-};
-
-template<typename T>
-void vector_union(std::vector<T>& v1, std::vector<T>& v2,
- std::vector<T>& merged) {
-  std::sort(v1.begin(), v1.end());
-  std::sort(v2.begin(), v2.end());
-  merged.reserve(v1.size() + v2.size());
-  typename std::vector<T>::iterator it1 = v1.begin();
-  typename std::vector<T>::iterator it2 = v2.begin();
-
-  while (it1 != v1.end() && it2 != v2.end()) {
-    if (*it1 < *it2) {
-      if (merged.empty() || merged.back() < *it1) merged.push_back(*it1);
-        ++it1;
-    }
-    else {
-      if (merged.empty() || merged.back() < *it2) merged.push_back(*it2);
-      ++it2;
-    }
-  }
-  while (it1 != v1.end()) {
-    if (merged.empty() || merged.back() < *it1) merged.push_back(*it1);
-    ++it1;
-  }
-
-  while (it2 != v2.end()) {
-  if (merged.empty() || merged.back() < *it2) merged.push_back(*it2);
-    ++it2;
-  }
-}
-
-MESONTList::MESONTList(const Atom* atom, const NeighList* nblist, double /* rc2 */){
-  if (atom == nullptr || nblist == nullptr) return;
-  //number of local atoms at the node
-  int nlocal = atom->nlocal;
-  //total number of atoms in the node and ghost shell
-  int nall = nblist->inum + nblist->gnum;
-  int ntot = atom->nlocal + atom->nghost;
-  tagint* const g_id = atom->tag;
-  tagint** const bonds = atom->bond_nt;
-  tagint* const chain_id = atom->molecule;
-  int* ilist = nblist->ilist;
-
-  //convert bonds to local id representation
-  array2003<int, 2> tmp_arr;
-  tmp_arr[0] = not_cnt; tmp_arr[1] = not_cnt;
-  chain_list.resize(ntot, tmp_arr);
-  for (int ii = 0; ii < nall; ii++) {
-    int i = ilist[ii];
-    chain_list[i][0] = domain_end;
-    chain_list[i][1] = domain_end;
-  }
-  for (int ii = 0; ii < nall; ii++) {
-    int i = ilist[ii];
-    int nnb = nblist->numneigh[i];
-    for (int m = 0; m < 2; m++)
-      if (bonds[i][m] == cnt_end) chain_list[i][m] = cnt_end;
-    for (int j = 0; j < nnb; j++) {
-      int nb = nblist->firstneigh[i][j];
-      if (bonds[i][0] == g_id[nb]){
-        chain_list[i][0] = nb;
-        chain_list[nb][1] = i;
-        break;
-      }
-    }
-  }
-
-  //reorder chains: index list
-  //list of indexes for conversion FROM reordered representation
-  index_list.reserve(nall);
-  index_list_b.resize(ntot, -1); // convert index TO reordered representation
-  for (int i = 0; i < ntot; i++) {
-    if (chain_list[i][0] == cnt_end || chain_list[i][0] == domain_end) {
-      index_list.push_back(i);
-      index_list_b[i] = index_list.size() - 1;
-      int idx = i;
-      while (1) {
-        idx = chain_list[idx][1];
-        if (idx == cnt_end || idx == domain_end) break;
-        else index_list.push_back(idx);
-        index_list_b[idx] = index_list.size() - 1;
-      }
-    }
-  }
-
-  //segment list
-  for (int i = 0; i < nlocal; i++) {
-    if (chain_list[i][0] == not_cnt) continue;
-    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
-     g_id[i] < g_id[chain_list[i][0]]){
-      array2003<int, 2> tmp_c;
-      tmp_c[0] = i; tmp_c[1] = chain_list[i][0];
-      segments.push_back(tmp_c);
-    }
-    if (chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end &&
-     g_id[i] < g_id[chain_list[i][1]]){
-      array2003<int, 2> tmp_c;
-       tmp_c[0] = i; tmp_c[1] = chain_list[i][1];
-       segments.push_back(tmp_c);
-    }
-  }
-  int nbonds = segments.size();
-
-  //triplets
-  for (int i = 0; i < nlocal; i++){
-    if (chain_list[i][0] == not_cnt) continue;
-    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
-     chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end)
-      triplets.push_back(get_triplet(i));
-  }
-
-  //segment neighbor list
-  nb_chains.resize(nbonds);
-  std::vector<int> nb_list_i[2], nb_list;
-  for (int i = 0; i < nbonds; i++) {
-    //union of nb lists
-    for (int m = 0; m < 2; m++) {
-      nb_list_i[m].resize(0);
-      int idx = segments[i][m];
-      if (idx >= nlocal) continue;
-      int nnb = nblist->numneigh[idx];
-      for (int j = 0; j < nnb; j++) {
-        int jdx = nblist->firstneigh[idx][j];
-        //no self interactions for nbs within the same tube
-        if (chain_id[jdx] == chain_id[idx] &&
-         std::abs(index_list_b[idx] - index_list_b[jdx]) <= 5) continue;
-        nb_list_i[m].push_back(index_list_b[jdx]);
-      }
-    }
-    vector_union(nb_list_i[0], nb_list_i[1], nb_list);
-
-    int nnb = nb_list.size();
-    if (nnb > 0) {
-      int idx_s = nb_list[0];
-      for (int j = 0; j < nnb; j++) {
-        //if nodes are not continuous in the sorted representation
-        //or represent chain ends, create a new neighbor chain
-        int idx_next = chain_list[index_list[nb_list[j]]][1];
-        if ((j == nnb - 1) || (nb_list[j] + 1 != nb_list[j+1]) ||
-         (idx_next == cnt_end) || (idx_next == domain_end)) {
-          array2003<int, 2> chain;
-          chain[0] = idx_s;
-          chain[1] = nb_list[j];
-          //make sure that segments having at least one node
-          //in the neighbor list are included
-          int idx0 = index_list[chain[0]]; // real id of the ends
-          int idx1 = index_list[chain[1]];
-          if (chain_list[idx0][0] != cnt_end &&
-           chain_list[idx0][0] != domain_end) chain[0] -= 1;
-          if (chain_list[idx1][1] != cnt_end &&
-           chain_list[idx1][1] != domain_end) chain[1] += 1;
-          if(chain[0] != chain[1]) nb_chains[i].push_back(chain);
-          idx_s = (j == nnb - 1) ? -1 : nb_list[j + 1];
-        }
-      }
-    }
-    nb_list.resize(0);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// the cutoff distance between walls of tubes
-static const double TPBRcutoff  = 3.0*3.4;
-int PairMESONTTPM::instance_count = 0;
-/* ---------------------------------------------------------------------- */
-
-PairMESONTTPM::PairMESONTTPM(LAMMPS *lmp) : Pair(lmp) {
-  writedata=1;
-  BendingMode = 0;  // Harmonic bending model
-  TPMType = 0;      // Inter-tube segment-segment interaction
-  tab_path = nullptr;
-  tab_path_length = 0;
-
-  eatom_s = nullptr;
-  eatom_b = nullptr;
-  eatom_t = nullptr;
-  instance_count++;
-  if(instance_count > 1) error->all(FLERR,
-   "only a single instance of mesont/tpm pair style can be created");
-}
-
-/* ---------------------------------------------------------------------- */
-
-PairMESONTTPM::~PairMESONTTPM()
-{
-  if (allocated) {
-    memory->destroy(setflag);
-    memory->destroy(cutsq);
-    memory->destroy(cut);
-
-    memory->destroy(eatom_s);
-    memory->destroy(eatom_b);
-    memory->destroy(eatom_t);
-  }
-  instance_count--;
-  if (tab_path != nullptr) memory->destroy(tab_path);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMESONTTPM::compute(int eflag, int vflag){
-  ev_init(eflag,vflag);
-  //total number of atoms in the node and ghost shell
-  int nall = list->inum + list->gnum;
-  int ntot = atom->nlocal + atom->nghost;
-  int newton_pair = force->newton_pair;
-  if(!newton_pair)
-   error->all(FLERR,"Pair style mesont/tpm requires newton pair on");
-
-  double **x = atom->x;
-  double **f = atom->f;
-  double *r = atom->radius;
-  double *l = atom->length;
-  int *buckling = atom->buckling;
-  tagint *g_id = atom->tag;
-
-  //check if cutoff is chosen correctly
-  double RT = mesont_lib_get_R();
-  double Lmax = 0.0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    int i = list->ilist[ii];
-    if (Lmax < l[i]) Lmax = l[i];
-  }
-  double Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
-   std::pow((2.0*RT + TPBRcutoff),2)));
-  if (cut_global < Rcut_min){
-    std::stringstream err;
-    err << "The selected cutoff is too small for the current system : " <<
-     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
-     ", Rcut_min = " << Rcut_min;
-    error->all(FLERR, err.str().c_str());
-  }
-
-  //generate bonds and chain nblist
-  MESONTList ntlist(atom, list, cut_global*cut_global);
-
-  //reorder data to make it contiguous within tubes
-  //and compatible with Fortran functions
-  std::vector<double> x_sort(3*nall), f_sort(3*nall), s_sort(9*nall);
-  std::vector<double> u_ts_sort(nall), u_tb_sort(nall), u_tt_sort(nall);
-  std::vector<int> b_sort(nall);
-  for (int i = 0; i < nall; i++){
-    int idx = ntlist.get_idx(i);
-    for (int j = 0; j < 3; j++) x_sort[3*i+j] = x[idx][j];
-    b_sort[i] = buckling[idx];
-  }
-
-  //bending potential
-  int n_triplets = ntlist.get_triplets().size();
-  for (int i = 0; i < n_triplets; i++) {
-    const array2003<int,3>& t = ntlist.get_triplets()[i];
-    //idx of nodes of a triplet in sorted representation
-    int idx_s0 = ntlist.get_idxb(t[0]);
-    int idx_s1 = ntlist.get_idxb(t[1]);
-    int idx_s2 = ntlist.get_idxb(t[2]);
-
-    double* X1 = &(x_sort[3*idx_s0]);
-    double* X2 = &(x_sort[3*idx_s1]);
-    double* X3 = &(x_sort[3*idx_s2]);
-    double& U1b = u_tb_sort[idx_s0];
-    double& U2b = u_tb_sort[idx_s1];
-    double& U3b = u_tb_sort[idx_s2];
-    double* F1 = &(f_sort[3*idx_s0]);
-    double* F2 = &(f_sort[3*idx_s1]);
-    double* F3 = &(f_sort[3*idx_s2]);
-    double* S1 = &(s_sort[9*idx_s0]);
-    double* S2 = &(s_sort[9*idx_s1]);
-    double* S3 = &(s_sort[9*idx_s2]);
-    double& R123 = r[t[1]];
-    double& L123 = l[t[1]];
-    int& BBF2 = b_sort[idx_s1];
-
-    mesont_lib_TubeBendingForceField(U1b, U2b, U3b, F1, F2, F3, S1, S2, S3,
-     X1, X2, X3, R123, L123, BBF2);
-  }
-
-  //share new values of buckling
-  if (BendingMode == 1){
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      buckling[idx] = b_sort[i];
-    }
-    comm->forward_comm_pair(this);
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      b_sort[i] = buckling[idx];
-    }
-  }
-
-  //segment-segment and segment-tube interactions
-  int n_segments = ntlist.get_segments().size();
-  double Rmax = 0.0;
-  Lmax = 0.0;
-  for (int i = 0; i < n_segments; i++) {
-    const array2003<int,2>& s = ntlist.get_segments()[i];
-    //idx of a segment end 1 in sorted representation
-    int idx_s0 = ntlist.get_idxb(s[0]);
-    //idx of a segment end 2 in sorted representation
-    int idx_s1 = ntlist.get_idxb(s[1]);
-    double* X1 = &(x_sort[3*idx_s0]);
-    double* X2 = &(x_sort[3*idx_s1]);
-    double length = std::sqrt(std::pow(X1[0]-X2[0],2) +
-     std::pow(X1[1]-X2[1],2) + std::pow(X1[2]-X2[2],2));
-    if (length > Lmax) Lmax = length;
-    double& U1t = u_tt_sort[idx_s0];
-    double& U2t = u_tt_sort[idx_s1];
-    double& U1s = u_ts_sort[idx_s0];
-    double& U2s = u_ts_sort[idx_s1];
-    double* F1 = &(f_sort[3*idx_s0]);
-    double* F2 = &(f_sort[3*idx_s1]);
-    double* S1 = &(s_sort[9*idx_s0]);
-    double* S2 = &(s_sort[9*idx_s1]);
-    double R12 = r[s[0]]; if (R12 > Rmax) Rmax = R12;
-    if (std::abs(R12 - RT) > 1e-3)
-        error->all(FLERR,"Inconsistent input and potential table");
-    //assume that the length of the segment is defined by the node with
-    //smallest global id
-    double L12 = (g_id[s[0]] > g_id[s[1]]) ? l[s[1]] : l[s[0]];
-    mesont_lib_TubeStretchingForceField(U1s, U2s, F1, F2, S1, S2, X1, X2,
-     R12, L12);
-
-    for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++){
-      //id of the beginning and end of the chain in the sorted representation
-      const array2003<int,2>& chain = ntlist.get_nbs()[i][nc];
-      int N = chain[1] - chain[0] + 1;  //number of elements in the chain
-      int end1 = ntlist.get_idx(chain[0]);  //chain ends (real representation)
-      int end2 = ntlist.get_idx(chain[1]);
-      double* X = &(x_sort[3*chain[0]]);
-      double* Ut = &(u_tt_sort[chain[0]]);
-      double* F = &(f_sort[3*chain[0]]);
-      double* S = &(s_sort[9*chain[0]]);
-      double R = r[end1];
-      int* BBF = &(b_sort[chain[0]]);
-      int E1 = ntlist.is_end(end1);
-      int E2 = ntlist.is_end(end2);
-
-      int Ee = 0;
-      double* Xe = X; double* Fe = F; double* Se = S;
-      if (!E1 && ntlist.get_triplet(end1)[0] != MESONTList::domain_end &&
-       ntlist.get_triplet(ntlist.get_triplet(end1)[0])[0] ==
-       MESONTList::cnt_end){
-        Ee = 1;
-        int idx = ntlist.get_idxb(ntlist.get_triplet(end1)[0]);
-        Xe = &(x_sort[3*idx]);
-        Fe = &(f_sort[3*idx]);
-        Se = &(s_sort[9*idx]);
-      }
-      else if (!E2 && ntlist.get_triplet(end2)[2] != MESONTList::domain_end &&
-       ntlist.get_triplet(ntlist.get_triplet(end2)[2])[2] ==
-       MESONTList::cnt_end){
-        Ee = 2;
-        int idx = ntlist.get_idxb(ntlist.get_triplet(end2)[2]);
-        Xe = &(x_sort[3*idx]);
-        Fe = &(f_sort[3*idx]);
-        Se = &(s_sort[9*idx]);
-      }
-
-      mesont_lib_SegmentTubeForceField(U1t, U2t, Ut, F1, F2, F, Fe, S1, S2, S,
-       Se, X1, X2, R12, N, X, Xe, BBF, R, E1, E2, Ee, TPMType);
-    }
-  }
-
-  //check if cutoff is chosen correctly
-  Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
-   std::pow((2.0*Rmax + TPBRcutoff),2)));
-  if (cut_global < Rcut_min){
-    std::stringstream err;
-    err << "The selected cutoff is too small for the current system : " <<
-     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
-     ", Rcut_min = " << Rcut_min;
-    error->all(FLERR, err.str().c_str());
-  }
-
-  // set per atom values and accumulators
-  // reallocate per-atom arrays if necessary
-  if (eatom_s == nullptr)
-   memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
-  if (eatom_b == nullptr)
-   memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
-  if (eatom_t == nullptr)
-   memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
-  if (atom->nmax > maxeatom) {
-    maxeatom = atom->nmax;
-    memory->destroy(eatom);
-    memory->create(eatom,comm->nthreads*maxeatom,"pair:eatom");
-    memory->destroy(eatom_s);
-    memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
-    memory->destroy(eatom_b);
-    memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
-    memory->destroy(eatom_t);
-    memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
-  }
-
-  if (atom->nmax > maxvatom) {
-    maxvatom = atom->nmax;
-    memory->destroy(vatom);
-    memory->create(vatom,comm->nthreads*maxvatom,6,"pair:vatom");
-  }
-
-  // zero accumulators
-  eng_vdwl = 0.0; energy_s = 0.0;
-  energy_b = 0.0; energy_t = 0.0;
-  for (int i = 0; i < 6; i++) virial[i] = 0.0;
-  for (int i = 0; i < ntot; i++){
-    eatom[i] = 0.0; eatom_s[i] = 0.0;
-    eatom_b[i] = 0.0; eatom_t[i] = 0.0;
-  }
-  for (int i = 0; i < ntot; i++)
-    for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
-
-  //convert from sorted representation
-  for (int i = 0; i < nall; i++){
-    int idx = ntlist.get_idx(i);
-    for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
-    eatom_s[idx] = u_ts_sort[i];
-    eatom_b[idx] = u_tb_sort[i];
-    eatom_t[idx] = u_tt_sort[i];
-    eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
-    energy_s += u_ts_sort[i];
-    energy_b += u_tb_sort[i];
-    energy_t += u_tt_sort[i];
-    vatom[idx][0] = s_sort[9*i+0]; //xx
-    vatom[idx][1] = s_sort[9*i+4]; //yy
-    vatom[idx][2] = s_sort[9*i+8]; //zz
-    vatom[idx][3] = s_sort[9*i+1]; //xy
-    vatom[idx][4] = s_sort[9*i+2]; //xz
-    vatom[idx][5] = s_sort[9*i+5]; //yz
-    for (int j = 0; j < 6; j++) virial[j] += vatom[idx][j];
-    buckling[idx] = b_sort[i];
-  }
-  eng_vdwl = energy_s + energy_b + energy_t;
-}
-
-/* ----------------------------------------------------------------------
-   allocate all arrays
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::allocate(){
-  allocated = 1;
-  int n = atom->ntypes;
-
-  memory->create(setflag,n+1,n+1,"pair:setflag");
-  for (int i = 1; i <= n; i++)
-    for (int j = i; j <= n; j++)
-      setflag[i][j] = 0;
-
-  memory->create(cutsq,n+1,n+1,"pair:cutsq");
-  memory->create(cut,n+1,n+1,"pair:cut");
-}
-
-/* ----------------------------------------------------------------------
-   global settings
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::settings(int narg, char **arg){
-  if ((narg == 0) || (narg > 4))
-    error->all(FLERR,"Illegal pair_style command");
-  cut_global = utils::numeric(FLERR,arg[0],false,lmp);
-
-  // reset cutoffs that have been explicitly set
-  if (allocated) {
-    int i,j;
-    for (i = 1; i <= atom->ntypes; i++)
-      for (j = i+1; j <= atom->ntypes; j++)
-        cut[i][j] = cut_global;
-  }
-  std::string TPMAFile = (narg > 1) ? arg[1] : "MESONT-TABTP.xrs";
-  tab_path_length = TPMAFile.length();
-  if (tab_path != nullptr) memory->destroy(tab_path);
-  //c_str returns '\0' terminated string
-  memory->create(tab_path,tab_path_length+1,"pair:path");
-  std::memcpy(tab_path, TPMAFile.c_str(), tab_path_length+1);
-  mesont_lib_SetTablePath(tab_path, tab_path_length);
-
-  if (narg > 2) {
-    BendingMode = utils::numeric(FLERR,arg[2],false,lmp);
-    if ((BendingMode < 0) || (BendingMode > 1))
-      error->all(FLERR,"Incorrect BendingMode");
-  }
-  if (narg > 3) {
-    TPMType = utils::numeric(FLERR,arg[3],false,lmp);
-    if ((TPMType < 0) || (TPMType > 1))
-      error->all(FLERR,"Incorrect TPMType");
-  }
-
-  mesont_lib_TPBInit();
-  int M, N;
-  std::ifstream in(TPMAFile);
-  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
-  std::string tmp;
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  in >> M >> N;
-  in.close();
-  mesont_lib_TPMInit(M, N);
-  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
-}
-
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::coeff(int narg, char **arg){
-  if ((narg < 2) || (narg > 3))
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  if (!allocated) allocate();
-
-  int ilo,ihi,jlo,jhi;
-  utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error);
-  utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error);
-
-  double cut_one = cut_global;
-  if (narg == 3) cut_one = utils::numeric(FLERR,arg[2],false,lmp);
-
-  int count = 0;
-  for (int i = ilo; i <= ihi; i++) {
-    for (int j = MAX(jlo,i); j <= jhi; j++) {
-      cut[i][j] = cut_one;
-      setflag[i][j] = 1;
-      count++;
-    }
-  }
-
-  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
-}
-
-/* ----------------------------------------------------------------------
-   init for one type pair i,j and corresponding j,i
-------------------------------------------------------------------------- */
-
-double PairMESONTTPM::init_one(int i, int j){
-  if (setflag[i][j] == 0) {
-    cut[i][j] = mix_distance(cut[i][i],cut[j][j]);
-  }
-
-  return cut[i][j];
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_restart(FILE *fp){
-  write_restart_settings(fp);
-
-  int i,j;
-  for (i = 1; i <= atom->ntypes; i++)
-    for (j = i; j <= atom->ntypes; j++) {
-      fwrite(&setflag[i][j],sizeof(int),1,fp);
-      if (setflag[i][j]) {
-        fwrite(&cut[i][j],sizeof(double),1,fp);
-      }
-    }
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::read_restart(FILE *fp){
-  read_restart_settings(fp);
-  allocate();
-
-  int i,j;
-  int me = comm->me;
-  for (i = 1; i <= atom->ntypes; i++)
-    for (j = i; j <= atom->ntypes; j++) {
-      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
-      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
-      if (setflag[i][j]) {
-        if (me == 0) {
-          fread(&cut[i][j],sizeof(double),1,fp);
-        }
-        MPI_Bcast(&cut[i][j],1,MPI_DOUBLE,0,world);
-      }
-    }
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_restart_settings(FILE *fp){
-  fwrite(&BendingMode,sizeof(int),1,fp);
-  fwrite(&TPMType,sizeof(int),1,fp);
-  fwrite(&cut_global,sizeof(double),1,fp);
-  fwrite(&tab_path_length,sizeof(int),1,fp);
-  fwrite(tab_path,tab_path_length+1,1,fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::read_restart_settings(FILE *fp){
-  int me = comm->me;
-  if (me == 0) {
-    fread(&BendingMode,sizeof(int),1,fp);
-    fread(&TPMType,sizeof(int),1,fp);
-    fread(&cut_global,sizeof(double),1,fp);
-    fread(&tab_path_length,sizeof(int),1,fp);
-  }
-  MPI_Bcast(&BendingMode,1,MPI_INT,0,world);
-  MPI_Bcast(&TPMType,1,MPI_INT,0,world);
-  MPI_Bcast(&cut_global,1,MPI_DOUBLE,0,world);
-  MPI_Bcast(&tab_path_length,1,MPI_INT,0,world);
-
-  if (tab_path != nullptr) memory->destroy(tab_path);
-  memory->create(tab_path,tab_path_length+1,"pair:path");
-  if (me == 0) fread(tab_path,tab_path_length+1,1,fp);
-  MPI_Bcast(tab_path,tab_path_length+1,MPI_CHAR,0,world);
-  mesont_lib_SetTablePath(tab_path,tab_path_length);
-  mesont_lib_TPBInit();
-  int M, N;
-  std::ifstream in(tab_path);
-  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
-  std::string tmp;
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  in >> M >> N;
-  in.close();
-  mesont_lib_TPMInit(M, N);
-  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to data file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_data(FILE *fp){
-  for (int i = 1; i <= atom->ntypes; i++)
-    fprintf(fp,"%d\n",i);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes all pairs to data file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_data_all(FILE *fp){
-  for (int i = 1; i <= atom->ntypes; i++)
-    for (int j = i; j <= atom->ntypes; j++)
-      fprintf(fp,"%d %d %g\n",i,j,cut[i][j]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMESONTTPM::init_style(){
-  //make sure that a full list is created (including ghost nodes)
-  int r = neighbor->request(this,instance_me);
-  neighbor->requests[r]->half = false;
-  neighbor->requests[r]->full = true;
-  neighbor->requests[r]->ghost = true;
-}
-
-void* PairMESONTTPM::extract(const char *str, int &){
-  if (strcmp(str,"mesonttpm_Es_tot") == 0) return &energy_s;
-  else if (strcmp(str,"mesonttpm_Eb_tot") == 0) return &energy_b;
-  else if (strcmp(str,"mesonttpm_Et_tot") == 0) return &energy_t;
-  else if (strcmp(str,"mesonttpm_Es") == 0) return eatom_s;
-  else if (strcmp(str,"mesonttpm_Eb") == 0) return eatom_b;
-  else if (strcmp(str,"mesonttpm_Et") == 0) return eatom_t;
-  else return nullptr;
-};
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
+------------------------------------------------------------------------- */
+
+#include "pair_mesont_tpm.h"
+#include "export_mesont.h"
+
+
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+
+#include <cstring>
+#include <vector>
+#include <cmath>
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+using namespace LAMMPS_NS;
+
+//since LAMMPS is compiled with C++ 2003, define a substitution for std::array
+template<typename T, int N>
+class array2003{
+public:
+  T& operator[] (int idx){ return data[idx];};
+  const T& operator[] (int idx) const{ return data[idx];};
+private:
+  T data[N];
+};
+
+
+class MESONTList {
+public:
+  MESONTList(const Atom* atom, const NeighList* nblist, double rc2);
+  ~MESONTList() {};
+  //list of segments
+  const std::vector<array2003<int,2> >& get_segments() const;
+  //list of triplets
+  const std::vector<array2003<int,3> >& get_triplets() const;
+  //list of neighbor chains [start,end] for segments
+  //(use idx() to get real indexes)
+  const std::vector<std::vector<array2003<int,2> > >& get_nbs() const;
+  //convert idx from sorted representation to real idx
+  int get_idx(int idx) const;
+  //return list of indexes for conversion from sorted representation
+  const std::vector<int>& get_idx_list() const;
+  //convert idx from real idx to sorted representation
+  int get_idxb(int idx) const;
+  //return list of indexes for conversion to sorted representation
+  const std::vector<int>& get_idxb_list() const;
+  //check if the node is the end of the tube
+  bool is_end(int idx) const;
+
+  array2003<int, 2> get_segment(int idx) const;
+  array2003<int, 3> get_triplet(int idx) const;
+
+  static const int cnt_end = -1;
+  static const int domain_end = -2;
+  static const int not_cnt = -3;
+private:
+  std::vector<array2003<int, 2> > chain_list, segments;
+  std::vector<array2003<int, 3> > triplets;
+  std::vector<std::vector<array2003<int, 2> > > nb_chains;
+  std::vector<int> index_list, index_list_b;
+};
+
+//=============================================================================
+
+inline const std::vector<std::vector<array2003<int, 2> > > &
+ MESONTList::get_nbs() const {
+  return nb_chains;
+}
+
+inline int MESONTList::get_idx(int idx) const {
+  return index_list[idx];
+}
+
+inline const std::vector<int>& MESONTList::get_idx_list() const {
+  return index_list;
+};
+
+
+inline int MESONTList::get_idxb(int idx) const {
+  return index_list_b[idx];
+}
+
+inline const std::vector<int>& MESONTList::get_idxb_list() const {
+  return index_list_b;
+};
+
+inline const std::vector<array2003<int, 2> > & MESONTList::get_segments()
+ const {
+  return segments;
+}
+
+inline const std::vector<array2003<int, 3> > & MESONTList::get_triplets()
+ const {
+  return triplets;
+}
+
+inline array2003<int, 2> MESONTList::get_segment(int idx) const {
+  array2003<int, 2> result;
+  result[0] = chain_list[idx][0];
+  result[1] = idx;
+  return result;
+}
+
+inline array2003<int, 3> MESONTList::get_triplet(int idx) const {
+  array2003<int, 3> result;
+  result[0] = chain_list[idx][0];
+  result[1] = idx;
+  result[2] = chain_list[idx][1];
+  return result;
+}
+
+inline bool MESONTList::is_end(int idx) const {
+  return chain_list[idx][0] == cnt_end || chain_list[idx][1] == cnt_end;
+};
+
+// Append the sorted, duplicate-free union of v1 and v2 to merged.
+// Both inputs are sorted in place as a side effect. merged is NOT cleared:
+// new values are only appended while they compare greater than merged.back().
+template<typename T>
+void vector_union(std::vector<T>& v1, std::vector<T>& v2,
+ std::vector<T>& merged) {
+  typedef typename std::vector<T>::iterator iter_t;
+
+  std::sort(v1.begin(), v1.end());
+  std::sort(v2.begin(), v2.end());
+  merged.reserve(v1.size() + v2.size());
+
+  iter_t a = v1.begin();
+  iter_t b = v2.begin();
+  const iter_t a_end = v1.end();
+  const iter_t b_end = v2.end();
+
+  // classic two-way merge: take from v1 only when its head is strictly
+  // smaller (or v2 is exhausted), otherwise take from v2
+  while (a != a_end || b != b_end) {
+    const bool from_first = (b == b_end) || (a != a_end && *a < *b);
+    iter_t& src = from_first ? a : b;
+    if (merged.empty() || merged.back() < *src) merged.push_back(*src);
+    ++src;
+  }
+}
+
+MESONTList::MESONTList(const Atom* atom, const NeighList* nblist, double /* rc2 */){ // builds chain, index, segment, triplet and neighbor-chain lists; third argument is unused
+  if (atom == nullptr || nblist == nullptr) return; // without inputs all lists stay empty
+  //number of local (owned) atoms on this node
+  int nlocal = atom->nlocal;
+  //number of entries in the neighbor list (inum + gnum)
+  int nall = nblist->inum + nblist->gnum;
+  int ntot = atom->nlocal + atom->nghost; // owned + ghost atoms
+  tagint* const g_id = atom->tag; // global ids
+  tagint** const bonds = atom->bond_nt; // per-atom pair of bonded global ids (or cnt_end)
+  tagint* const chain_id = atom->molecule; // used below to detect nodes of the same tube
+  int* ilist = nblist->ilist;
+
+  //convert bonds to local id representation: chain_list[i] = {prev, next}
+  array2003<int, 2> tmp_arr;
+  tmp_arr[0] = not_cnt; tmp_arr[1] = not_cnt;
+  chain_list.resize(ntot, tmp_arr); // atoms absent from the neighbor list keep not_cnt
+  for (int ii = 0; ii < nall; ii++) {
+    int i = ilist[ii];
+    chain_list[i][0] = domain_end; // default until a bonded neighbor is found
+    chain_list[i][1] = domain_end;
+  }
+  for (int ii = 0; ii < nall; ii++) {
+    int i = ilist[ii];
+    int nnb = nblist->numneigh[i];
+    for (int m = 0; m < 2; m++)
+      if (bonds[i][m] == cnt_end) chain_list[i][m] = cnt_end; // copy tube-end markers
+    for (int j = 0; j < nnb; j++) {
+      int nb = nblist->firstneigh[i][j];
+      if (bonds[i][0] == g_id[nb]){ // nb is the neighbor named by the first bond of i
+        chain_list[i][0] = nb;
+        chain_list[nb][1] = i; // link is stored in both directions
+        break;
+      }
+    }
+  }
+
+  //reorder chains: walk every chain from its head along the "next" links
+  //index_list: list of indexes for conversion FROM reordered representation
+  index_list.reserve(nall);
+  index_list_b.resize(ntot, -1); // convert index TO reordered representation (-1 = not placed)
+  for (int i = 0; i < ntot; i++) {
+    if (chain_list[i][0] == cnt_end || chain_list[i][0] == domain_end) { // i has no predecessor: chain head
+      index_list.push_back(i);
+      index_list_b[i] = index_list.size() - 1;
+      int idx = i;
+      while (1) {
+        idx = chain_list[idx][1];
+        if (idx == cnt_end || idx == domain_end) break;
+        else index_list.push_back(idx);
+        index_list_b[idx] = index_list.size() - 1; // only reached when the loop did not break
+      }
+    }
+  }
+
+  //segment list: owned nodes only; a bond is recorded from the end with the smaller global id
+  for (int i = 0; i < nlocal; i++) {
+    if (chain_list[i][0] == not_cnt) continue;
+    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
+     g_id[i] < g_id[chain_list[i][0]]){
+      array2003<int, 2> tmp_c;
+      tmp_c[0] = i; tmp_c[1] = chain_list[i][0];
+      segments.push_back(tmp_c);
+    }
+    if (chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end &&
+     g_id[i] < g_id[chain_list[i][1]]){
+      array2003<int, 2> tmp_c;
+       tmp_c[0] = i; tmp_c[1] = chain_list[i][1];
+       segments.push_back(tmp_c);
+    }
+  }
+  int nbonds = segments.size();
+
+  //triplets: owned nodes that have real neighbors on both sides
+  for (int i = 0; i < nlocal; i++){
+    if (chain_list[i][0] == not_cnt) continue;
+    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
+     chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end)
+      triplets.push_back(get_triplet(i));
+  }
+
+  //segment neighbor list: for each segment, runs of consecutive sorted indexes
+  nb_chains.resize(nbonds);
+  std::vector<int> nb_list_i[2], nb_list;
+  for (int i = 0; i < nbonds; i++) {
+    //union of nb lists of the two segment ends (in sorted indexes)
+    for (int m = 0; m < 2; m++) {
+      nb_list_i[m].resize(0);
+      int idx = segments[i][m];
+      if (idx >= nlocal) continue; // ends with index >= nlocal contribute no neighbors
+      int nnb = nblist->numneigh[idx];
+      for (int j = 0; j < nnb; j++) {
+        int jdx = nblist->firstneigh[idx][j];
+        //no self interactions for nbs within the same tube (same molecule id, at most 5 nodes apart)
+        if (chain_id[jdx] == chain_id[idx] &&
+         std::abs(index_list_b[idx] - index_list_b[jdx]) <= 5) continue;
+        nb_list_i[m].push_back(index_list_b[jdx]);
+      }
+    }
+    vector_union(nb_list_i[0], nb_list_i[1], nb_list); // sorted, duplicate-free
+
+    int nnb = nb_list.size();
+    if (nnb > 0) {
+      int idx_s = nb_list[0]; // start of the run currently being collected
+      for (int j = 0; j < nnb; j++) {
+        //if nodes are not continuous in the sorted representation
+        //or represent chain ends, close the current run and start a new neighbor chain
+        int idx_next = chain_list[index_list[nb_list[j]]][1];
+        if ((j == nnb - 1) || (nb_list[j] + 1 != nb_list[j+1]) ||
+         (idx_next == cnt_end) || (idx_next == domain_end)) {
+          array2003<int, 2> chain;
+          chain[0] = idx_s;
+          chain[1] = nb_list[j];
+          //make sure that segments having at least one node
+          //in the neighbor list are included: widen the run by one node on each side
+          int idx0 = index_list[chain[0]]; // real id of the ends
+          int idx1 = index_list[chain[1]];
+          if (chain_list[idx0][0] != cnt_end &&
+           chain_list[idx0][0] != domain_end) chain[0] -= 1;
+          if (chain_list[idx1][1] != cnt_end &&
+           chain_list[idx1][1] != domain_end) chain[1] += 1;
+          if(chain[0] != chain[1]) nb_chains[i].push_back(chain); // single-node runs are dropped
+          idx_s = (j == nnb - 1) ? -1 : nb_list[j + 1];
+        }
+      }
+    }
+    nb_list.resize(0); // vector_union appends, so reset for the next segment
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// the cutoff distance between walls of tubes (3.0*3.4; presumably Angstroms - TODO confirm units)
+static const double TPBRcutoff  = 3.0*3.4;
+int PairMESONTTPM::instance_count = 0; // live PairMESONTTPM objects: ++ in ctor, -- in dtor
+/* ---------------------------------------------------------------------- */
+
+// Set defaults for all style options and enforce that at most one instance
+// of this pair style exists at a time.
+PairMESONTTPM::PairMESONTTPM(LAMMPS *lmp) : Pair(lmp) {
+  writedata=1;
+  BendingMode = 0;  // Harmonic bending model
+  TPMType = 0;      // Inter-tube segment-segment interaction
+  tab_path = nullptr;
+  tab_path_length = 0;
+
+  // extract() hands out the addresses of the accumulated energies, so they
+  // must hold defined values even before the first compute() with eflag set
+  energy_s = 0.0;
+  energy_b = 0.0;
+  energy_t = 0.0;
+
+  eatom_s = nullptr;
+  eatom_b = nullptr;
+  eatom_t = nullptr;
+  nmax = 0;
+  instance_count++;
+  if(instance_count > 1) error->all(FLERR,
+   "only a single instance of mesont/tpm pair style can be created");
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Release the instance slot, the table path and, if allocate() was called,
+// the per-type and per-atom arrays.
+PairMESONTTPM::~PairMESONTTPM()
+{
+  instance_count--;
+  if (tab_path != nullptr) memory->destroy(tab_path);
+
+  if (!allocated) return;
+
+  // per-atom energy contributions
+  memory->destroy(eatom_t);
+  memory->destroy(eatom_b);
+  memory->destroy(eatom_s);
+
+  // per type-pair tables
+  memory->destroy(cut);
+  memory->destroy(cutsq);
+  memory->destroy(setflag);
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Evaluate the bending, stretching and segment-tube interactions.
+// Node data is copied into a chain-ordered ("sorted") layout, handed to the
+// mesont_lib_* force-field routines, and the resulting forces, energies and
+// stresses are scattered back into the LAMMPS per-atom arrays.
+//   eflag : non-zero to tally total and per-atom energies
+//   vflag : non-zero to tally the global virial; bit value 4 additionally
+//           requests the per-atom virial
+void PairMESONTTPM::compute(int eflag, int vflag){
+  // set per atom values and accumulators
+  // reallocate per-atom arrays if necessary
+  ev_init(eflag,vflag);
+  if (atom->nmax > nmax) {
+    // free the previous buffer first; create() does not release it
+    memory->destroy(eatom_s);
+    memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
+    memory->destroy(eatom_b);
+    memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
+    memory->destroy(eatom_t);
+    memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
+    nmax = atom->nmax;
+  }
+  //total number of atoms in the node and ghost shell
+  int nall = list->inum + list->gnum;
+  int ntot = atom->nlocal + atom->nghost;
+  int newton_pair = force->newton_pair;
+  if(!newton_pair)
+   error->all(FLERR,"Pair style mesont/tpm requires newton pair on");
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *r = atom->radius;
+  double *l = atom->length;
+  int *buckling = atom->buckling;
+  tagint *g_id = atom->tag;
+
+  //check if cutoff is chosen correctly (using the tabulated tube radius
+  //and the largest per-node length)
+  double RT = mesont_lib_get_R();
+  double Lmax = 0.0;
+  for (int ii = 0; ii < list->inum; ii++) {
+    int i = list->ilist[ii];
+    if (Lmax < l[i]) Lmax = l[i];
+  }
+  double Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
+   std::pow((2.0*RT + TPBRcutoff),2)));
+  if (cut_global < Rcut_min){
+    std::stringstream err;
+    err << "The selected cutoff is too small for the current system : " <<
+     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
+     ", Rcut_min = " << Rcut_min;
+    error->all(FLERR, err.str().c_str());
+  }
+
+  //generate bonds and chain nblist
+  MESONTList ntlist(atom, list, cut_global*cut_global);
+
+  //reorder data to make it contiguous within tubes
+  //and compatible with Fortran functions
+  //(force, stress and energy buffers start zero-initialized)
+  std::vector<double> x_sort(3*nall), f_sort(3*nall), s_sort(9*nall);
+  std::vector<double> u_ts_sort(nall), u_tb_sort(nall), u_tt_sort(nall);
+  std::vector<int> b_sort(nall);
+  for (int i = 0; i < nall; i++){
+    int idx = ntlist.get_idx(i);
+    for (int j = 0; j < 3; j++) x_sort[3*i+j] = x[idx][j];
+    b_sort[i] = buckling[idx];
+  }
+
+  //bending potential
+  int n_triplets = ntlist.get_triplets().size();
+  for (int i = 0; i < n_triplets; i++) {
+    const array2003<int,3>& t = ntlist.get_triplets()[i];
+    //idx of nodes of a triplet in sorted representation
+    int idx_s0 = ntlist.get_idxb(t[0]);
+    int idx_s1 = ntlist.get_idxb(t[1]);
+    int idx_s2 = ntlist.get_idxb(t[2]);
+
+    double* X1 = &(x_sort[3*idx_s0]);
+    double* X2 = &(x_sort[3*idx_s1]);
+    double* X3 = &(x_sort[3*idx_s2]);
+    double& U1b = u_tb_sort[idx_s0];
+    double& U2b = u_tb_sort[idx_s1];
+    double& U3b = u_tb_sort[idx_s2];
+    double* F1 = &(f_sort[3*idx_s0]);
+    double* F2 = &(f_sort[3*idx_s1]);
+    double* F3 = &(f_sort[3*idx_s2]);
+    double* S1 = &(s_sort[9*idx_s0]);
+    double* S2 = &(s_sort[9*idx_s1]);
+    double* S3 = &(s_sort[9*idx_s2]);
+    //radius and length are taken from the central node of the triplet
+    double& R123 = r[t[1]];
+    double& L123 = l[t[1]];
+    int& BBF2 = b_sort[idx_s1];
+
+    mesont_lib_TubeBendingForceField(U1b, U2b, U3b, F1, F2, F3, S1, S2, S3,
+     X1, X2, X3, R123, L123, BBF2);
+  }
+
+  //share new values of buckling
+  if (BendingMode == 1){
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      buckling[idx] = b_sort[i];
+    }
+    comm->forward_comm_pair(this);
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      b_sort[i] = buckling[idx];
+    }
+  }
+
+  //segment-segment and segment-tube interactions
+  int n_segments = ntlist.get_segments().size();
+  double Rmax = 0.0;
+  Lmax = 0.0;
+  for (int i = 0; i < n_segments; i++) {
+    const array2003<int,2>& s = ntlist.get_segments()[i];
+    //idx of a segment end 1 in sorted representation
+    int idx_s0 = ntlist.get_idxb(s[0]);
+    //idx of a segment end 2 in sorted representation
+    int idx_s1 = ntlist.get_idxb(s[1]);
+    double* X1 = &(x_sort[3*idx_s0]);
+    double* X2 = &(x_sort[3*idx_s1]);
+    double length = std::sqrt(std::pow(X1[0]-X2[0],2) +
+     std::pow(X1[1]-X2[1],2) + std::pow(X1[2]-X2[2],2));
+    if (length > Lmax) Lmax = length;
+    double& U1t = u_tt_sort[idx_s0];
+    double& U2t = u_tt_sort[idx_s1];
+    double& U1s = u_ts_sort[idx_s0];
+    double& U2s = u_ts_sort[idx_s1];
+    double* F1 = &(f_sort[3*idx_s0]);
+    double* F2 = &(f_sort[3*idx_s1]);
+    double* S1 = &(s_sort[9*idx_s0]);
+    double* S2 = &(s_sort[9*idx_s1]);
+    double R12 = r[s[0]]; if (R12 > Rmax) Rmax = R12;
+    if (std::abs(R12 - RT) > 1e-3)
+      error->all(FLERR,"Inconsistent input and potential table");
+    //assume that the length of the segment is defined by the node with
+    //smallest global id
+    double L12 = (g_id[s[0]] > g_id[s[1]]) ? l[s[1]] : l[s[0]];
+    mesont_lib_TubeStretchingForceField(U1s, U2s, F1, F2, S1, S2, X1, X2,
+     R12, L12);
+
+    for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++){
+      //id of the beginning and end of the chain in the sorted representation
+      const array2003<int,2>& chain = ntlist.get_nbs()[i][nc];
+      int N = chain[1] - chain[0] + 1;  //number of elements in the chain
+      int end1 = ntlist.get_idx(chain[0]);  //chain ends (real representation)
+      int end2 = ntlist.get_idx(chain[1]);
+      double* X = &(x_sort[3*chain[0]]);
+      double* Ut = &(u_tt_sort[chain[0]]);
+      double* F = &(f_sort[3*chain[0]]);
+      double* S = &(s_sort[9*chain[0]]);
+      double R = r[end1];
+      int* BBF = &(b_sort[chain[0]]);
+      int E1 = ntlist.is_end(end1);
+      int E2 = ntlist.is_end(end2);
+
+      //Ee flags the case where the node just outside the chain (before
+      //end1 -> 1, after end2 -> 2) is itself a tube end; its data is then
+      //passed separately through Xe/Fe/Se
+      int Ee = 0;
+      double* Xe = X; double* Fe = F; double* Se = S;
+      if (!E1 && ntlist.get_triplet(end1)[0] != MESONTList::domain_end &&
+       ntlist.get_triplet(ntlist.get_triplet(end1)[0])[0] ==
+       MESONTList::cnt_end){
+        Ee = 1;
+        int idx = ntlist.get_idxb(ntlist.get_triplet(end1)[0]);
+        Xe = &(x_sort[3*idx]);
+        Fe = &(f_sort[3*idx]);
+        Se = &(s_sort[9*idx]);
+      }
+      else if (!E2 && ntlist.get_triplet(end2)[2] != MESONTList::domain_end &&
+       ntlist.get_triplet(ntlist.get_triplet(end2)[2])[2] ==
+       MESONTList::cnt_end){
+        Ee = 2;
+        int idx = ntlist.get_idxb(ntlist.get_triplet(end2)[2]);
+        Xe = &(x_sort[3*idx]);
+        Fe = &(f_sort[3*idx]);
+        Se = &(s_sort[9*idx]);
+      }
+
+      mesont_lib_SegmentTubeForceField(U1t, U2t, Ut, F1, F2, F, Fe, S1, S2, S,
+       Se, X1, X2, R12, N, X, Xe, BBF, R, E1, E2, Ee, TPMType);
+    }
+  }
+
+  //check if cutoff is chosen correctly, now with the actual segment
+  //lengths and node radii found above
+  Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
+   std::pow((2.0*Rmax + TPBRcutoff),2)));
+  if (cut_global < Rcut_min){
+    std::stringstream err;
+    //report Rmax: it is the radius this check is based on
+    err << "The selected cutoff is too small for the current system : " <<
+     "L_max = " << Lmax << ", R_max = " << Rmax << ", Rc = " << cut_global <<
+     ", Rcut_min = " << Rcut_min;
+    error->all(FLERR, err.str().c_str());
+  }
+
+  //convert from sorted representation
+  for (int i = 0; i < nall; i++){
+    int idx = ntlist.get_idx(i);
+    for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
+    buckling[idx] = b_sort[i];
+  }
+  //NOTE(review): the per-atom arrays below are written whenever eflag is
+  //non-zero; confirm they are allocated when only the global energy is
+  //requested
+  if(eflag){
+    eng_vdwl = 0.0; energy_s = 0.0;
+    energy_b = 0.0; energy_t = 0.0;
+    for (int i = 0; i < ntot; i++){
+      eatom[i] = 0.0; eatom_s[i] = 0.0;
+      eatom_b[i] = 0.0; eatom_t[i] = 0.0;
+    }
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      eatom_s[idx] = u_ts_sort[i];
+      eatom_b[idx] = u_tb_sort[i];
+      eatom_t[idx] = u_tt_sort[i];
+      eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
+      energy_s += u_ts_sort[i];
+      energy_b += u_tb_sort[i];
+      energy_t += u_tt_sort[i];
+    }
+    eng_vdwl = energy_s + energy_b + energy_t;
+  }
+  if(vflag){
+    //the global virial is a plain sum, so no index conversion is needed
+    for (int i = 0; i < 6; i++) virial[i] = 0.0;
+    for (int i = 0; i < nall; i++){
+      virial[0] += s_sort[9*i+0]; //xx
+      virial[1] += s_sort[9*i+4]; //yy
+      virial[2] += s_sort[9*i+8]; //zz
+      virial[3] += s_sort[9*i+1]; //xy
+      virial[4] += s_sort[9*i+2]; //xz
+      virial[5] += s_sort[9*i+5]; //yz
+    }
+  }
+  int vflag_atom = vflag & 4;
+  if(vflag_atom){
+    for (int i = 0; i < ntot; i++)
+      for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      vatom[idx][0] = s_sort[9*i+0]; //xx
+      vatom[idx][1] = s_sort[9*i+4]; //yy
+      vatom[idx][2] = s_sort[9*i+8]; //zz
+      vatom[idx][3] = s_sort[9*i+1]; //xy
+      vatom[idx][4] = s_sort[9*i+2]; //xz
+      vatom[idx][5] = s_sort[9*i+5]; //yz
+    }
+  }
+
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::allocate(){
+  const int ntypes = atom->ntypes;
+  allocated = 1;
+
+  // type indexes start at 1, hence ntypes+1 entries per dimension
+  memory->create(setflag,ntypes+1,ntypes+1,"pair:setflag");
+  memory->create(cutsq,ntypes+1,ntypes+1,"pair:cutsq");
+  memory->create(cut,ntypes+1,ntypes+1,"pair:cut");
+
+  // no type pair (i <= j) has coefficients yet
+  for (int itype = 1; itype <= ntypes; ++itype) {
+    for (int jtype = itype; jtype <= ntypes; ++jtype) {
+      setflag[itype][jtype] = 0;
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+// Parse the pair_style arguments and initialize the mesont library:
+//   arg[0] global cutoff
+//   arg[1] (optional) path of the potential table, default "MESONT-TABTP.xrs"
+//   arg[2] (optional) BendingMode, must be 0 or 1
+//   arg[3] (optional) TPMType, must be 0 or 1
+// Raises a LAMMPS error on a bad argument count, an out-of-range mode, or
+// a table file that cannot be opened or whose dimensions cannot be read.
+void PairMESONTTPM::settings(int narg, char **arg){
+  if ((narg == 0) || (narg > 4))
+    error->all(FLERR,"Illegal pair_style command");
+  cut_global = utils::numeric(FLERR,arg[0],false,lmp);
+
+  // reset cutoffs that have been explicitly set
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++)
+      for (j = i+1; j <= atom->ntypes; j++)
+        cut[i][j] = cut_global;
+  }
+  std::string TPMAFile = (narg > 1) ? arg[1] : "MESONT-TABTP.xrs";
+  tab_path_length = TPMAFile.length();
+  if (tab_path != nullptr) memory->destroy(tab_path);
+  //c_str returns '\0' terminated string
+  memory->create(tab_path,tab_path_length+1,"pair:path");
+  std::memcpy(tab_path, TPMAFile.c_str(), tab_path_length+1);
+  mesont_lib_SetTablePath(tab_path, tab_path_length);
+
+  if (narg > 2) {
+    BendingMode = utils::numeric(FLERR,arg[2],false,lmp);
+    if ((BendingMode < 0) || (BendingMode > 1))
+      error->all(FLERR,"Incorrect BendingMode");
+  }
+  if (narg > 3) {
+    TPMType = utils::numeric(FLERR,arg[3],false,lmp);
+    if ((TPMType < 0) || (TPMType > 1))
+      error->all(FLERR,"Incorrect TPMType");
+  }
+
+  mesont_lib_TPBInit();
+  int M = 0, N = 0;
+  std::ifstream in(TPMAFile);
+  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
+  // the two table dimensions follow three header lines
+  std::string tmp;
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  // never pass unread (garbage) dimensions on to the library
+  if (!(in >> M >> N))
+    error->all(FLERR,"Cannot read table dimensions from potential table");
+  in.close();
+  mesont_lib_TPMInit(M, N);
+  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+// Handle one pair_coeff command: arg[0] and arg[1] are type ranges, the
+// optional arg[2] is a cutoff overriding the global one for those pairs.
+void PairMESONTTPM::coeff(int narg, char **arg){
+  if (narg != 2 && narg != 3)
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error);
+  utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error);
+
+  double cut_one = cut_global;
+  if (narg == 3) cut_one = utils::numeric(FLERR,arg[2],false,lmp);
+
+  // only the upper triangle (jtype >= itype) is filled
+  int nset = 0;
+  for (int itype = ilo; itype <= ihi; ++itype) {
+    for (int jtype = MAX(jlo,itype); jtype <= jhi; ++jtype) {
+      setflag[itype][jtype] = 1;
+      cut[itype][jtype] = cut_one;
+      ++nset;
+    }
+  }
+
+  if (nset == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+// Return the cutoff for type pair (i,j); if it was never set by pair_coeff
+// it is first derived from the i-i and j-j cutoffs via mix_distance().
+double PairMESONTTPM::init_one(int i, int j){
+  double &cut_ij = cut[i][j];
+  if (!setflag[i][j]) cut_ij = mix_distance(cut[i][i],cut[j][j]);
+  return cut_ij;
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+// Write the global settings, then for every type pair (i <= j) its setflag
+// followed by the cutoff when the flag is set.
+void PairMESONTTPM::write_restart(FILE *fp){
+  write_restart_settings(fp);
+
+  for (int itype = 1; itype <= atom->ntypes; ++itype) {
+    for (int jtype = itype; jtype <= atom->ntypes; ++jtype) {
+      fwrite(&setflag[itype][jtype],sizeof(int),1,fp);
+      if (!setflag[itype][jtype]) continue;
+      fwrite(&cut[itype][jtype],sizeof(double),1,fp);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+// Mirror of write_restart(): rank 0 reads each value from the file and
+// broadcasts it to all other ranks.
+void PairMESONTTPM::read_restart(FILE *fp){
+  read_restart_settings(fp);
+  allocate();
+
+  const bool is_root = (comm->me == 0);
+  for (int itype = 1; itype <= atom->ntypes; ++itype) {
+    for (int jtype = itype; jtype <= atom->ntypes; ++jtype) {
+      int &flag = setflag[itype][jtype];
+      if (is_root) fread(&flag,sizeof(int),1,fp);
+      MPI_Bcast(&flag,1,MPI_INT,0,world);
+      if (!flag) continue;
+
+      // a cutoff is stored only for pairs whose flag is set
+      double &cut_ij = cut[itype][jtype];
+      if (is_root) fread(&cut_ij,sizeof(double),1,fp);
+      MPI_Bcast(&cut_ij,1,MPI_DOUBLE,0,world);
+    }
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+// Write the global settings in the order read_restart_settings() expects:
+// BendingMode, TPMType, cut_global, path length, path including its '\0'.
+void PairMESONTTPM::write_restart_settings(FILE *fp){
+  // model switches
+  fwrite(&BendingMode,sizeof(int),1,fp);
+  fwrite(&TPMType,sizeof(int),1,fp);
+
+  // global cutoff
+  fwrite(&cut_global,sizeof(double),1,fp);
+
+  // table path: length first, then the characters plus terminator
+  fwrite(&tab_path_length,sizeof(int),1,fp);
+  const size_t path_bytes = tab_path_length+1;
+  fwrite(tab_path,path_bytes,1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+// Read the global settings written by write_restart_settings() on rank 0,
+// broadcast them, and re-initialize the mesont library from the stored
+// table path. Raises a LAMMPS error when the table file cannot be opened or
+// its dimensions cannot be read.
+void PairMESONTTPM::read_restart_settings(FILE *fp){
+  int me = comm->me;
+  if (me == 0) {
+    fread(&BendingMode,sizeof(int),1,fp);
+    fread(&TPMType,sizeof(int),1,fp);
+    fread(&cut_global,sizeof(double),1,fp);
+    fread(&tab_path_length,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&BendingMode,1,MPI_INT,0,world);
+  MPI_Bcast(&TPMType,1,MPI_INT,0,world);
+  MPI_Bcast(&cut_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&tab_path_length,1,MPI_INT,0,world);
+
+  // the path buffer can only be sized once its length is known everywhere
+  if (tab_path != nullptr) memory->destroy(tab_path);
+  memory->create(tab_path,tab_path_length+1,"pair:path");
+  if (me == 0) fread(tab_path,tab_path_length+1,1,fp);
+  MPI_Bcast(tab_path,tab_path_length+1,MPI_CHAR,0,world);
+  mesont_lib_SetTablePath(tab_path,tab_path_length);
+  mesont_lib_TPBInit();
+  int M = 0, N = 0;
+  std::ifstream in(tab_path);
+  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
+  // the two table dimensions follow three header lines
+  std::string tmp;
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  // never pass unread (garbage) dimensions on to the library
+  if (!(in >> M >> N))
+    error->all(FLERR,"Cannot read table dimensions from potential table");
+  in.close();
+  mesont_lib_TPMInit(M, N);
+  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to data file
+------------------------------------------------------------------------- */
+
+// Emit one line per atom type containing only the type index.
+void PairMESONTTPM::write_data(FILE *fp){
+  int itype = 1;
+  while (itype <= atom->ntypes) {
+    fprintf(fp,"%d\n",itype);
+    ++itype;
+  }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes all pairs to data file
+------------------------------------------------------------------------- */
+
+// Emit "i j cutoff" for every type pair with i <= j.
+void PairMESONTTPM::write_data_all(FILE *fp){
+  for (int itype = 1; itype <= atom->ntypes; ++itype) {
+    for (int jtype = itype; jtype <= atom->ntypes; ++jtype) {
+      const double cut_ij = cut[itype][jtype];
+      fprintf(fp,"%d %d %g\n",itype,jtype,cut_ij);
+    }
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// Request a full (not half) neighbor list that also contains the neighbors
+// of ghost atoms.
+void PairMESONTTPM::init_style(){
+  const int irequest = neighbor->request(this,instance_me);
+  neighbor->requests[irequest]->ghost = true;
+  neighbor->requests[irequest]->full = true;
+  neighbor->requests[irequest]->half = false;
+}
+
+// Look up an internal quantity by keyword. The *_tot keywords return the
+// address of a total energy (double), the others the per-atom energy array
+// (double*); unknown keywords return nullptr. The second argument is not
+// modified.
+void* PairMESONTTPM::extract(const char *str, int &){
+  if (strcmp(str,"mesonttpm_Es_tot") == 0) return &energy_s;
+  else if (strcmp(str,"mesonttpm_Eb_tot") == 0) return &energy_b;
+  else if (strcmp(str,"mesonttpm_Et_tot") == 0) return &energy_t;
+  else if (strcmp(str,"mesonttpm_Es") == 0) return eatom_s;
+  else if (strcmp(str,"mesonttpm_Eb") == 0) return eatom_b;
+  else if (strcmp(str,"mesonttpm_Et") == 0) return eatom_t;
+  else return nullptr;
+}
diff --git a/src/USER-MESONT/pair_mesont_tpm.h b/src/USER-MESONT/pair_mesont_tpm.h
index 704556d75e..c3d71ae953 100644
--- a/src/USER-MESONT/pair_mesont_tpm.h
+++ b/src/USER-MESONT/pair_mesont_tpm.h
@@ -1,98 +1,99 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(mesont/tpm,PairMESONTTPM)
-
-#else
-
-#ifndef LMP_PAIR_MESONT_TPM_H
-#define LMP_PAIR_MESONT_TPM_H
-
-#include "pair.h"
-
-namespace LAMMPS_NS {
-
-class PairMESONTTPM : public Pair {
- public:
-  PairMESONTTPM(class LAMMPS *);
-  virtual ~PairMESONTTPM();
-  virtual void compute(int, int);
-  void settings(int, char **);
-  void coeff(int, char **);
-  double init_one(int, int);
-  void write_restart(FILE *);
-  void read_restart(FILE *);
-  void write_restart_settings(FILE *);
-  void read_restart_settings(FILE *);
-  void write_data(FILE *);
-  void write_data_all(FILE *);
-  virtual void init_style();
-
-  double energy_s;  // accumulated energies for stretching
-  double energy_b;  // accumulated energies for bending
-  double energy_t;  // accumulated energies for tube-tube interaction
-  double *eatom_s, *eatom_b, *eatom_t; // accumulated per-atom values
-
- protected:
-  int BendingMode, TPMType;
-  char* tab_path;
-  int tab_path_length;
-  double cut_global;
-  double **cut;
-  static int instance_count;
-
-  virtual void allocate();
-  virtual void *extract(const char *, int &);
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-E: Pair style mesont/tpm requires newton pair on
-
-newton_pair must be set to on
-
-E: The selected cutoff is too small for the current system
-
-cutoff must be increased.
-
-E: Illegal pair_style command
-
-Incorrect argument list in the style init.
-
-E: Incorrect table path
-
-Incorrect path to the table files.
-
-E: Incorrect BendingMode
-
-Self-explanatory.
-
-E: Incorrect TPMType
-
-Self-explanatory.
-
-E: Inconsistent input and potential table
-
-The tube diameter is inconsistent with the chirality specified
-during generation of the potential table.
-
-*/
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(mesont/tpm,PairMESONTTPM)
+
+#else
+
+#ifndef LMP_PAIR_MESONT_TPM_H
+#define LMP_PAIR_MESONT_TPM_H
+
+#include "pair.h"
+
+namespace LAMMPS_NS {
+
+class PairMESONTTPM : public Pair { // pair style mesont/tpm
+ public:
+  PairMESONTTPM(class LAMMPS *);
+  virtual ~PairMESONTTPM();
+  virtual void compute(int, int); // (eflag, vflag)
+  void settings(int, char **); // pair_style arguments: cutoff [table path [BendingMode [TPMType]]]
+  void coeff(int, char **); // pair_coeff arguments: itype jtype [cutoff]
+  double init_one(int, int); // returns the cutoff of a type pair
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  void write_data(FILE *);
+  void write_data_all(FILE *);
+  virtual void init_style(); // requests a full neighbor list including ghosts
+
+  double energy_s;  // accumulated energies for stretching
+  double energy_b;  // accumulated energies for bending
+  double energy_t;  // accumulated energies for tube-tube interaction
+  double *eatom_s, *eatom_b, *eatom_t; // accumulated per-atom values (stretching, bending, tube-tube)
+
+ protected:
+  int BendingMode, TPMType; // each restricted to 0 or 1 by settings()
+  char* tab_path; // '\0'-terminated path of the potential table
+  int tab_path_length; // length of tab_path without the terminator
+  double cut_global; // cutoff given in pair_style
+  double **cut; // per type-pair cutoff
+  static int instance_count; // number of live instances; a second one is an error
+  int nmax; // atom->nmax at the last (re)allocation of eatom_s/b/t
+
+  virtual void allocate();
+  virtual void *extract(const char *, int &); // keyword lookup of energies, nullptr if unknown
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Pair style mesont/tpm requires newton pair on
+
+newton_pair must be set to on
+
+E: The selected cutoff is too small for the current system
+
+cutoff must be increased.
+
+E: Illegal pair_style command
+
+Incorrect argument list in the style init.
+
+E: Incorrect table path
+
+Incorrect path to the table files.
+
+E: Incorrect BendingMode
+
+Self-explanatory.
+
+E: Incorrect TPMType
+
+Self-explanatory.
+
+E: Inconsistent input and potential table
+
+The tube diameter is inconsistent with the chirality specified
+during generation of the potential table.
+
+*/

From 769e7a099511f15e857e079722520576bf7ad940 Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 16:24:57 -0500
Subject: [PATCH 35/64] Revert "Add files via upload"

This reverts commit e6643979516195965c7261053878f4b88d9aaa2b.
---
 src/USER-MESONT/pair_mesont_tpm.cpp | 1611 +++++++++++++--------------
 src/USER-MESONT/pair_mesont_tpm.h   |  197 ++--
 2 files changed, 901 insertions(+), 907 deletions(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index a58f9892ed..9185786341 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -1,808 +1,803 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
-------------------------------------------------------------------------- */
-
-#include "pair_mesont_tpm.h"
-#include "export_mesont.h"
-
-
-#include "atom.h"
-#include "comm.h"
-#include "force.h"
-#include "memory.h"
-#include "error.h"
-#include "neighbor.h"
-#include "neigh_list.h"
-#include "neigh_request.h"
-
-#include <cstring>
-#include <vector>
-#include <cmath>
-
-#include <fstream>
-#include <sstream>
-#include <algorithm>
-
-using namespace LAMMPS_NS;
-
-//since LAMMPS is compiled with C++ 2003, define a substitution for std::array
-template<typename T, int N>
-class array2003{
-public:
-  T& operator[] (int idx){ return data[idx];};
-  const T& operator[] (int idx) const{ return data[idx];};
-private:
-  T data[N];
-};
-
-
-class MESONTList {
-public:
-  MESONTList(const Atom* atom, const NeighList* nblist, double rc2);
-  ~MESONTList() {};
-  //list of segments
-  const std::vector<array2003<int,2> >& get_segments() const;
-  //list of triplets
-  const std::vector<array2003<int,3> >& get_triplets() const;
-  //list of neighbor chains [start,end] for segments
-  //(use idx() to get real indexes)
-  const std::vector<std::vector<array2003<int,2> > >& get_nbs() const;
-  //convert idx from sorted representation to real idx
-  int get_idx(int idx) const;
-  //return list of indexes for conversion from sorted representation
-  const std::vector<int>& get_idx_list() const;
-  //convert idx from real idx to sorted representation
-  int get_idxb(int idx) const;
-  //return list of indexes for conversion to sorted representation
-  const std::vector<int>& get_idxb_list() const;
-  //check if the node is the end of the tube
-  bool is_end(int idx) const;
-
-  array2003<int, 2> get_segment(int idx) const;
-  array2003<int, 3> get_triplet(int idx) const;
-
-  static const int cnt_end = -1;
-  static const int domain_end = -2;
-  static const int not_cnt = -3;
-private:
-  std::vector<array2003<int, 2> > chain_list, segments;
-  std::vector<array2003<int, 3> > triplets;
-  std::vector<std::vector<array2003<int, 2> > > nb_chains;
-  std::vector<int> index_list, index_list_b;
-};
-
-//=============================================================================
-
-inline const std::vector<std::vector<array2003<int, 2> > > &
- MESONTList::get_nbs() const {
-  return nb_chains;
-}
-
-inline int MESONTList::get_idx(int idx) const {
-  return index_list[idx];
-}
-
-inline const std::vector<int>& MESONTList::get_idx_list() const {
-  return index_list;
-};
-
-
-inline int MESONTList::get_idxb(int idx) const {
-  return index_list_b[idx];
-}
-
-inline const std::vector<int>& MESONTList::get_idxb_list() const {
-  return index_list_b;
-};
-
-inline const std::vector<array2003<int, 2> > & MESONTList::get_segments()
- const {
-  return segments;
-}
-
-inline const std::vector<array2003<int, 3> > & MESONTList::get_triplets()
- const {
-  return triplets;
-}
-
-inline array2003<int, 2> MESONTList::get_segment(int idx) const {
-  array2003<int, 2> result;
-  result[0] = chain_list[idx][0];
-  result[1] = idx;
-  return result;
-}
-
-inline array2003<int, 3> MESONTList::get_triplet(int idx) const {
-  array2003<int, 3> result;
-  result[0] = chain_list[idx][0];
-  result[1] = idx;
-  result[2] = chain_list[idx][1];
-  return result;
-}
-
-inline bool MESONTList::is_end(int idx) const {
-  return chain_list[idx][0] == cnt_end || chain_list[idx][1] == cnt_end;
-};
-
-template<typename T>
-void vector_union(std::vector<T>& v1, std::vector<T>& v2,
- std::vector<T>& merged) {
-  std::sort(v1.begin(), v1.end());
-  std::sort(v2.begin(), v2.end());
-  merged.reserve(v1.size() + v2.size());
-  typename std::vector<T>::iterator it1 = v1.begin();
-  typename std::vector<T>::iterator it2 = v2.begin();
-
-  while (it1 != v1.end() && it2 != v2.end()) {
-    if (*it1 < *it2) {
-      if (merged.empty() || merged.back() < *it1) merged.push_back(*it1);
-        ++it1;
-    }
-    else {
-      if (merged.empty() || merged.back() < *it2) merged.push_back(*it2);
-      ++it2;
-    }
-  }
-  while (it1 != v1.end()) {
-    if (merged.empty() || merged.back() < *it1) merged.push_back(*it1);
-    ++it1;
-  }
-
-  while (it2 != v2.end()) {
-  if (merged.empty() || merged.back() < *it2) merged.push_back(*it2);
-    ++it2;
-  }
-}
-
-MESONTList::MESONTList(const Atom* atom, const NeighList* nblist, double /* rc2 */){
-  if (atom == nullptr || nblist == nullptr) return;
-  //number of local atoms at the node
-  int nlocal = atom->nlocal;
-  //total number of atoms in the node and ghost shell
-  int nall = nblist->inum + nblist->gnum;
-  int ntot = atom->nlocal + atom->nghost;
-  tagint* const g_id = atom->tag;
-  tagint** const bonds = atom->bond_nt;
-  tagint* const chain_id = atom->molecule;
-  int* ilist = nblist->ilist;
-
-  //convert bonds to local id representation
-  array2003<int, 2> tmp_arr;
-  tmp_arr[0] = not_cnt; tmp_arr[1] = not_cnt;
-  chain_list.resize(ntot, tmp_arr);
-  for (int ii = 0; ii < nall; ii++) {
-    int i = ilist[ii];
-    chain_list[i][0] = domain_end;
-    chain_list[i][1] = domain_end;
-  }
-  for (int ii = 0; ii < nall; ii++) {
-    int i = ilist[ii];
-    int nnb = nblist->numneigh[i];
-    for (int m = 0; m < 2; m++)
-      if (bonds[i][m] == cnt_end) chain_list[i][m] = cnt_end;
-    for (int j = 0; j < nnb; j++) {
-      int nb = nblist->firstneigh[i][j];
-      if (bonds[i][0] == g_id[nb]){
-        chain_list[i][0] = nb;
-        chain_list[nb][1] = i;
-        break;
-      }
-    }
-  }
-
-  //reorder chains: index list
-  //list of indexes for conversion FROM reordered representation
-  index_list.reserve(nall);
-  index_list_b.resize(ntot, -1); // convert index TO reordered representation
-  for (int i = 0; i < ntot; i++) {
-    if (chain_list[i][0] == cnt_end || chain_list[i][0] == domain_end) {
-      index_list.push_back(i);
-      index_list_b[i] = index_list.size() - 1;
-      int idx = i;
-      while (1) {
-        idx = chain_list[idx][1];
-        if (idx == cnt_end || idx == domain_end) break;
-        else index_list.push_back(idx);
-        index_list_b[idx] = index_list.size() - 1;
-      }
-    }
-  }
-
-  //segment list
-  for (int i = 0; i < nlocal; i++) {
-    if (chain_list[i][0] == not_cnt) continue;
-    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
-     g_id[i] < g_id[chain_list[i][0]]){
-      array2003<int, 2> tmp_c;
-      tmp_c[0] = i; tmp_c[1] = chain_list[i][0];
-      segments.push_back(tmp_c);
-    }
-    if (chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end &&
-     g_id[i] < g_id[chain_list[i][1]]){
-      array2003<int, 2> tmp_c;
-       tmp_c[0] = i; tmp_c[1] = chain_list[i][1];
-       segments.push_back(tmp_c);
-    }
-  }
-  int nbonds = segments.size();
-
-  //triplets
-  for (int i = 0; i < nlocal; i++){
-    if (chain_list[i][0] == not_cnt) continue;
-    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
-     chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end)
-      triplets.push_back(get_triplet(i));
-  }
-
-  //segment neighbor list
-  nb_chains.resize(nbonds);
-  std::vector<int> nb_list_i[2], nb_list;
-  for (int i = 0; i < nbonds; i++) {
-    //union of nb lists
-    for (int m = 0; m < 2; m++) {
-      nb_list_i[m].resize(0);
-      int idx = segments[i][m];
-      if (idx >= nlocal) continue;
-      int nnb = nblist->numneigh[idx];
-      for (int j = 0; j < nnb; j++) {
-        int jdx = nblist->firstneigh[idx][j];
-        //no self interactions for nbs within the same tube
-        if (chain_id[jdx] == chain_id[idx] &&
-         std::abs(index_list_b[idx] - index_list_b[jdx]) <= 5) continue;
-        nb_list_i[m].push_back(index_list_b[jdx]);
-      }
-    }
-    vector_union(nb_list_i[0], nb_list_i[1], nb_list);
-
-    int nnb = nb_list.size();
-    if (nnb > 0) {
-      int idx_s = nb_list[0];
-      for (int j = 0; j < nnb; j++) {
-        //if nodes are not continuous in the sorted representation
-        //or represent chain ends, create a new neighbor chain
-        int idx_next = chain_list[index_list[nb_list[j]]][1];
-        if ((j == nnb - 1) || (nb_list[j] + 1 != nb_list[j+1]) ||
-         (idx_next == cnt_end) || (idx_next == domain_end)) {
-          array2003<int, 2> chain;
-          chain[0] = idx_s;
-          chain[1] = nb_list[j];
-          //make sure that segments having at least one node
-          //in the neighbor list are included
-          int idx0 = index_list[chain[0]]; // real id of the ends
-          int idx1 = index_list[chain[1]];
-          if (chain_list[idx0][0] != cnt_end &&
-           chain_list[idx0][0] != domain_end) chain[0] -= 1;
-          if (chain_list[idx1][1] != cnt_end &&
-           chain_list[idx1][1] != domain_end) chain[1] += 1;
-          if(chain[0] != chain[1]) nb_chains[i].push_back(chain);
-          idx_s = (j == nnb - 1) ? -1 : nb_list[j + 1];
-        }
-      }
-    }
-    nb_list.resize(0);
-  }
-}
-
-/* ---------------------------------------------------------------------- */
-
-// the cutoff distance between walls of tubes
-static const double TPBRcutoff  = 3.0*3.4;
-int PairMESONTTPM::instance_count = 0;
-/* ---------------------------------------------------------------------- */
-
-PairMESONTTPM::PairMESONTTPM(LAMMPS *lmp) : Pair(lmp) {
-  writedata=1;
-  BendingMode = 0;  // Harmonic bending model
-  TPMType = 0;      // Inter-tube segment-segment interaction
-  tab_path = nullptr;
-  tab_path_length = 0;
-
-  eatom_s = nullptr;
-  eatom_b = nullptr;
-  eatom_t = nullptr;
-  nmax = 0;
-  instance_count++;
-  if(instance_count > 1) error->all(FLERR,
-   "only a single instance of mesont/tpm pair style can be created");
-}
-
-/* ---------------------------------------------------------------------- */
-
-PairMESONTTPM::~PairMESONTTPM()
-{
-  if (allocated) {
-    memory->destroy(setflag);
-    memory->destroy(cutsq);
-    memory->destroy(cut);
-
-    memory->destroy(eatom_s);
-    memory->destroy(eatom_b);
-    memory->destroy(eatom_t);
-  }
-  instance_count--;
-  if (tab_path != nullptr) memory->destroy(tab_path);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMESONTTPM::compute(int eflag, int vflag){
-  // set per atom values and accumulators
-  // reallocate per-atom arrays if necessary
-  ev_init(eflag,vflag);
-  if (atom->nmax > nmax) {
-    memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
-    memory->destroy(eatom_b);
-    memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
-    memory->destroy(eatom_t);
-    memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
-    nmax = atom->nmax;
-  }
-  //total number of atoms in the node and ghost shell
-  int nall = list->inum + list->gnum;
-  int ntot = atom->nlocal + atom->nghost;
-  int newton_pair = force->newton_pair;
-  if(!newton_pair)
-   error->all(FLERR,"Pair style mesont/tpm requires newton pair on");
-
-  double **x = atom->x;
-  double **f = atom->f;
-  double *r = atom->radius;
-  double *l = atom->length;
-  int *buckling = atom->buckling;
-  tagint *g_id = atom->tag;
-
-  //check if cutoff is chosen correctly
-  double RT = mesont_lib_get_R();
-  double Lmax = 0.0;
-  for (int ii = 0; ii < list->inum; ii++) {
-    int i = list->ilist[ii];
-    if (Lmax < l[i]) Lmax = l[i];
-  }
-  double Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
-   std::pow((2.0*RT + TPBRcutoff),2)));
-  if (cut_global < Rcut_min){
-    std::stringstream err;
-    err << "The selected cutoff is too small for the current system : " <<
-     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
-     ", Rcut_min = " << Rcut_min;
-    error->all(FLERR, err.str().c_str());
-  }
-
-  //generate bonds and chain nblist
-  MESONTList ntlist(atom, list, cut_global*cut_global);
-
-  //reorder data to make it contiguous within tubes
-  //and compatible with Fortran functions
-  std::vector<double> x_sort(3*nall), f_sort(3*nall), s_sort(9*nall);
-  std::vector<double> u_ts_sort(nall), u_tb_sort(nall), u_tt_sort(nall);
-  std::vector<int> b_sort(nall);
-  for (int i = 0; i < nall; i++){
-    int idx = ntlist.get_idx(i);
-    for (int j = 0; j < 3; j++) x_sort[3*i+j] = x[idx][j];
-    b_sort[i] = buckling[idx];
-  }
-
-  //bending potential
-  int n_triplets = ntlist.get_triplets().size();
-  for (int i = 0; i < n_triplets; i++) {
-    const array2003<int,3>& t = ntlist.get_triplets()[i];
-    //idx of nodes of a triplet in sorted representation
-    int idx_s0 = ntlist.get_idxb(t[0]);
-    int idx_s1 = ntlist.get_idxb(t[1]);
-    int idx_s2 = ntlist.get_idxb(t[2]);
-
-    double* X1 = &(x_sort[3*idx_s0]);
-    double* X2 = &(x_sort[3*idx_s1]);
-    double* X3 = &(x_sort[3*idx_s2]);
-    double& U1b = u_tb_sort[idx_s0];
-    double& U2b = u_tb_sort[idx_s1];
-    double& U3b = u_tb_sort[idx_s2];
-    double* F1 = &(f_sort[3*idx_s0]);
-    double* F2 = &(f_sort[3*idx_s1]);
-    double* F3 = &(f_sort[3*idx_s2]);
-    double* S1 = &(s_sort[9*idx_s0]);
-    double* S2 = &(s_sort[9*idx_s1]);
-    double* S3 = &(s_sort[9*idx_s2]);
-    double& R123 = r[t[1]];
-    double& L123 = l[t[1]];
-    int& BBF2 = b_sort[idx_s1];
-
-    mesont_lib_TubeBendingForceField(U1b, U2b, U3b, F1, F2, F3, S1, S2, S3,
-     X1, X2, X3, R123, L123, BBF2);
-  }
-
-  //share new values of buckling
-  if (BendingMode == 1){
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      buckling[idx] = b_sort[i];
-    }
-    comm->forward_comm_pair(this);
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      b_sort[i] = buckling[idx];
-    }
-  }
-
-  //segment-segment and segment-tube interactions
-  int n_segments = ntlist.get_segments().size();
-  double Rmax = 0.0;
-  Lmax = 0.0;
-  for (int i = 0; i < n_segments; i++) {
-    const array2003<int,2>& s = ntlist.get_segments()[i];
-    //idx of a segment end 1 in sorted representation
-    int idx_s0 = ntlist.get_idxb(s[0]);
-    //idx of a segment end 2 in sorted representation
-    int idx_s1 = ntlist.get_idxb(s[1]);
-    double* X1 = &(x_sort[3*idx_s0]);
-    double* X2 = &(x_sort[3*idx_s1]);
-    double length = std::sqrt(std::pow(X1[0]-X2[0],2) +
-     std::pow(X1[1]-X2[1],2) + std::pow(X1[2]-X2[2],2));
-    if (length > Lmax) Lmax = length;
-    double& U1t = u_tt_sort[idx_s0];
-    double& U2t = u_tt_sort[idx_s1];
-    double& U1s = u_ts_sort[idx_s0];
-    double& U2s = u_ts_sort[idx_s1];
-    double* F1 = &(f_sort[3*idx_s0]);
-    double* F2 = &(f_sort[3*idx_s1]);
-    double* S1 = &(s_sort[9*idx_s0]);
-    double* S2 = &(s_sort[9*idx_s1]);
-    double R12 = r[s[0]]; if (R12 > Rmax) Rmax = R12;
-    if (std::abs(R12 - RT) > 1e-3)
-        error->all(FLERR,"Inconsistent input and potential table");
-    //assume that the length of the segment is defined by the node with
-    //smallest global id
-    double L12 = (g_id[s[0]] > g_id[s[1]]) ? l[s[1]] : l[s[0]];
-    mesont_lib_TubeStretchingForceField(U1s, U2s, F1, F2, S1, S2, X1, X2,
-     R12, L12);
-
-    for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++){
-      //id of the beginning and end of the chain in the sorted representation
-      const array2003<int,2>& chain = ntlist.get_nbs()[i][nc];
-      int N = chain[1] - chain[0] + 1;  //number of elements in the chain
-      int end1 = ntlist.get_idx(chain[0]);  //chain ends (real representation)
-      int end2 = ntlist.get_idx(chain[1]);
-      double* X = &(x_sort[3*chain[0]]);
-      double* Ut = &(u_tt_sort[chain[0]]);
-      double* F = &(f_sort[3*chain[0]]);
-      double* S = &(s_sort[9*chain[0]]);
-      double R = r[end1];
-      int* BBF = &(b_sort[chain[0]]);
-      int E1 = ntlist.is_end(end1);
-      int E2 = ntlist.is_end(end2);
-
-      int Ee = 0;
-      double* Xe = X; double* Fe = F; double* Se = S;
-      if (!E1 && ntlist.get_triplet(end1)[0] != MESONTList::domain_end &&
-       ntlist.get_triplet(ntlist.get_triplet(end1)[0])[0] ==
-       MESONTList::cnt_end){
-        Ee = 1;
-        int idx = ntlist.get_idxb(ntlist.get_triplet(end1)[0]);
-        Xe = &(x_sort[3*idx]);
-        Fe = &(f_sort[3*idx]);
-        Se = &(s_sort[9*idx]);
-      }
-      else if (!E2 && ntlist.get_triplet(end2)[2] != MESONTList::domain_end &&
-       ntlist.get_triplet(ntlist.get_triplet(end2)[2])[2] ==
-       MESONTList::cnt_end){
-        Ee = 2;
-        int idx = ntlist.get_idxb(ntlist.get_triplet(end2)[2]);
-        Xe = &(x_sort[3*idx]);
-        Fe = &(f_sort[3*idx]);
-        Se = &(s_sort[9*idx]);
-      }
-
-      mesont_lib_SegmentTubeForceField(U1t, U2t, Ut, F1, F2, F, Fe, S1, S2, S,
-       Se, X1, X2, R12, N, X, Xe, BBF, R, E1, E2, Ee, TPMType);
-    }
-  }
-
-  //check if cutoff is chosen correctly
-  Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
-   std::pow((2.0*Rmax + TPBRcutoff),2)));
-  if (cut_global < Rcut_min){
-    std::stringstream err;
-    err << "The selected cutoff is too small for the current system : " <<
-     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
-     ", Rcut_min = " << Rcut_min;
-    error->all(FLERR, err.str().c_str());
-  }
-
-  //convert from sorted representation
-  for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
-      buckling[idx] = b_sort[i];
-  }
-  if(eflag){
-    eng_vdwl = 0.0; energy_s = 0.0;
-    energy_b = 0.0; energy_t = 0.0;
-    for (int i = 0; i < ntot; i++){
-      eatom[i] = 0.0; eatom_s[i] = 0.0;
-      eatom_b[i] = 0.0; eatom_t[i] = 0.0;
-    }
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      eatom_s[idx] = u_ts_sort[i];
-      eatom_b[idx] = u_tb_sort[i];
-      eatom_t[idx] = u_tt_sort[i];
-      eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
-      energy_s += u_ts_sort[i];
-      energy_b += u_tb_sort[i];
-      energy_t += u_tt_sort[i];
-    }
-    eng_vdwl = energy_s + energy_b + energy_t;
-  }
-  if(vflag){
-    for (int i = 0; i < 6; i++) virial[i] = 0.0;
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      virial[0] += s_sort[9*i+0]; //xx
-      virial[1] += s_sort[9*i+4]; //yy
-      virial[2] += s_sort[9*i+8]; //zz
-      virial[3] += s_sort[9*i+1]; //xy
-      virial[4] += s_sort[9*i+2]; //xz
-      virial[5] += s_sort[9*i+5]; //yz
-    }
-  }
-  int vflag_atom = vflag & 4;
-  if(vflag_atom){
-    for (int i = 0; i < ntot; i++)
-      for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
-    for (int i = 0; i < nall; i++){
-      int idx = ntlist.get_idx(i);
-      vatom[idx][0] = s_sort[9*i+0]; //xx
-      vatom[idx][1] = s_sort[9*i+4]; //yy
-      vatom[idx][2] = s_sort[9*i+8]; //zz
-      vatom[idx][3] = s_sort[9*i+1]; //xy
-      vatom[idx][4] = s_sort[9*i+2]; //xz
-      vatom[idx][5] = s_sort[9*i+5]; //yz
-    }
-  }
-
-}
-
-/* ----------------------------------------------------------------------
-   allocate all arrays
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::allocate(){
-  allocated = 1;
-  int n = atom->ntypes;
-
-  memory->create(setflag,n+1,n+1,"pair:setflag");
-  for (int i = 1; i <= n; i++)
-    for (int j = i; j <= n; j++)
-      setflag[i][j] = 0;
-
-  memory->create(cutsq,n+1,n+1,"pair:cutsq");
-  memory->create(cut,n+1,n+1,"pair:cut");
-}
-
-/* ----------------------------------------------------------------------
-   global settings
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::settings(int narg, char **arg){
-  if ((narg == 0) || (narg > 4))
-    error->all(FLERR,"Illegal pair_style command");
-  cut_global = utils::numeric(FLERR,arg[0],false,lmp);
-
-  // reset cutoffs that have been explicitly set
-  if (allocated) {
-    int i,j;
-    for (i = 1; i <= atom->ntypes; i++)
-      for (j = i+1; j <= atom->ntypes; j++)
-        cut[i][j] = cut_global;
-  }
-  std::string TPMAFile = (narg > 1) ? arg[1] : "MESONT-TABTP.xrs";
-  tab_path_length = TPMAFile.length();
-  if (tab_path != nullptr) memory->destroy(tab_path);
-  //c_str returns '\0' terminated string
-  memory->create(tab_path,tab_path_length+1,"pair:path");
-  std::memcpy(tab_path, TPMAFile.c_str(), tab_path_length+1);
-  mesont_lib_SetTablePath(tab_path, tab_path_length);
-
-  if (narg > 2) {
-    BendingMode = utils::numeric(FLERR,arg[2],false,lmp);
-    if ((BendingMode < 0) || (BendingMode > 1))
-      error->all(FLERR,"Incorrect BendingMode");
-  }
-  if (narg > 3) {
-    TPMType = utils::numeric(FLERR,arg[3],false,lmp);
-    if ((TPMType < 0) || (TPMType > 1))
-      error->all(FLERR,"Incorrect TPMType");
-  }
-
-  mesont_lib_TPBInit();
-  int M, N;
-  std::ifstream in(TPMAFile);
-  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
-  std::string tmp;
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  in >> M >> N;
-  in.close();
-  mesont_lib_TPMInit(M, N);
-  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
-}
-
-/* ----------------------------------------------------------------------
-   set coeffs for one or more type pairs
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::coeff(int narg, char **arg){
-  if ((narg < 2) || (narg > 3))
-    error->all(FLERR,"Incorrect args for pair coefficients");
-
-  if (!allocated) allocate();
-
-  int ilo,ihi,jlo,jhi;
-  utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error);
-  utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error);
-
-  double cut_one = cut_global;
-  if (narg == 3) cut_one = utils::numeric(FLERR,arg[2],false,lmp);
-
-  int count = 0;
-  for (int i = ilo; i <= ihi; i++) {
-    for (int j = MAX(jlo,i); j <= jhi; j++) {
-      cut[i][j] = cut_one;
-      setflag[i][j] = 1;
-      count++;
-    }
-  }
-
-  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
-}
-
-/* ----------------------------------------------------------------------
-   init for one type pair i,j and corresponding j,i
-------------------------------------------------------------------------- */
-
-double PairMESONTTPM::init_one(int i, int j){
-  if (setflag[i][j] == 0) {
-    cut[i][j] = mix_distance(cut[i][i],cut[j][j]);
-  }
-
-  return cut[i][j];
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_restart(FILE *fp){
-  write_restart_settings(fp);
-
-  int i,j;
-  for (i = 1; i <= atom->ntypes; i++)
-    for (j = i; j <= atom->ntypes; j++) {
-      fwrite(&setflag[i][j],sizeof(int),1,fp);
-      if (setflag[i][j]) {
-        fwrite(&cut[i][j],sizeof(double),1,fp);
-      }
-    }
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::read_restart(FILE *fp){
-  read_restart_settings(fp);
-  allocate();
-
-  int i,j;
-  int me = comm->me;
-  for (i = 1; i <= atom->ntypes; i++)
-    for (j = i; j <= atom->ntypes; j++) {
-      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
-      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
-      if (setflag[i][j]) {
-        if (me == 0) {
-          fread(&cut[i][j],sizeof(double),1,fp);
-        }
-        MPI_Bcast(&cut[i][j],1,MPI_DOUBLE,0,world);
-      }
-    }
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to restart file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_restart_settings(FILE *fp){
-  fwrite(&BendingMode,sizeof(int),1,fp);
-  fwrite(&TPMType,sizeof(int),1,fp);
-  fwrite(&cut_global,sizeof(double),1,fp);
-  fwrite(&tab_path_length,sizeof(int),1,fp);
-  fwrite(tab_path,tab_path_length+1,1,fp);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 reads from restart file, bcasts
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::read_restart_settings(FILE *fp){
-  int me = comm->me;
-  if (me == 0) {
-    fread(&BendingMode,sizeof(int),1,fp);
-    fread(&TPMType,sizeof(int),1,fp);
-    fread(&cut_global,sizeof(double),1,fp);
-    fread(&tab_path_length,sizeof(int),1,fp);
-  }
-  MPI_Bcast(&BendingMode,1,MPI_INT,0,world);
-  MPI_Bcast(&TPMType,1,MPI_INT,0,world);
-  MPI_Bcast(&cut_global,1,MPI_DOUBLE,0,world);
-  MPI_Bcast(&tab_path_length,1,MPI_INT,0,world);
-
-  if (tab_path != nullptr) memory->destroy(tab_path);
-  memory->create(tab_path,tab_path_length+1,"pair:path");
-  if (me == 0) fread(tab_path,tab_path_length+1,1,fp);
-  MPI_Bcast(tab_path,tab_path_length+1,MPI_CHAR,0,world);
-  mesont_lib_SetTablePath(tab_path,tab_path_length);
-  mesont_lib_TPBInit();
-  int M, N;
-  std::ifstream in(tab_path);
-  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
-  std::string tmp;
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  std::getline(in,tmp);
-  in >> M >> N;
-  in.close();
-  mesont_lib_TPMInit(M, N);
-  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes to data file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_data(FILE *fp){
-  for (int i = 1; i <= atom->ntypes; i++)
-    fprintf(fp,"%d\n",i);
-}
-
-/* ----------------------------------------------------------------------
-   proc 0 writes all pairs to data file
-------------------------------------------------------------------------- */
-
-void PairMESONTTPM::write_data_all(FILE *fp){
-  for (int i = 1; i <= atom->ntypes; i++)
-    for (int j = i; j <= atom->ntypes; j++)
-      fprintf(fp,"%d %d %g\n",i,j,cut[i][j]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void PairMESONTTPM::init_style(){
-  //make sure that a full list is created (including ghost nodes)
-  int r = neighbor->request(this,instance_me);
-  neighbor->requests[r]->half = false;
-  neighbor->requests[r]->full = true;
-  neighbor->requests[r]->ghost = true;
-}
-
-void* PairMESONTTPM::extract(const char *str, int &){
-  if (strcmp(str,"mesonttpm_Es_tot") == 0) return &energy_s;
-  else if (strcmp(str,"mesonttpm_Eb_tot") == 0) return &energy_b;
-  else if (strcmp(str,"mesonttpm_Et_tot") == 0) return &energy_t;
-  else if (strcmp(str,"mesonttpm_Es") == 0) return eatom_s;
-  else if (strcmp(str,"mesonttpm_Eb") == 0) return eatom_b;
-  else if (strcmp(str,"mesonttpm_Et") == 0) return eatom_t;
-  else return nullptr;
-};
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://lammps.sandia.gov/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
+------------------------------------------------------------------------- */
+
+#include "pair_mesont_tpm.h"
+#include "export_mesont.h"
+
+
+#include "atom.h"
+#include "comm.h"
+#include "force.h"
+#include "memory.h"
+#include "error.h"
+#include "neighbor.h"
+#include "neigh_list.h"
+#include "neigh_request.h"
+
+#include <cstring>
+#include <vector>
+#include <cmath>
+
+#include <fstream>
+#include <sstream>
+#include <algorithm>
+
+using namespace LAMMPS_NS;
+
+//since LAMMPS is compiled with C++ 2003, define a substitution for std::array
+template<typename T, int N> // fixed-size array holding N elements of type T
+class array2003{
+public:
+  T& operator[] (int idx){ return data[idx];}; // writable element, no bounds check
+  const T& operator[] (int idx) const{ return data[idx];}; // read-only element
+private:
+  T data[N]; // no constructor is declared, so elements start default-initialized
+};
+
+
+class MESONTList {
+public:
+  MESONTList(const Atom* atom, const NeighList* nblist, double rc2); //rc2 is unused
+  ~MESONTList() {};
+  //list of segments
+  const std::vector<array2003<int,2> >& get_segments() const;
+  //list of triplets
+  const std::vector<array2003<int,3> >& get_triplets() const;
+  //list of neighbor chains [start,end] for segments
+  //(use get_idx() to get real indexes)
+  const std::vector<std::vector<array2003<int,2> > >& get_nbs() const;
+  //convert idx from sorted representation to real idx (idx is not range-checked)
+  int get_idx(int idx) const;
+  //return list of indexes for conversion from sorted representation
+  const std::vector<int>& get_idx_list() const;
+  //convert idx from real idx to sorted representation (idx is not range-checked)
+  int get_idxb(int idx) const;
+  //return list of indexes for conversion to sorted representation
+  const std::vector<int>& get_idxb_list() const;
+  //check if the node is the end of the tube (either of its links equals cnt_end)
+  bool is_end(int idx) const;
+
+  array2003<int, 2> get_segment(int idx) const; //{chain_list[idx][0], idx}
+  array2003<int, 3> get_triplet(int idx) const; //{link [0], idx, link [1]}
+
+  static const int cnt_end = -1; //link value that is_end() tests for
+  static const int domain_end = -2; //link value first given to atoms in ilist
+  static const int not_cnt = -3; //initial link value of every chain_list entry
+private:
+  std::vector<array2003<int, 2> > chain_list, segments; //chain_list: 2 links per atom
+  std::vector<array2003<int, 3> > triplets;
+  std::vector<std::vector<array2003<int, 2> > > nb_chains;
+  std::vector<int> index_list, index_list_b; //sorted->real and real->sorted maps
+};
+
+//=============================================================================
+
+inline const std::vector<std::vector<array2003<int, 2> > > &
+ MESONTList::get_nbs() const {
+  return nb_chains; // returned by const reference, no copy is made
+}
+
+inline int MESONTList::get_idx(int idx) const {
+  return index_list[idx]; // sorted position -> real index; idx is not range-checked
+}
+
+inline const std::vector<int>& MESONTList::get_idx_list() const {
+  return index_list; // the whole sorted -> real table, by const reference
+};
+
+
+inline int MESONTList::get_idxb(int idx) const {
+  return index_list_b[idx]; // real index -> sorted position; idx is not range-checked
+}
+
+inline const std::vector<int>& MESONTList::get_idxb_list() const {
+  return index_list_b; // the whole real -> sorted table, by const reference
+};
+
+inline const std::vector<array2003<int, 2> > & MESONTList::get_segments()
+ const {
+  return segments; // returned by const reference, no copy is made
+}
+
+inline const std::vector<array2003<int, 3> > & MESONTList::get_triplets()
+ const {
+  return triplets; // returned by const reference, no copy is made
+}
+
+inline array2003<int, 2> MESONTList::get_segment(int idx) const {
+  array2003<int, 2> result;
+  result[0] = chain_list[idx][0]; // may hold a negative sentinel, not an index
+  result[1] = idx;
+  return result; // built on each call; not read from the segments list
+}
+
+inline array2003<int, 3> MESONTList::get_triplet(int idx) const {
+  array2003<int, 3> result;
+  result[0] = chain_list[idx][0]; // may hold a negative sentinel, not an index
+  result[1] = idx;
+  result[2] = chain_list[idx][1]; // may hold a negative sentinel, not an index
+  return result; // built on each call; not read from the triplets list
+}
+
+inline bool MESONTList::is_end(int idx) const {
+  return chain_list[idx][0] == cnt_end || chain_list[idx][1] == cnt_end;
+};
+
+template<typename T>
+void vector_union(std::vector<T>& v1, std::vector<T>& v2,
+ std::vector<T>& merged) {
+  std::sort(v1.begin(), v1.end());
+  std::sort(v2.begin(), v2.end());
+  merged.reserve(v1.size() + v2.size());
+  typename std::vector<T>::iterator it1 = v1.begin();
+  typename std::vector<T>::iterator it2 = v2.begin();
+
+  while (it1 != v1.end() && it2 != v2.end()) {
+    if (*it1 < *it2) {
+      if (merged.empty() || merged.back() < *it1) merged.push_back(*it1);
+        ++it1;
+    }
+    else {
+      if (merged.empty() || merged.back() < *it2) merged.push_back(*it2);
+      ++it2;
+    }
+  }
+  while (it1 != v1.end()) {
+    if (merged.empty() || merged.back() < *it1) merged.push_back(*it1);
+    ++it1;
+  }
+
+  while (it2 != v2.end()) {
+  if (merged.empty() || merged.back() < *it2) merged.push_back(*it2);
+    ++it2;
+  }
+}
+
+MESONTList::MESONTList(const Atom* atom, const NeighList* nblist, double /* rc2 */){
+  if (atom == nullptr || nblist == nullptr) return;
+  //number of local atoms at the node
+  int nlocal = atom->nlocal;
+  //total number of atoms in the node and ghost shell
+  int nall = nblist->inum + nblist->gnum;
+  int ntot = atom->nlocal + atom->nghost;
+  tagint* const g_id = atom->tag;
+  tagint** const bonds = atom->bond_nt;
+  tagint* const chain_id = atom->molecule;
+  int* ilist = nblist->ilist;
+
+  //convert bonds to local id representation
+  array2003<int, 2> tmp_arr;
+  tmp_arr[0] = not_cnt; tmp_arr[1] = not_cnt;
+  chain_list.resize(ntot, tmp_arr);
+  for (int ii = 0; ii < nall; ii++) {
+    int i = ilist[ii];
+    chain_list[i][0] = domain_end;
+    chain_list[i][1] = domain_end;
+  }
+  for (int ii = 0; ii < nall; ii++) {
+    int i = ilist[ii];
+    int nnb = nblist->numneigh[i];
+    for (int m = 0; m < 2; m++)
+      if (bonds[i][m] == cnt_end) chain_list[i][m] = cnt_end;
+    for (int j = 0; j < nnb; j++) {
+      int nb = nblist->firstneigh[i][j];
+      if (bonds[i][0] == g_id[nb]){
+        chain_list[i][0] = nb;
+        chain_list[nb][1] = i;
+        break;
+      }
+    }
+  }
+
+  //reorder chains: index list
+  //list of indexes for conversion FROM reordered representation
+  index_list.reserve(nall);
+  index_list_b.resize(ntot, -1); // convert index TO reordered representation
+  for (int i = 0; i < ntot; i++) {
+    if (chain_list[i][0] == cnt_end || chain_list[i][0] == domain_end) {
+      index_list.push_back(i);
+      index_list_b[i] = index_list.size() - 1;
+      int idx = i;
+      while (1) {
+        idx = chain_list[idx][1];
+        if (idx == cnt_end || idx == domain_end) break;
+        else index_list.push_back(idx);
+        index_list_b[idx] = index_list.size() - 1;
+      }
+    }
+  }
+
+  //segment list
+  for (int i = 0; i < nlocal; i++) {
+    if (chain_list[i][0] == not_cnt) continue;
+    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
+     g_id[i] < g_id[chain_list[i][0]]){
+      array2003<int, 2> tmp_c;
+      tmp_c[0] = i; tmp_c[1] = chain_list[i][0];
+      segments.push_back(tmp_c);
+    }
+    if (chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end &&
+     g_id[i] < g_id[chain_list[i][1]]){
+      array2003<int, 2> tmp_c;
+       tmp_c[0] = i; tmp_c[1] = chain_list[i][1];
+       segments.push_back(tmp_c);
+    }
+  }
+  int nbonds = segments.size();
+
+  //triplets
+  for (int i = 0; i < nlocal; i++){
+    if (chain_list[i][0] == not_cnt) continue;
+    if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
+     chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end)
+      triplets.push_back(get_triplet(i));
+  }
+
+  //segment neighbor list
+  nb_chains.resize(nbonds);
+  std::vector<int> nb_list_i[2], nb_list;
+  for (int i = 0; i < nbonds; i++) {
+    //union of nb lists
+    for (int m = 0; m < 2; m++) {
+      nb_list_i[m].resize(0);
+      int idx = segments[i][m];
+      if (idx >= nlocal) continue;
+      int nnb = nblist->numneigh[idx];
+      for (int j = 0; j < nnb; j++) {
+        int jdx = nblist->firstneigh[idx][j];
+        //no self interactions for nbs within the same tube
+        if (chain_id[jdx] == chain_id[idx] &&
+         std::abs(index_list_b[idx] - index_list_b[jdx]) <= 5) continue;
+        nb_list_i[m].push_back(index_list_b[jdx]);
+      }
+    }
+    vector_union(nb_list_i[0], nb_list_i[1], nb_list);
+
+    int nnb = nb_list.size();
+    if (nnb > 0) {
+      int idx_s = nb_list[0];
+      for (int j = 0; j < nnb; j++) {
+        //if nodes are not continuous in the sorted representation
+        //or represent chain ends, create a new neighbor chain
+        int idx_next = chain_list[index_list[nb_list[j]]][1];
+        if ((j == nnb - 1) || (nb_list[j] + 1 != nb_list[j+1]) ||
+         (idx_next == cnt_end) || (idx_next == domain_end)) {
+          array2003<int, 2> chain;
+          chain[0] = idx_s;
+          chain[1] = nb_list[j];
+          //make sure that segments having at least one node
+          //in the neighbor list are included
+          int idx0 = index_list[chain[0]]; // real id of the ends
+          int idx1 = index_list[chain[1]];
+          if (chain_list[idx0][0] != cnt_end &&
+           chain_list[idx0][0] != domain_end) chain[0] -= 1;
+          if (chain_list[idx1][1] != cnt_end &&
+           chain_list[idx1][1] != domain_end) chain[1] += 1;
+          if(chain[0] != chain[1]) nb_chains[i].push_back(chain);
+          idx_s = (j == nnb - 1) ? -1 : nb_list[j + 1];
+        }
+      }
+    }
+    nb_list.resize(0);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+
+// the cutoff distance between walls of tubes
+static const double TPBRcutoff  = 3.0*3.4;
+int PairMESONTTPM::instance_count = 0;
+/* ---------------------------------------------------------------------- */
+
+PairMESONTTPM::PairMESONTTPM(LAMMPS *lmp) : Pair(lmp) {
+  writedata=1;
+  BendingMode = 0;  // Harmonic bending model
+  TPMType = 0;      // Inter-tube segment-segment interaction
+  tab_path = nullptr;
+  tab_path_length = 0;
+
+  eatom_s = nullptr;
+  eatom_b = nullptr;
+  eatom_t = nullptr;
+  instance_count++;
+  if(instance_count > 1) error->all(FLERR,
+   "only a single instance of mesont/tpm pair style can be created");
+}
+
+/* ---------------------------------------------------------------------- */
+
+PairMESONTTPM::~PairMESONTTPM()
+{
+  if (allocated) {
+    memory->destroy(setflag);
+    memory->destroy(cutsq);
+    memory->destroy(cut);
+
+    memory->destroy(eatom_s);
+    memory->destroy(eatom_b);
+    memory->destroy(eatom_t);
+  }
+  instance_count--;
+  if (tab_path != nullptr) memory->destroy(tab_path);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMESONTTPM::compute(int eflag, int vflag){
+  ev_init(eflag,vflag);
+  //total number of atoms in the node and ghost shell
+  int nall = list->inum + list->gnum;
+  int ntot = atom->nlocal + atom->nghost;
+  int newton_pair = force->newton_pair;
+  if(!newton_pair)
+   error->all(FLERR,"Pair style mesont/tpm requires newton pair on");
+
+  double **x = atom->x;
+  double **f = atom->f;
+  double *r = atom->radius;
+  double *l = atom->length;
+  int *buckling = atom->buckling;
+  tagint *g_id = atom->tag;
+
+  //check if cutoff is chosen correctly
+  double RT = mesont_lib_get_R();
+  double Lmax = 0.0;
+  for (int ii = 0; ii < list->inum; ii++) {
+    int i = list->ilist[ii];
+    if (Lmax < l[i]) Lmax = l[i];
+  }
+  double Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
+   std::pow((2.0*RT + TPBRcutoff),2)));
+  if (cut_global < Rcut_min){
+    std::stringstream err;
+    err << "The selected cutoff is too small for the current system : " <<
+     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
+     ", Rcut_min = " << Rcut_min;
+    error->all(FLERR, err.str().c_str());
+  }
+
+  //generate bonds and chain nblist
+  MESONTList ntlist(atom, list, cut_global*cut_global);
+
+  //reorder data to make it contiguous within tubes
+  //and compatible with Fortran functions
+  std::vector<double> x_sort(3*nall), f_sort(3*nall), s_sort(9*nall);
+  std::vector<double> u_ts_sort(nall), u_tb_sort(nall), u_tt_sort(nall);
+  std::vector<int> b_sort(nall);
+  for (int i = 0; i < nall; i++){
+    int idx = ntlist.get_idx(i);
+    for (int j = 0; j < 3; j++) x_sort[3*i+j] = x[idx][j];
+    b_sort[i] = buckling[idx];
+  }
+
+  //bending potential
+  int n_triplets = ntlist.get_triplets().size();
+  for (int i = 0; i < n_triplets; i++) {
+    const array2003<int,3>& t = ntlist.get_triplets()[i];
+    //idx of nodes of a triplet in sorted representation
+    int idx_s0 = ntlist.get_idxb(t[0]);
+    int idx_s1 = ntlist.get_idxb(t[1]);
+    int idx_s2 = ntlist.get_idxb(t[2]);
+
+    double* X1 = &(x_sort[3*idx_s0]);
+    double* X2 = &(x_sort[3*idx_s1]);
+    double* X3 = &(x_sort[3*idx_s2]);
+    double& U1b = u_tb_sort[idx_s0];
+    double& U2b = u_tb_sort[idx_s1];
+    double& U3b = u_tb_sort[idx_s2];
+    double* F1 = &(f_sort[3*idx_s0]);
+    double* F2 = &(f_sort[3*idx_s1]);
+    double* F3 = &(f_sort[3*idx_s2]);
+    double* S1 = &(s_sort[9*idx_s0]);
+    double* S2 = &(s_sort[9*idx_s1]);
+    double* S3 = &(s_sort[9*idx_s2]);
+    double& R123 = r[t[1]];
+    double& L123 = l[t[1]];
+    int& BBF2 = b_sort[idx_s1];
+
+    mesont_lib_TubeBendingForceField(U1b, U2b, U3b, F1, F2, F3, S1, S2, S3,
+     X1, X2, X3, R123, L123, BBF2);
+  }
+
+  //share new values of buckling
+  if (BendingMode == 1){
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      buckling[idx] = b_sort[i];
+    }
+    comm->forward_comm_pair(this);
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      b_sort[i] = buckling[idx];
+    }
+  }
+
+  //segment-segment and segment-tube interactions
+  int n_segments = ntlist.get_segments().size();
+  double Rmax = 0.0;
+  Lmax = 0.0;
+  for (int i = 0; i < n_segments; i++) {
+    const array2003<int,2>& s = ntlist.get_segments()[i];
+    //idx of a segment end 1 in sorted representation
+    int idx_s0 = ntlist.get_idxb(s[0]);
+    //idx of a segment end 2 in sorted representation
+    int idx_s1 = ntlist.get_idxb(s[1]);
+    double* X1 = &(x_sort[3*idx_s0]);
+    double* X2 = &(x_sort[3*idx_s1]);
+    double length = std::sqrt(std::pow(X1[0]-X2[0],2) +
+     std::pow(X1[1]-X2[1],2) + std::pow(X1[2]-X2[2],2));
+    if (length > Lmax) Lmax = length;
+    double& U1t = u_tt_sort[idx_s0];
+    double& U2t = u_tt_sort[idx_s1];
+    double& U1s = u_ts_sort[idx_s0];
+    double& U2s = u_ts_sort[idx_s1];
+    double* F1 = &(f_sort[3*idx_s0]);
+    double* F2 = &(f_sort[3*idx_s1]);
+    double* S1 = &(s_sort[9*idx_s0]);
+    double* S2 = &(s_sort[9*idx_s1]);
+    double R12 = r[s[0]]; if (R12 > Rmax) Rmax = R12;
+    if (std::abs(R12 - RT) > 1e-3)
+        error->all(FLERR,"Inconsistent input and potential table");
+    //assume that the length of the segment is defined by the node with
+    //smallest global id
+    double L12 = (g_id[s[0]] > g_id[s[1]]) ? l[s[1]] : l[s[0]];
+    mesont_lib_TubeStretchingForceField(U1s, U2s, F1, F2, S1, S2, X1, X2,
+     R12, L12);
+
+    for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++){
+      //id of the beginning and end of the chain in the sorted representation
+      const array2003<int,2>& chain = ntlist.get_nbs()[i][nc];
+      int N = chain[1] - chain[0] + 1;  //number of elements in the chain
+      int end1 = ntlist.get_idx(chain[0]);  //chain ends (real representation)
+      int end2 = ntlist.get_idx(chain[1]);
+      double* X = &(x_sort[3*chain[0]]);
+      double* Ut = &(u_tt_sort[chain[0]]);
+      double* F = &(f_sort[3*chain[0]]);
+      double* S = &(s_sort[9*chain[0]]);
+      double R = r[end1];
+      int* BBF = &(b_sort[chain[0]]);
+      int E1 = ntlist.is_end(end1);
+      int E2 = ntlist.is_end(end2);
+
+      int Ee = 0;
+      double* Xe = X; double* Fe = F; double* Se = S;
+      if (!E1 && ntlist.get_triplet(end1)[0] != MESONTList::domain_end &&
+       ntlist.get_triplet(ntlist.get_triplet(end1)[0])[0] ==
+       MESONTList::cnt_end){
+        Ee = 1;
+        int idx = ntlist.get_idxb(ntlist.get_triplet(end1)[0]);
+        Xe = &(x_sort[3*idx]);
+        Fe = &(f_sort[3*idx]);
+        Se = &(s_sort[9*idx]);
+      }
+      else if (!E2 && ntlist.get_triplet(end2)[2] != MESONTList::domain_end &&
+       ntlist.get_triplet(ntlist.get_triplet(end2)[2])[2] ==
+       MESONTList::cnt_end){
+        Ee = 2;
+        int idx = ntlist.get_idxb(ntlist.get_triplet(end2)[2]);
+        Xe = &(x_sort[3*idx]);
+        Fe = &(f_sort[3*idx]);
+        Se = &(s_sort[9*idx]);
+      }
+
+      mesont_lib_SegmentTubeForceField(U1t, U2t, Ut, F1, F2, F, Fe, S1, S2, S,
+       Se, X1, X2, R12, N, X, Xe, BBF, R, E1, E2, Ee, TPMType);
+    }
+  }
+
+  //check if cutoff is chosen correctly
+  Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
+   std::pow((2.0*Rmax + TPBRcutoff),2)));
+  if (cut_global < Rcut_min){
+    std::stringstream err;
+    err << "The selected cutoff is too small for the current system : " <<
+     "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
+     ", Rcut_min = " << Rcut_min;
+    error->all(FLERR, err.str().c_str());
+  }
+
+  // set per atom values and accumulators
+  // reallocate per-atom arrays if necessary
+  if (eatom_s == nullptr)
+   memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
+  if (eatom_b == nullptr)
+   memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
+  if (eatom_t == nullptr)
+   memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
+  if (atom->nmax > maxeatom) {
+    maxeatom = atom->nmax;
+    memory->destroy(eatom);
+    memory->create(eatom,comm->nthreads*maxeatom,"pair:eatom");
+    memory->destroy(eatom_s);
+    memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
+    memory->destroy(eatom_b);
+    memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
+    memory->destroy(eatom_t);
+    memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
+  }
+
+  if (atom->nmax > maxvatom) {
+    maxvatom = atom->nmax;
+    memory->destroy(vatom);
+    memory->create(vatom,comm->nthreads*maxvatom,6,"pair:vatom");
+  }
+
+  // zero accumulators
+  eng_vdwl = 0.0; energy_s = 0.0;
+  energy_b = 0.0; energy_t = 0.0;
+  for (int i = 0; i < 6; i++) virial[i] = 0.0;
+  for (int i = 0; i < ntot; i++){
+    eatom[i] = 0.0; eatom_s[i] = 0.0;
+    eatom_b[i] = 0.0; eatom_t[i] = 0.0;
+  }
+  for (int i = 0; i < ntot; i++)
+    for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
+
+  //convert from sorted representation
+  for (int i = 0; i < nall; i++){
+    int idx = ntlist.get_idx(i);
+    for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
+    eatom_s[idx] = u_ts_sort[i];
+    eatom_b[idx] = u_tb_sort[i];
+    eatom_t[idx] = u_tt_sort[i];
+    eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
+    energy_s += u_ts_sort[i];
+    energy_b += u_tb_sort[i];
+    energy_t += u_tt_sort[i];
+    vatom[idx][0] = s_sort[9*i+0]; //xx
+    vatom[idx][1] = s_sort[9*i+4]; //yy
+    vatom[idx][2] = s_sort[9*i+8]; //zz
+    vatom[idx][3] = s_sort[9*i+1]; //xy
+    vatom[idx][4] = s_sort[9*i+2]; //xz
+    vatom[idx][5] = s_sort[9*i+5]; //yz
+    for (int j = 0; j < 6; j++) virial[j] += vatom[idx][j];
+    buckling[idx] = b_sort[i];
+  }
+  eng_vdwl = energy_s + energy_b + energy_t;
+}
+
+/* ----------------------------------------------------------------------
+   allocate all arrays
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::allocate(){
+  allocated = 1;
+  int n = atom->ntypes;
+
+  memory->create(setflag,n+1,n+1,"pair:setflag");
+  for (int i = 1; i <= n; i++)
+    for (int j = i; j <= n; j++)
+      setflag[i][j] = 0;
+
+  memory->create(cutsq,n+1,n+1,"pair:cutsq");
+  memory->create(cut,n+1,n+1,"pair:cut");
+}
+
+/* ----------------------------------------------------------------------
+   global settings
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::settings(int narg, char **arg){
+  if ((narg == 0) || (narg > 4))
+    error->all(FLERR,"Illegal pair_style command");
+  cut_global = utils::numeric(FLERR,arg[0],false,lmp);
+
+  // reset cutoffs that have been explicitly set
+  if (allocated) {
+    int i,j;
+    for (i = 1; i <= atom->ntypes; i++)
+      for (j = i+1; j <= atom->ntypes; j++)
+        cut[i][j] = cut_global;
+  }
+  std::string TPMAFile = (narg > 1) ? arg[1] : "MESONT-TABTP.xrs";
+  tab_path_length = TPMAFile.length();
+  if (tab_path != nullptr) memory->destroy(tab_path);
+  //c_str returns '\0' terminated string
+  memory->create(tab_path,tab_path_length+1,"pair:path");
+  std::memcpy(tab_path, TPMAFile.c_str(), tab_path_length+1);
+  mesont_lib_SetTablePath(tab_path, tab_path_length);
+
+  if (narg > 2) {
+    BendingMode = utils::numeric(FLERR,arg[2],false,lmp);
+    if ((BendingMode < 0) || (BendingMode > 1))
+      error->all(FLERR,"Incorrect BendingMode");
+  }
+  if (narg > 3) {
+    TPMType = utils::numeric(FLERR,arg[3],false,lmp);
+    if ((TPMType < 0) || (TPMType > 1))
+      error->all(FLERR,"Incorrect TPMType");
+  }
+
+  mesont_lib_TPBInit();
+  int M, N;
+  std::ifstream in(TPMAFile);
+  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
+  std::string tmp;
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  in >> M >> N;
+  in.close();
+  mesont_lib_TPMInit(M, N);
+  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
+}
+
+/* ----------------------------------------------------------------------
+   set coeffs for one or more type pairs
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::coeff(int narg, char **arg){
+  if ((narg < 2) || (narg > 3))
+    error->all(FLERR,"Incorrect args for pair coefficients");
+
+  if (!allocated) allocate();
+
+  int ilo,ihi,jlo,jhi;
+  utils::bounds(FLERR,arg[0],1,atom->ntypes,ilo,ihi,error);
+  utils::bounds(FLERR,arg[1],1,atom->ntypes,jlo,jhi,error);
+
+  double cut_one = cut_global;
+  if (narg == 3) cut_one = utils::numeric(FLERR,arg[2],false,lmp);
+
+  int count = 0;
+  for (int i = ilo; i <= ihi; i++) {
+    for (int j = MAX(jlo,i); j <= jhi; j++) {
+      cut[i][j] = cut_one;
+      setflag[i][j] = 1;
+      count++;
+    }
+  }
+
+  if (count == 0) error->all(FLERR,"Incorrect args for pair coefficients");
+}
+
+/* ----------------------------------------------------------------------
+   init for one type pair i,j and corresponding j,i
+------------------------------------------------------------------------- */
+
+double PairMESONTTPM::init_one(int i, int j){
+  if (setflag[i][j] == 0) {
+    cut[i][j] = mix_distance(cut[i][i],cut[j][j]);
+  }
+
+  return cut[i][j];
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::write_restart(FILE *fp){
+  write_restart_settings(fp);
+
+  int i,j;
+  for (i = 1; i <= atom->ntypes; i++)
+    for (j = i; j <= atom->ntypes; j++) {
+      fwrite(&setflag[i][j],sizeof(int),1,fp);
+      if (setflag[i][j]) {
+        fwrite(&cut[i][j],sizeof(double),1,fp);
+      }
+    }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::read_restart(FILE *fp){
+  read_restart_settings(fp);
+  allocate();
+
+  int i,j;
+  int me = comm->me;
+  for (i = 1; i <= atom->ntypes; i++)
+    for (j = i; j <= atom->ntypes; j++) {
+      if (me == 0) fread(&setflag[i][j],sizeof(int),1,fp);
+      MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
+      if (setflag[i][j]) {
+        if (me == 0) {
+          fread(&cut[i][j],sizeof(double),1,fp);
+        }
+        MPI_Bcast(&cut[i][j],1,MPI_DOUBLE,0,world);
+      }
+    }
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to restart file
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::write_restart_settings(FILE *fp){
+  fwrite(&BendingMode,sizeof(int),1,fp);
+  fwrite(&TPMType,sizeof(int),1,fp);
+  fwrite(&cut_global,sizeof(double),1,fp);
+  fwrite(&tab_path_length,sizeof(int),1,fp);
+  fwrite(tab_path,tab_path_length+1,1,fp);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 reads from restart file, bcasts
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::read_restart_settings(FILE *fp){
+  int me = comm->me;
+  if (me == 0) {
+    fread(&BendingMode,sizeof(int),1,fp);
+    fread(&TPMType,sizeof(int),1,fp);
+    fread(&cut_global,sizeof(double),1,fp);
+    fread(&tab_path_length,sizeof(int),1,fp);
+  }
+  MPI_Bcast(&BendingMode,1,MPI_INT,0,world);
+  MPI_Bcast(&TPMType,1,MPI_INT,0,world);
+  MPI_Bcast(&cut_global,1,MPI_DOUBLE,0,world);
+  MPI_Bcast(&tab_path_length,1,MPI_INT,0,world);
+
+  if (tab_path != nullptr) memory->destroy(tab_path);
+  memory->create(tab_path,tab_path_length+1,"pair:path");
+  if (me == 0) fread(tab_path,tab_path_length+1,1,fp);
+  MPI_Bcast(tab_path,tab_path_length+1,MPI_CHAR,0,world);
+  mesont_lib_SetTablePath(tab_path,tab_path_length);
+  mesont_lib_TPBInit();
+  int M, N;
+  std::ifstream in(tab_path);
+  if (!in.is_open()) error->all(FLERR,"Incorrect table path");
+  std::string tmp;
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  std::getline(in,tmp);
+  in >> M >> N;
+  in.close();
+  mesont_lib_TPMInit(M, N);
+  mesont_lib_InitCNTPotModule(1, 3, 0, BendingMode, mesont_lib_get_R());
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes to data file
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::write_data(FILE *fp){
+  for (int i = 1; i <= atom->ntypes; i++)
+    fprintf(fp,"%d\n",i);
+}
+
+/* ----------------------------------------------------------------------
+   proc 0 writes all pairs to data file
+------------------------------------------------------------------------- */
+
+void PairMESONTTPM::write_data_all(FILE *fp){
+  for (int i = 1; i <= atom->ntypes; i++)
+    for (int j = i; j <= atom->ntypes; j++)
+      fprintf(fp,"%d %d %g\n",i,j,cut[i][j]);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void PairMESONTTPM::init_style(){
+  //make sure that a full list is created (including ghost nodes)
+  int r = neighbor->request(this,instance_me);
+  neighbor->requests[r]->half = false;
+  neighbor->requests[r]->full = true;
+  neighbor->requests[r]->ghost = true;
+}
+
+void* PairMESONTTPM::extract(const char *str, int &){
+  if (strcmp(str,"mesonttpm_Es_tot") == 0) return &energy_s;
+  else if (strcmp(str,"mesonttpm_Eb_tot") == 0) return &energy_b;
+  else if (strcmp(str,"mesonttpm_Et_tot") == 0) return &energy_t;
+  else if (strcmp(str,"mesonttpm_Es") == 0) return eatom_s;
+  else if (strcmp(str,"mesonttpm_Eb") == 0) return eatom_b;
+  else if (strcmp(str,"mesonttpm_Et") == 0) return eatom_t;
+  else return nullptr;
+};
diff --git a/src/USER-MESONT/pair_mesont_tpm.h b/src/USER-MESONT/pair_mesont_tpm.h
index c3d71ae953..704556d75e 100644
--- a/src/USER-MESONT/pair_mesont_tpm.h
+++ b/src/USER-MESONT/pair_mesont_tpm.h
@@ -1,99 +1,98 @@
-/* -*- c++ -*- ----------------------------------------------------------
-   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   http://lammps.sandia.gov, Sandia National Laboratories
-   Steve Plimpton, sjplimp@sandia.gov
-
-   Copyright (2003) Sandia Corporation.  Under the terms of Contract
-   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
-   certain rights in this software.  This software is distributed under
-   the GNU General Public License.
-
-   See the README file in the top-level LAMMPS directory.
-
-   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
-------------------------------------------------------------------------- */
-
-#ifdef PAIR_CLASS
-
-PairStyle(mesont/tpm,PairMESONTTPM)
-
-#else
-
-#ifndef LMP_PAIR_MESONT_TPM_H
-#define LMP_PAIR_MESONT_TPM_H
-
-#include "pair.h"
-
-namespace LAMMPS_NS {
-
-class PairMESONTTPM : public Pair {
- public:
-  PairMESONTTPM(class LAMMPS *);
-  virtual ~PairMESONTTPM();
-  virtual void compute(int, int);
-  void settings(int, char **);
-  void coeff(int, char **);
-  double init_one(int, int);
-  void write_restart(FILE *);
-  void read_restart(FILE *);
-  void write_restart_settings(FILE *);
-  void read_restart_settings(FILE *);
-  void write_data(FILE *);
-  void write_data_all(FILE *);
-  virtual void init_style();
-
-  double energy_s;  // accumulated energies for stretching
-  double energy_b;  // accumulated energies for bending
-  double energy_t;  // accumulated energies for tube-tube interaction
-  double *eatom_s, *eatom_b, *eatom_t; // accumulated per-atom values
-
- protected:
-  int BendingMode, TPMType;
-  char* tab_path;
-  int tab_path_length;
-  double cut_global;
-  double **cut;
-  static int instance_count;
-  int nmax;
-
-  virtual void allocate();
-  virtual void *extract(const char *, int &);
-};
-
-}
-
-#endif
-#endif
-
-/* ERROR/WARNING messages:
-
-E: Pair style mesont/tpm requires newton pair on
-
-newton_pair must be set to on
-
-E: The selected cutoff is too small for the current system
-
-cutoff must be increased.
-
-E: Illegal pair_style command
-
-Incorrect argument list in the style init.
-
-E: Incorrect table path
-
-Incorrect path to the table files.
-
-E: Incorrect BendingMode
-
-Self-explanatory.
-
-E: Incorrect TPMType
-
-Self-explanatory.
-
-E: Inconsistent input and potential table
-
-The tube diameter is inconsistent with the chirality specified
-during generation of the potential table.
-
-*/
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   http://lammps.sandia.gov, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+
+   Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
+------------------------------------------------------------------------- */
+
+#ifdef PAIR_CLASS
+
+PairStyle(mesont/tpm,PairMESONTTPM)
+
+#else
+
+#ifndef LMP_PAIR_MESONT_TPM_H
+#define LMP_PAIR_MESONT_TPM_H
+
+#include "pair.h"
+
+namespace LAMMPS_NS {
+
+class PairMESONTTPM : public Pair {
+ public:
+  PairMESONTTPM(class LAMMPS *);
+  virtual ~PairMESONTTPM();
+  virtual void compute(int, int);
+  void settings(int, char **);
+  void coeff(int, char **);
+  double init_one(int, int);
+  void write_restart(FILE *);
+  void read_restart(FILE *);
+  void write_restart_settings(FILE *);
+  void read_restart_settings(FILE *);
+  void write_data(FILE *);
+  void write_data_all(FILE *);
+  virtual void init_style();
+
+  double energy_s;  // accumulated energies for stretching
+  double energy_b;  // accumulated energies for bending
+  double energy_t;  // accumulated energies for tube-tube interaction
+  double *eatom_s, *eatom_b, *eatom_t; // accumulated per-atom values
+
+ protected:
+  int BendingMode, TPMType;
+  char* tab_path;
+  int tab_path_length;
+  double cut_global;
+  double **cut;
+  static int instance_count;
+
+  virtual void allocate();
+  virtual void *extract(const char *, int &);
+};
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Pair style mesont/tpm requires newton pair on
+
+newton_pair must be set to on
+
+E: The selected cutoff is too small for the current system
+
+cutoff must be increased.
+
+E: Illegal pair_style command
+
+Incorrect argument list in the style init.
+
+E: Incorrect table path
+
+Incorrect path to the table files.
+
+E: Incorrect BendingMode
+
+Self-explanatory.
+
+E: Incorrect TPMType
+
+Self-explanatory.
+
+E: Inconsistent input and potential table
+
+The tube diameter is inconsistent with the chirality specified
+during generation of the potential table.
+
+*/

From 62c7aca26fb5eab910382bcefdff0ea66f38af94 Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 16:35:50 -0500
Subject: [PATCH 36/64] fix bug with memory allocation

fix bug with eatom_s, eatom_b, eatom_t allocation
---
 src/USER-MESONT/pair_mesont_tpm.cpp | 115 +++++++++++++++-------------
 src/USER-MESONT/pair_mesont_tpm.h   |   1 +
 2 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index 9185786341..f341a73e23 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -311,6 +311,7 @@ PairMESONTTPM::PairMESONTTPM(LAMMPS *lmp) : Pair(lmp) {
   eatom_s = nullptr;
   eatom_b = nullptr;
   eatom_t = nullptr;
+  nmax = 0;
   instance_count++;
   if(instance_count > 1) error->all(FLERR,
    "only a single instance of mesont/tpm pair style can be created");
@@ -336,7 +337,17 @@ PairMESONTTPM::~PairMESONTTPM()
 /* ---------------------------------------------------------------------- */
 
 void PairMESONTTPM::compute(int eflag, int vflag){
+  // set per atom values and accumulators
+  // reallocate per-atom arrays if necessary
   ev_init(eflag,vflag);
+  if (atom->nmax > nmax) {
+    memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
+    memory->destroy(eatom_b);
+    memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
+    memory->destroy(eatom_t);
+    memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
+    nmax = atom->nmax;
+  }
   //total number of atoms in the node and ghost shell
   int nall = list->inum + list->gnum;
   int ntot = atom->nlocal + atom->nghost;
@@ -508,64 +519,58 @@ void PairMESONTTPM::compute(int eflag, int vflag){
     error->all(FLERR, err.str().c_str());
   }
 
-  // set per atom values and accumulators
-  // reallocate per-atom arrays if necessary
-  if (eatom_s == nullptr)
-   memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
-  if (eatom_b == nullptr)
-   memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
-  if (eatom_t == nullptr)
-   memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
-  if (atom->nmax > maxeatom) {
-    maxeatom = atom->nmax;
-    memory->destroy(eatom);
-    memory->create(eatom,comm->nthreads*maxeatom,"pair:eatom");
-    memory->destroy(eatom_s);
-    memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
-    memory->destroy(eatom_b);
-    memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");
-    memory->destroy(eatom_t);
-    memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
-  }
-
-  if (atom->nmax > maxvatom) {
-    maxvatom = atom->nmax;
-    memory->destroy(vatom);
-    memory->create(vatom,comm->nthreads*maxvatom,6,"pair:vatom");
-  }
-
-  // zero accumulators
-  eng_vdwl = 0.0; energy_s = 0.0;
-  energy_b = 0.0; energy_t = 0.0;
-  for (int i = 0; i < 6; i++) virial[i] = 0.0;
-  for (int i = 0; i < ntot; i++){
-    eatom[i] = 0.0; eatom_s[i] = 0.0;
-    eatom_b[i] = 0.0; eatom_t[i] = 0.0;
-  }
-  for (int i = 0; i < ntot; i++)
-    for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
-
   //convert from sorted representation
   for (int i = 0; i < nall; i++){
-    int idx = ntlist.get_idx(i);
-    for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
-    eatom_s[idx] = u_ts_sort[i];
-    eatom_b[idx] = u_tb_sort[i];
-    eatom_t[idx] = u_tt_sort[i];
-    eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
-    energy_s += u_ts_sort[i];
-    energy_b += u_tb_sort[i];
-    energy_t += u_tt_sort[i];
-    vatom[idx][0] = s_sort[9*i+0]; //xx
-    vatom[idx][1] = s_sort[9*i+4]; //yy
-    vatom[idx][2] = s_sort[9*i+8]; //zz
-    vatom[idx][3] = s_sort[9*i+1]; //xy
-    vatom[idx][4] = s_sort[9*i+2]; //xz
-    vatom[idx][5] = s_sort[9*i+5]; //yz
-    for (int j = 0; j < 6; j++) virial[j] += vatom[idx][j];
-    buckling[idx] = b_sort[i];
+      int idx = ntlist.get_idx(i);
+      for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
+      buckling[idx] = b_sort[i];
   }
-  eng_vdwl = energy_s + energy_b + energy_t;
+  if(eflag){
+    eng_vdwl = 0.0; energy_s = 0.0;
+    energy_b = 0.0; energy_t = 0.0;
+    for (int i = 0; i < ntot; i++){
+      eatom[i] = 0.0; eatom_s[i] = 0.0;
+      eatom_b[i] = 0.0; eatom_t[i] = 0.0;
+    }
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      eatom_s[idx] = u_ts_sort[i];
+      eatom_b[idx] = u_tb_sort[i];
+      eatom_t[idx] = u_tt_sort[i];
+      eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
+      energy_s += u_ts_sort[i];
+      energy_b += u_tb_sort[i];
+      energy_t += u_tt_sort[i];
+    }
+    eng_vdwl = energy_s + energy_b + energy_t;
+  }
+  if(vflag){
+    for (int i = 0; i < 6; i++) virial[i] = 0.0;
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      virial[0] += s_sort[9*i+0]; //xx
+      virial[1] += s_sort[9*i+4]; //yy
+      virial[2] += s_sort[9*i+8]; //zz
+      virial[3] += s_sort[9*i+1]; //xy
+      virial[4] += s_sort[9*i+2]; //xz
+      virial[5] += s_sort[9*i+5]; //yz
+    }
+  }
+  int vflag_atom = vflag & 4;
+  if(vflag_atom){
+    for (int i = 0; i < ntot; i++)
+      for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      vatom[idx][0] = s_sort[9*i+0]; //xx
+      vatom[idx][1] = s_sort[9*i+4]; //yy
+      vatom[idx][2] = s_sort[9*i+8]; //zz
+      vatom[idx][3] = s_sort[9*i+1]; //xy
+      vatom[idx][4] = s_sort[9*i+2]; //xz
+      vatom[idx][5] = s_sort[9*i+5]; //yz
+    }
+  }
+
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/USER-MESONT/pair_mesont_tpm.h b/src/USER-MESONT/pair_mesont_tpm.h
index 704556d75e..a18e555349 100644
--- a/src/USER-MESONT/pair_mesont_tpm.h
+++ b/src/USER-MESONT/pair_mesont_tpm.h
@@ -54,6 +54,7 @@ class PairMESONTTPM : public Pair {
   double cut_global;
   double **cut;
   static int instance_count;
+  int nmax;
 
   virtual void allocate();
   virtual void *extract(const char *, int &);

From 559d6b10cfb93f3a59b0318b991b70ed092db429 Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 16:39:13 -0500
Subject: [PATCH 37/64] fix bug with memory allocation

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index f341a73e23..720a821aa6 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -341,6 +341,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   // reallocate per-atom arrays if necessary
   ev_init(eflag,vflag);
   if (atom->nmax > nmax) {
+    memory->destroy(eatom_s);
     memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
     memory->destroy(eatom_b);
     memory->create(eatom_b,comm->nthreads*maxeatom,"pair:eatom_b");

From aff54e948a5890c96864cc2d5a31b318f1909961 Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 18:39:34 -0500
Subject: [PATCH 38/64] eflag fix

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 32 +++++++++++++++++------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index 720a821aa6..08b43b122f 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -49,7 +49,7 @@ private:
 
 class MESONTList {
 public:
-  MESONTList(const Atom* atom, const NeighList* nblist, double rc2);
+  MESONTList(const Atom* atom, const NeighList* nblist);
   ~MESONTList() {};
   //list of segments
   const std::vector<array2003<int,2> >& get_segments() const;
@@ -165,12 +165,13 @@ void vector_union(std::vector<T>& v1, std::vector<T>& v2,
   }
 }
 
-MESONTList::MESONTList(const Atom* atom, const NeighList* nblist, double /* rc2 */){
+MESONTList::MESONTList(const Atom* atom, const NeighList* nblist){
   if (atom == nullptr || nblist == nullptr) return;
   //number of local atoms at the node
   int nlocal = atom->nlocal;
-  //total number of atoms in the node and ghost shell
+  //total number of atoms in the node and ghost shell treated as NTs
   int nall = nblist->inum + nblist->gnum;
+  //total number of atoms in the node and ghost shell
   int ntot = atom->nlocal + atom->nghost;
   tagint* const g_id = atom->tag;
   tagint** const bonds = atom->bond_nt;
@@ -340,7 +341,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   // set per atom values and accumulators
   // reallocate per-atom arrays if necessary
   ev_init(eflag,vflag);
-  if (atom->nmax > nmax) {
+  if (atom->nmax > nmax && eflag_atom) {
     memory->destroy(eatom_s);
     memory->create(eatom_s,comm->nthreads*maxeatom,"pair:eatom_s");
     memory->destroy(eatom_b);
@@ -349,8 +350,9 @@ void PairMESONTTPM::compute(int eflag, int vflag){
     memory->create(eatom_t,comm->nthreads*maxeatom,"pair:eatom_t");
     nmax = atom->nmax;
   }
-  //total number of atoms in the node and ghost shell
+  //total number of atoms in the node and ghost shell treated as NTs
   int nall = list->inum + list->gnum;
+  //total number of atoms in the node and ghost shell
   int ntot = atom->nlocal + atom->nghost;
   int newton_pair = force->newton_pair;
   if(!newton_pair)
@@ -381,7 +383,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   }
 
   //generate bonds and chain nblist
-  MESONTList ntlist(atom, list, cut_global*cut_global);
+  MESONTList ntlist(atom, list);
 
   //reorder data to make it contiguous within tubes
   //and compatible with Fortran functions
@@ -526,9 +528,18 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
       buckling[idx] = b_sort[i];
   }
-  if(eflag){
+  if(eflag_global){
     eng_vdwl = 0.0; energy_s = 0.0;
     energy_b = 0.0; energy_t = 0.0;
+    for (int i = 0; i < nall; i++){
+      int idx = ntlist.get_idx(i);
+      energy_s += u_ts_sort[i];
+      energy_b += u_tb_sort[i];
+      energy_t += u_tt_sort[i];
+    }
+    eng_vdwl = energy_s + energy_b + energy_t;
+  }
+  if(eflag_atom){
     for (int i = 0; i < ntot; i++){
       eatom[i] = 0.0; eatom_s[i] = 0.0;
       eatom_b[i] = 0.0; eatom_t[i] = 0.0;
@@ -539,13 +550,9 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       eatom_b[idx] = u_tb_sort[i];
       eatom_t[idx] = u_tt_sort[i];
       eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
-      energy_s += u_ts_sort[i];
-      energy_b += u_tb_sort[i];
-      energy_t += u_tt_sort[i];
     }
-    eng_vdwl = energy_s + energy_b + energy_t;
   }
-  if(vflag){
+  if(vflag_global){
     for (int i = 0; i < 6; i++) virial[i] = 0.0;
     for (int i = 0; i < nall; i++){
       int idx = ntlist.get_idx(i);
@@ -557,7 +564,6 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       virial[5] += s_sort[9*i+5]; //yz
     }
   }
-  int vflag_atom = vflag & 4;
   if(vflag_atom){
     for (int i = 0; i < ntot; i++)
       for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;

From 4d19b8bf3ad955837740ce7901510c3034a881f2 Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 19:38:56 -0500
Subject: [PATCH 39/64] style adjustment

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 82 ++++++++++++++---------------
 1 file changed, 41 insertions(+), 41 deletions(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index 08b43b122f..2fe5b2036f 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -40,7 +40,7 @@ using namespace LAMMPS_NS;
 template<typename T, int N>
 class array2003{
 public:
-  T& operator[] (int idx){ return data[idx];};
+  T& operator[] (int idx) { return data[idx];};
   const T& operator[] (int idx) const{ return data[idx];};
 private:
   T data[N];
@@ -165,7 +165,7 @@ void vector_union(std::vector<T>& v1, std::vector<T>& v2,
   }
 }
 
-MESONTList::MESONTList(const Atom* atom, const NeighList* nblist){
+MESONTList::MESONTList(const Atom* atom, const NeighList* nblist) {
   if (atom == nullptr || nblist == nullptr) return;
   //number of local atoms at the node
   int nlocal = atom->nlocal;
@@ -194,7 +194,7 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist){
       if (bonds[i][m] == cnt_end) chain_list[i][m] = cnt_end;
     for (int j = 0; j < nnb; j++) {
       int nb = nblist->firstneigh[i][j];
-      if (bonds[i][0] == g_id[nb]){
+      if (bonds[i][0] == g_id[nb]) {
         chain_list[i][0] = nb;
         chain_list[nb][1] = i;
         break;
@@ -224,13 +224,13 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist){
   for (int i = 0; i < nlocal; i++) {
     if (chain_list[i][0] == not_cnt) continue;
     if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
-     g_id[i] < g_id[chain_list[i][0]]){
+     g_id[i] < g_id[chain_list[i][0]]) {
       array2003<int, 2> tmp_c;
       tmp_c[0] = i; tmp_c[1] = chain_list[i][0];
       segments.push_back(tmp_c);
     }
     if (chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end &&
-     g_id[i] < g_id[chain_list[i][1]]){
+     g_id[i] < g_id[chain_list[i][1]]) {
       array2003<int, 2> tmp_c;
        tmp_c[0] = i; tmp_c[1] = chain_list[i][1];
        segments.push_back(tmp_c);
@@ -239,7 +239,7 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist){
   int nbonds = segments.size();
 
   //triplets
-  for (int i = 0; i < nlocal; i++){
+  for (int i = 0; i < nlocal; i++) {
     if (chain_list[i][0] == not_cnt) continue;
     if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
      chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end)
@@ -286,7 +286,7 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist){
            chain_list[idx0][0] != domain_end) chain[0] -= 1;
           if (chain_list[idx1][1] != cnt_end &&
            chain_list[idx1][1] != domain_end) chain[1] += 1;
-          if(chain[0] != chain[1]) nb_chains[i].push_back(chain);
+          if (chain[0] != chain[1]) nb_chains[i].push_back(chain);
           idx_s = (j == nnb - 1) ? -1 : nb_list[j + 1];
         }
       }
@@ -314,7 +314,7 @@ PairMESONTTPM::PairMESONTTPM(LAMMPS *lmp) : Pair(lmp) {
   eatom_t = nullptr;
   nmax = 0;
   instance_count++;
-  if(instance_count > 1) error->all(FLERR,
+  if (instance_count > 1) error->all(FLERR,
    "only a single instance of mesont/tpm pair style can be created");
 }
 
@@ -337,7 +337,7 @@ PairMESONTTPM::~PairMESONTTPM()
 
 /* ---------------------------------------------------------------------- */
 
-void PairMESONTTPM::compute(int eflag, int vflag){
+void PairMESONTTPM::compute(int eflag, int vflag) {
   // set per atom values and accumulators
   // reallocate per-atom arrays if necessary
   ev_init(eflag,vflag);
@@ -355,7 +355,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   //total number of atoms in the node and ghost shell
   int ntot = atom->nlocal + atom->nghost;
   int newton_pair = force->newton_pair;
-  if(!newton_pair)
+  if (!newton_pair)
    error->all(FLERR,"Pair style mesont/tpm requires newton pair on");
 
   double **x = atom->x;
@@ -374,7 +374,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   }
   double Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
    std::pow((2.0*RT + TPBRcutoff),2)));
-  if (cut_global < Rcut_min){
+  if (cut_global < Rcut_min) {
     std::stringstream err;
     err << "The selected cutoff is too small for the current system : " <<
      "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
@@ -390,7 +390,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   std::vector<double> x_sort(3*nall), f_sort(3*nall), s_sort(9*nall);
   std::vector<double> u_ts_sort(nall), u_tb_sort(nall), u_tt_sort(nall);
   std::vector<int> b_sort(nall);
-  for (int i = 0; i < nall; i++){
+  for (int i = 0; i < nall; i++) {
     int idx = ntlist.get_idx(i);
     for (int j = 0; j < 3; j++) x_sort[3*i+j] = x[idx][j];
     b_sort[i] = buckling[idx];
@@ -426,13 +426,13 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   }
 
   //share new values of buckling
-  if (BendingMode == 1){
-    for (int i = 0; i < nall; i++){
+  if (BendingMode == 1) {
+    for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       buckling[idx] = b_sort[i];
     }
     comm->forward_comm_pair(this);
-    for (int i = 0; i < nall; i++){
+    for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       b_sort[i] = buckling[idx];
     }
@@ -470,7 +470,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
     mesont_lib_TubeStretchingForceField(U1s, U2s, F1, F2, S1, S2, X1, X2,
      R12, L12);
 
-    for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++){
+    for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++) {
       //id of the beginning and end of the chain in the sorted representation
       const array2003<int,2>& chain = ntlist.get_nbs()[i][nc];
       int N = chain[1] - chain[0] + 1;  //number of elements in the chain
@@ -489,7 +489,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       double* Xe = X; double* Fe = F; double* Se = S;
       if (!E1 && ntlist.get_triplet(end1)[0] != MESONTList::domain_end &&
        ntlist.get_triplet(ntlist.get_triplet(end1)[0])[0] ==
-       MESONTList::cnt_end){
+       MESONTList::cnt_end) {
         Ee = 1;
         int idx = ntlist.get_idxb(ntlist.get_triplet(end1)[0]);
         Xe = &(x_sort[3*idx]);
@@ -498,7 +498,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       }
       else if (!E2 && ntlist.get_triplet(end2)[2] != MESONTList::domain_end &&
        ntlist.get_triplet(ntlist.get_triplet(end2)[2])[2] ==
-       MESONTList::cnt_end){
+       MESONTList::cnt_end) {
         Ee = 2;
         int idx = ntlist.get_idxb(ntlist.get_triplet(end2)[2]);
         Xe = &(x_sort[3*idx]);
@@ -514,7 +514,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   //check if cutoff is chosen correctly
   Rcut_min = std::max(2.0*Lmax, std::sqrt(0.5*Lmax*Lmax +
    std::pow((2.0*Rmax + TPBRcutoff),2)));
-  if (cut_global < Rcut_min){
+  if (cut_global < Rcut_min) {
     std::stringstream err;
     err << "The selected cutoff is too small for the current system : " <<
      "L_max = " << Lmax << ", R_max = " << RT << ", Rc = " << cut_global <<
@@ -523,15 +523,15 @@ void PairMESONTTPM::compute(int eflag, int vflag){
   }
 
   //convert from sorted representation
-  for (int i = 0; i < nall; i++){
+  for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       for (int j = 0; j < 3; j++) f[idx][j] += f_sort[3*i+j];
       buckling[idx] = b_sort[i];
   }
-  if(eflag_global){
+  if (eflag_global) {
     eng_vdwl = 0.0; energy_s = 0.0;
     energy_b = 0.0; energy_t = 0.0;
-    for (int i = 0; i < nall; i++){
+    for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       energy_s += u_ts_sort[i];
       energy_b += u_tb_sort[i];
@@ -539,12 +539,12 @@ void PairMESONTTPM::compute(int eflag, int vflag){
     }
     eng_vdwl = energy_s + energy_b + energy_t;
   }
-  if(eflag_atom){
-    for (int i = 0; i < ntot; i++){
+  if (eflag_atom) {
+    for (int i = 0; i < ntot; i++) {
       eatom[i] = 0.0; eatom_s[i] = 0.0;
       eatom_b[i] = 0.0; eatom_t[i] = 0.0;
     }
-    for (int i = 0; i < nall; i++){
+    for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       eatom_s[idx] = u_ts_sort[i];
       eatom_b[idx] = u_tb_sort[i];
@@ -552,9 +552,9 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       eatom[idx] = u_ts_sort[i] + u_tb_sort[i] + u_tt_sort[i];
     }
   }
-  if(vflag_global){
+  if (vflag_global) {
     for (int i = 0; i < 6; i++) virial[i] = 0.0;
-    for (int i = 0; i < nall; i++){
+    for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       virial[0] += s_sort[9*i+0]; //xx
       virial[1] += s_sort[9*i+4]; //yy
@@ -564,10 +564,10 @@ void PairMESONTTPM::compute(int eflag, int vflag){
       virial[5] += s_sort[9*i+5]; //yz
     }
   }
-  if(vflag_atom){
+  if (vflag_atom) {
     for (int i = 0; i < ntot; i++)
       for (int j = 0; j < 6; j++) vatom[i][j] = 0.0;
-    for (int i = 0; i < nall; i++){
+    for (int i = 0; i < nall; i++) {
       int idx = ntlist.get_idx(i);
       vatom[idx][0] = s_sort[9*i+0]; //xx
       vatom[idx][1] = s_sort[9*i+4]; //yy
@@ -584,7 +584,7 @@ void PairMESONTTPM::compute(int eflag, int vflag){
    allocate all arrays
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::allocate(){
+void PairMESONTTPM::allocate() {
   allocated = 1;
   int n = atom->ntypes;
 
@@ -601,7 +601,7 @@ void PairMESONTTPM::allocate(){
    global settings
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::settings(int narg, char **arg){
+void PairMESONTTPM::settings(int narg, char **arg) {
   if ((narg == 0) || (narg > 4))
     error->all(FLERR,"Illegal pair_style command");
   cut_global = utils::numeric(FLERR,arg[0],false,lmp);
@@ -650,7 +650,7 @@ void PairMESONTTPM::settings(int narg, char **arg){
    set coeffs for one or more type pairs
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::coeff(int narg, char **arg){
+void PairMESONTTPM::coeff(int narg, char **arg) {
   if ((narg < 2) || (narg > 3))
     error->all(FLERR,"Incorrect args for pair coefficients");
 
@@ -679,7 +679,7 @@ void PairMESONTTPM::coeff(int narg, char **arg){
    init for one type pair i,j and corresponding j,i
 ------------------------------------------------------------------------- */
 
-double PairMESONTTPM::init_one(int i, int j){
+double PairMESONTTPM::init_one(int i, int j) {
   if (setflag[i][j] == 0) {
     cut[i][j] = mix_distance(cut[i][i],cut[j][j]);
   }
@@ -691,7 +691,7 @@ double PairMESONTTPM::init_one(int i, int j){
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::write_restart(FILE *fp){
+void PairMESONTTPM::write_restart(FILE *fp) {
   write_restart_settings(fp);
 
   int i,j;
@@ -708,7 +708,7 @@ void PairMESONTTPM::write_restart(FILE *fp){
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::read_restart(FILE *fp){
+void PairMESONTTPM::read_restart(FILE *fp) {
   read_restart_settings(fp);
   allocate();
 
@@ -731,7 +731,7 @@ void PairMESONTTPM::read_restart(FILE *fp){
    proc 0 writes to restart file
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::write_restart_settings(FILE *fp){
+void PairMESONTTPM::write_restart_settings(FILE *fp) {
   fwrite(&BendingMode,sizeof(int),1,fp);
   fwrite(&TPMType,sizeof(int),1,fp);
   fwrite(&cut_global,sizeof(double),1,fp);
@@ -743,7 +743,7 @@ void PairMESONTTPM::write_restart_settings(FILE *fp){
    proc 0 reads from restart file, bcasts
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::read_restart_settings(FILE *fp){
+void PairMESONTTPM::read_restart_settings(FILE *fp) {
   int me = comm->me;
   if (me == 0) {
     fread(&BendingMode,sizeof(int),1,fp);
@@ -779,7 +779,7 @@ void PairMESONTTPM::read_restart_settings(FILE *fp){
    proc 0 writes to data file
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::write_data(FILE *fp){
+void PairMESONTTPM::write_data(FILE *fp) {
   for (int i = 1; i <= atom->ntypes; i++)
     fprintf(fp,"%d\n",i);
 }
@@ -788,7 +788,7 @@ void PairMESONTTPM::write_data(FILE *fp){
    proc 0 writes all pairs to data file
 ------------------------------------------------------------------------- */
 
-void PairMESONTTPM::write_data_all(FILE *fp){
+void PairMESONTTPM::write_data_all(FILE *fp) {
   for (int i = 1; i <= atom->ntypes; i++)
     for (int j = i; j <= atom->ntypes; j++)
       fprintf(fp,"%d %d %g\n",i,j,cut[i][j]);
@@ -796,7 +796,7 @@ void PairMESONTTPM::write_data_all(FILE *fp){
 
 /* ---------------------------------------------------------------------- */
 
-void PairMESONTTPM::init_style(){
+void PairMESONTTPM::init_style() {
   //make sure that a full list is created (including ghost nodes)
   int r = neighbor->request(this,instance_me);
   neighbor->requests[r]->half = false;
@@ -804,7 +804,7 @@ void PairMESONTTPM::init_style(){
   neighbor->requests[r]->ghost = true;
 }
 
-void* PairMESONTTPM::extract(const char *str, int &){
+void* PairMESONTTPM::extract(const char *str, int &) {
   if (strcmp(str,"mesonttpm_Es_tot") == 0) return &energy_s;
   else if (strcmp(str,"mesonttpm_Eb_tot") == 0) return &energy_b;
   else if (strcmp(str,"mesonttpm_Et_tot") == 0) return &energy_t;

From 3e7df13203a7cb39712930f114bb8f8de413b2d1 Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 21:22:14 -0500
Subject: [PATCH 40/64] c++11: replace array2003 shim with std::array

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 70 +++++++++++------------------
 1 file changed, 26 insertions(+), 44 deletions(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index 2fe5b2036f..b92fc16750 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -9,7 +9,7 @@
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
-
+ 2
    Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
 ------------------------------------------------------------------------- */
 
@@ -29,6 +29,7 @@
 #include <cstring>
 #include <vector>
 #include <cmath>
+#include <array>
 
 #include <fstream>
 #include <sstream>
@@ -36,28 +37,17 @@
 
 using namespace LAMMPS_NS;
 
-//since LAMMPS is compiled with C++ 2003, define a substitution for std::array
-template<typename T, int N>
-class array2003{
-public:
-  T& operator[] (int idx) { return data[idx];};
-  const T& operator[] (int idx) const{ return data[idx];};
-private:
-  T data[N];
-};
-
-
 class MESONTList {
 public:
   MESONTList(const Atom* atom, const NeighList* nblist);
   ~MESONTList() {};
   //list of segments
-  const std::vector<array2003<int,2> >& get_segments() const;
+  const std::vector<std::array<int,2>>& get_segments() const;
   //list of triplets
-  const std::vector<array2003<int,3> >& get_triplets() const;
+  const std::vector<std::array<int,3>>& get_triplets() const;
   //list of neighbor chains [start,end] for segments
   //(use idx() to get real indexes)
-  const std::vector<std::vector<array2003<int,2> > >& get_nbs() const;
+  const std::vector<std::vector<std::array<int,2>>>& get_nbs() const;
   //convert idx from sorted representation to real idx
   int get_idx(int idx) const;
   //return list of indexes for conversion from sorted representation
@@ -69,22 +59,22 @@ public:
   //check if the node is the end of the tube
   bool is_end(int idx) const;
 
-  array2003<int, 2> get_segment(int idx) const;
-  array2003<int, 3> get_triplet(int idx) const;
+  std::array<int,2> get_segment(int idx) const;
+  std::array<int,3> get_triplet(int idx) const;
 
   static const int cnt_end = -1;
   static const int domain_end = -2;
   static const int not_cnt = -3;
 private:
-  std::vector<array2003<int, 2> > chain_list, segments;
-  std::vector<array2003<int, 3> > triplets;
-  std::vector<std::vector<array2003<int, 2> > > nb_chains;
+  std::vector<std::array<int,2>> chain_list, segments;
+  std::vector<std::array<int,3>> triplets;
+  std::vector<std::vector<std::array<int,2>>> nb_chains;
   std::vector<int> index_list, index_list_b;
 };
 
 //=============================================================================
 
-inline const std::vector<std::vector<array2003<int, 2> > > &
+inline const std::vector<std::vector<std::array<int,2>>> &
  MESONTList::get_nbs() const {
   return nb_chains;
 }
@@ -106,25 +96,25 @@ inline const std::vector<int>& MESONTList::get_idxb_list() const {
   return index_list_b;
 };
 
-inline const std::vector<array2003<int, 2> > & MESONTList::get_segments()
+inline const std::vector<std::array<int,2>> & MESONTList::get_segments()
  const {
   return segments;
 }
 
-inline const std::vector<array2003<int, 3> > & MESONTList::get_triplets()
+inline const std::vector<std::array<int,3>> & MESONTList::get_triplets()
  const {
   return triplets;
 }
 
-inline array2003<int, 2> MESONTList::get_segment(int idx) const {
-  array2003<int, 2> result;
+inline std::array<int,2> MESONTList::get_segment(int idx) const {
+  std::array<int,2> result;
   result[0] = chain_list[idx][0];
   result[1] = idx;
   return result;
 }
 
-inline array2003<int, 3> MESONTList::get_triplet(int idx) const {
-  array2003<int, 3> result;
+inline std::array<int,3> MESONTList::get_triplet(int idx) const {
+  std::array<int,3> result;
   result[0] = chain_list[idx][0];
   result[1] = idx;
   result[2] = chain_list[idx][1];
@@ -179,9 +169,7 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist) {
   int* ilist = nblist->ilist;
 
   //convert bonds to local id representation
-  array2003<int, 2> tmp_arr;
-  tmp_arr[0] = not_cnt; tmp_arr[1] = not_cnt;
-  chain_list.resize(ntot, tmp_arr);
+  chain_list.resize(ntot, {not_cnt,not_cnt});
   for (int ii = 0; ii < nall; ii++) {
     int i = ilist[ii];
     chain_list[i][0] = domain_end;
@@ -224,17 +212,11 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist) {
   for (int i = 0; i < nlocal; i++) {
     if (chain_list[i][0] == not_cnt) continue;
     if (chain_list[i][0] != cnt_end && chain_list[i][0] != domain_end &&
-     g_id[i] < g_id[chain_list[i][0]]) {
-      array2003<int, 2> tmp_c;
-      tmp_c[0] = i; tmp_c[1] = chain_list[i][0];
-      segments.push_back(tmp_c);
-    }
+     g_id[i] < g_id[chain_list[i][0]])
+      segments.push_back({i,chain_list[i][0]});
     if (chain_list[i][1] != cnt_end && chain_list[i][1] != domain_end &&
-     g_id[i] < g_id[chain_list[i][1]]) {
-      array2003<int, 2> tmp_c;
-       tmp_c[0] = i; tmp_c[1] = chain_list[i][1];
-       segments.push_back(tmp_c);
-    }
+     g_id[i] < g_id[chain_list[i][1]])
+      segments.push_back({i,chain_list[i][1]});
   }
   int nbonds = segments.size();
 
@@ -275,7 +257,7 @@ MESONTList::MESONTList(const Atom* atom, const NeighList* nblist) {
         int idx_next = chain_list[index_list[nb_list[j]]][1];
         if ((j == nnb - 1) || (nb_list[j] + 1 != nb_list[j+1]) ||
          (idx_next == cnt_end) || (idx_next == domain_end)) {
-          array2003<int, 2> chain;
+          std::array<int,2> chain;
           chain[0] = idx_s;
           chain[1] = nb_list[j];
           //make sure that segments having at least one node
@@ -399,7 +381,7 @@ void PairMESONTTPM::compute(int eflag, int vflag) {
   //bending potential
   int n_triplets = ntlist.get_triplets().size();
   for (int i = 0; i < n_triplets; i++) {
-    const array2003<int,3>& t = ntlist.get_triplets()[i];
+    const std::array<int,3>& t = ntlist.get_triplets()[i];
     //idx of nodes of a triplet in sorted representation
     int idx_s0 = ntlist.get_idxb(t[0]);
     int idx_s1 = ntlist.get_idxb(t[1]);
@@ -443,7 +425,7 @@ void PairMESONTTPM::compute(int eflag, int vflag) {
   double Rmax = 0.0;
   Lmax = 0.0;
   for (int i = 0; i < n_segments; i++) {
-    const array2003<int,2>& s = ntlist.get_segments()[i];
+    const std::array<int,2>& s = ntlist.get_segments()[i];
     //idx of a segment end 1 in sorted representation
     int idx_s0 = ntlist.get_idxb(s[0]);
     //idx of a segment end 2 in sorted representation
@@ -472,7 +454,7 @@ void PairMESONTTPM::compute(int eflag, int vflag) {
 
     for (int nc = 0; nc < (int)ntlist.get_nbs()[i].size(); nc++) {
       //id of the beginning and end of the chain in the sorted representation
-      const array2003<int,2>& chain = ntlist.get_nbs()[i][nc];
+      const std::array<int,2>& chain = ntlist.get_nbs()[i][nc];
       int N = chain[1] - chain[0] + 1;  //number of elements in the chain
       int end1 = ntlist.get_idx(chain[0]);  //chain ends (real representation)
       int end2 = ntlist.get_idx(chain[1]);

From af14739541c9b96d29d7ddfa03dc09de988f21ee Mon Sep 17 00:00:00 2001
From: iafoss <iafoss@yandex.ru>
Date: Mon, 2 Nov 2020 22:09:55 -0500
Subject: [PATCH 41/64] typo

---
 src/USER-MESONT/pair_mesont_tpm.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/USER-MESONT/pair_mesont_tpm.cpp b/src/USER-MESONT/pair_mesont_tpm.cpp
index b92fc16750..1271ebddb6 100644
--- a/src/USER-MESONT/pair_mesont_tpm.cpp
+++ b/src/USER-MESONT/pair_mesont_tpm.cpp
@@ -9,7 +9,6 @@
    the GNU General Public License.
 
    See the README file in the top-level LAMMPS directory.
- 2
    Contributing author: Maxim Shugaev (UVA), mvs9t@virginia.edu
 ------------------------------------------------------------------------- */
 

From 3ea395615a891b1f56156e4eb97e65956331343c Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 4 Nov 2020 10:54:20 -0500
Subject: [PATCH 42/64] update fmtlib version 7.1.1 to 7.1.2

---
 src/fmt/core.h       | 2 +-
 src/fmt/format-inl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fmt/core.h b/src/fmt/core.h
index 9bd2003b28..b4fc461011 100644
--- a/src/fmt/core.h
+++ b/src/fmt/core.h
@@ -18,7 +18,7 @@
 #include <vector>
 
 // The fmt library version in the form major * 10000 + minor * 100 + patch.
-#define FMT_VERSION 70101
+#define FMT_VERSION 70102
 
 #ifdef __clang__
 #  define FMT_CLANG_VERSION (__clang_major__ * 100 + __clang_minor__)
diff --git a/src/fmt/format-inl.h b/src/fmt/format-inl.h
index 5d466eebbc..8f2fe7354a 100644
--- a/src/fmt/format-inl.h
+++ b/src/fmt/format-inl.h
@@ -2337,7 +2337,7 @@ void fallback_format(Double d, int num_digits, bool binary32, buffer<char>& buf,
       upper = &upper_store;
     }
     denominator.assign_pow10(exp10);
-    denominator <<= 1;
+    denominator <<= shift;
   } else if (exp10 < 0) {
     numerator.assign_pow10(-exp10);
     lower.assign(numerator);

From c2b9b6d57b7c49d57ce143d1182e7aab82c933a2 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 6 Nov 2020 17:14:48 -0500
Subject: [PATCH 43/64] fix bug using the wrong flag variable and print warning
 only if a change was made

---
 src/domain.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/domain.cpp b/src/domain.cpp
index b0a32ce114..1becba5e0b 100644
--- a/src/domain.cpp
+++ b/src/domain.cpp
@@ -1873,6 +1873,7 @@ void Domain::set_boundary(int narg, char **arg, int flag)
   else zperiodic = 0;
 
   // record if we changed a periodic boundary to a non-periodic one
+
   int pflag=0;
   if ((periodicity[0] && !xperiodic)
       || (periodicity[1] && !yperiodic)
@@ -1889,23 +1890,27 @@ void Domain::set_boundary(int narg, char **arg, int flag)
         boundary[1][0] >= 2 || boundary[1][1] >= 2 ||
         boundary[2][0] >= 2 || boundary[2][1] >= 2) nonperiodic = 2;
   }
+
+  // force non-zero image flags to zero for non-periodic dimensions
+  // keep track if a change was made, so we can print a warning message
+
   if (pflag) {
     pflag = 0;
     for (int i=0; i < atom->nlocal; ++i) {
       int xbox = (atom->image[i] & IMGMASK) - IMGMAX;
       int ybox = (atom->image[i] >> IMGBITS & IMGMASK) - IMGMAX;
       int zbox = (atom->image[i] >> IMG2BITS) - IMGMAX;
-      if (!xperiodic) { xbox = 0; pflag = 1; }
-      if (!yperiodic) { ybox = 0; pflag = 1; }
-      if (!zperiodic) { zbox = 0; pflag = 1; }
+      if ((!xperiodic) && (xbox != 0)) { xbox = 0; pflag = 1; }
+      if ((!yperiodic) && (ybox != 0)) { ybox = 0; pflag = 1; }
+      if ((!zperiodic) && (zbox != 0)) { zbox = 0; pflag = 1; }
       atom->image[i] = ((imageint) (xbox + IMGMAX) & IMGMASK) |
         (((imageint) (ybox + IMGMAX) & IMGMASK) << IMGBITS) |
         (((imageint) (zbox + IMGMAX) & IMGMASK) << IMG2BITS);
     }
     int flag_all;
-    MPI_Allreduce(&flag,&flag_all, 1, MPI_INT, MPI_SUM, world);
+    MPI_Allreduce(&pflag,&flag_all, 1, MPI_INT, MPI_SUM, world);
     if ((flag_all > 0) && (comm->me == 0))
-      error->warning(FLERR,"Reset image flags for non-periodic boundary");
+      error->warning(FLERR,"Resetting image flags for non-periodic dimensions");
   }
 }
 

From c68829f17d59c9e247613d9d32f41a402ecb8f18 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 6 Nov 2020 17:17:18 -0500
Subject: [PATCH 44/64] update granular pair style example to comply with
 tighter history fix requirements

---
 examples/granular/in.pour.drum                | 111 ++++++------
 examples/granular/in.pour.flatwall            |  76 ++++-----
 ...drum.g++.1 => log.29Oct20.pour.drum.g++.1} | 150 ++++++++--------
 ...drum.g++.4 => log.29Oct20.pour.drum.g++.4} | 156 ++++++++---------
 ....g++.1 => log.29Oct20.pour.flatwall.g++.1} | 120 +++++++++----
 ....g++.4 => log.29Oct20.pour.flatwall.g++.4} | 161 +++++++++---------
 6 files changed, 411 insertions(+), 363 deletions(-)
 rename examples/granular/{log.29Mar19.pour.drum.g++.1 => log.29Oct20.pour.drum.g++.1} (69%)
 rename examples/granular/{log.29Mar19.pour.drum.g++.4 => log.29Oct20.pour.drum.g++.4} (69%)
 rename examples/granular/{log.29Mar19.pour.flatwall.g++.1 => log.29Oct20.pour.flatwall.g++.1} (56%)
 rename examples/granular/{log.29Mar19.pour.flatwall.g++.4 => log.29Oct20.pour.flatwall.g++.4} (52%)

diff --git a/examples/granular/in.pour.drum b/examples/granular/in.pour.drum
index 54372cd391..e0a0455f61 100644
--- a/examples/granular/in.pour.drum
+++ b/examples/granular/in.pour.drum
@@ -2,99 +2,98 @@
 # 'turn' cylinder by changing direction of gravity, then rotate it.
 # This simulates a rotating drum powder characterization experiment.
 
-variable	name string rotating_drum_two_types
+variable        name string rotating_drum_two_types
 
-atom_style	sphere
-units		lj
+atom_style      sphere
+units           lj
 
 ###############################################
 # Geometry-related parameters
 ###############################################
 
-variable	boxx equal 30
-variable	boxy equal 30
-variable	boxz equal 50
+variable        boxx equal 30
+variable        boxy equal 30
+variable        boxz equal 50
 
-variable	drum_rad equal ${boxx}*0.5
-variable	drum_height equal 20
+variable        drum_rad equal ${boxx}*0.5
+variable        drum_height equal 20
 
-variable	xc equal 0.5*${boxx}
-variable	yc equal 0.5*${boxx}
-variable	zc equal 0.5*${boxz}
+variable        xc equal 0.5*${boxx}
+variable        yc equal 0.5*${boxx}
+variable        zc equal 0.5*${boxz}
 
 ###############################################
 # Particle-related parameters
 ###############################################
-variable	rlo equal 0.25
-variable	rhi equal 0.5
-variable	dlo equal 2.0*${rlo}
-variable	dhi equal 2.0*${rhi}
+variable        rlo equal 0.25
+variable        rhi equal 0.5
+variable        dlo equal 2.0*${rlo}
+variable        dhi equal 2.0*${rhi}
 
-variable	cyl_rad_inner equal ${drum_rad}-1.1*${rhi}
+variable        cyl_rad_inner equal ${drum_rad}-1.1*${rhi}
 
-variable	dens equal 1.0
+variable        dens equal 1.0
 
 variable skin equal 0.4*${rhi}
 
 #############
 processors * * 1
-region		boxreg block 0 ${boxx} 0 ${boxy} 0 ${boxz}
-create_box	2 boxreg
-change_box	all boundary p p f
-comm_modify	vel yes
+region          boxreg block 0 ${boxx} 0 ${boxy} 0 ${boxz}
+create_box      2 boxreg
+change_box      all boundary p p f
 
-variable	theta equal 0
+pair_style      granular
+pair_coeff      1 * hertz/material 1e5 0.2 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji
+pair_coeff      2 2 jkr 1e5 0.1 0.3 50 tangential mindlin NULL 1.0 0.5 rolling sds 1e3 1e3 0.1 twisting marshall damping tsuji
 
-region		curved_wall cylinder z ${xc} ${yc} ${drum_rad} 0 ${drum_height} side in rotate v_theta ${xc} ${yc} 0 0 0 1
-region		bottom_wall plane ${xc} ${yc} 0 0 0 1 side in rotate v_theta ${xc} ${yc} 0 0 0 1
+variable        theta equal 0
 
-region		insreg cylinder z ${xc} ${yc} ${cyl_rad_inner} ${drum_height} ${boxz}
+region          curved_wall cylinder z ${xc} ${yc} ${drum_rad} 0 ${drum_height} side in rotate v_theta ${xc} ${yc} 0 0 0 1
+region          bottom_wall plane ${xc} ${yc} 0 0 0 1 side in rotate v_theta ${xc} ${yc} 0 0 0 1
 
-fix		0 all balance 100 1.0 shift xy 5 1.1
-fix		1 all nve/sphere
-fix		grav all gravity 10 vector 0 0 -1
-fix		ins1 all pour 2000 1 1234 region insreg diam range ${dlo} ${dhi} dens ${dens} ${dens}
-fix		ins2 all pour 2000 2 1234 region insreg diam range ${dlo} ${dhi} dens ${dens} ${dens}
+region          insreg cylinder z ${xc} ${yc} ${cyl_rad_inner} ${drum_height} ${boxz}
 
-comm_modify	vel yes
+fix             0 all balance 100 1.0 shift xy 5 1.1
+fix             1 all nve/sphere
+fix             grav all gravity 10 vector 0 0 -1
+fix             ins1 all pour 2000 1 1234 region insreg diam range ${dlo} ${dhi} dens ${dens} ${dens}
+fix             ins2 all pour 2000 2 1234 region insreg diam range ${dlo} ${dhi} dens ${dens} ${dens}
 
-neighbor	${skin} bin
-neigh_modify	delay 0 every 1 check yes
+comm_modify     vel yes
 
-pair_style	granular 
-pair_coeff	1 * hertz/material 1e5 0.2 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji
-pair_coeff	2 2 jkr 1e5 0.1 0.3 50 tangential mindlin NULL 1.0 0.5 rolling sds 1e3 1e3 0.1 twisting marshall damping tsuji
+neighbor        ${skin} bin
+neigh_modify    delay 0 every 1 check yes
 
-fix		3 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region curved_wall 
-fix		4 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region bottom_wall
+fix             3 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region curved_wall
+fix             4 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region bottom_wall
 
-thermo_style	custom step atoms ke v_theta
-thermo_modify	lost warn
-thermo		100
+thermo_style    custom step atoms ke v_theta
+thermo_modify   lost warn
+thermo          100
 
-timestep	0.001
+timestep        0.001
 
-#dump		1 all custom 100 ${name}.dump id type radius mass x y z 
+#dump           1 all custom 100 ${name}.dump id type radius mass x y z
 
 #For removal later
-compute		1 all property/atom radius
-variable	zmax atom z+c_1>0.5*${drum_height}
-group		delgroup dynamic all var zmax every 10000
+compute         1 all property/atom radius
+variable        zmax atom z+c_1>0.5*${drum_height}
+group           delgroup dynamic all var zmax every 10000
 
-run		2000
+run             2000
 
 #Remove any particles that are above z > 0.5*drum_height
-delete_atoms	group delgroup
+delete_atoms    group delgroup
 
 #Add top lid
-region		top_wall plane ${xc} ${yc} ${drum_height} 0 0 -1 side in rotate v_theta ${xc} ${yc} 0 0 0 1
-fix		5 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region top_wall
+region          top_wall plane ${xc} ${yc} ${drum_height} 0 0 -1 side in rotate v_theta ${xc} ${yc} 0 0 0 1
+fix             5 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region top_wall
 
 # 'Turn' drum by switching the direction of gravity
-unfix		grav
-unfix		ins1
-unfix		ins2
-fix		grav all gravity 10 vector 0 -1 0
+unfix           grav
+unfix           ins1
+unfix           ins2
+fix             grav all gravity 10 vector 0 -1 0
 
-variable	theta equal 2*PI*elapsed/20000.0
-run		3000
+variable        theta equal 2*PI*elapsed/20000.0
+run             3000
diff --git a/examples/granular/in.pour.flatwall b/examples/granular/in.pour.flatwall
index cfa70e2d84..74d7c7370e 100644
--- a/examples/granular/in.pour.flatwall
+++ b/examples/granular/in.pour.flatwall
@@ -1,67 +1,65 @@
 # pour two types of particles (cohesive and non-cohesive) on flat wall
 
-variable   	name string pour_two_types
+variable        name string pour_two_types
 
-atom_style	sphere
-units		lj
+atom_style      sphere
+units           lj
 
 ###############################################
 # Geometry-related parameters
 ###############################################
 
-variable	boxx equal 20
-variable	boxy equal 20
-variable	boxz equal 30
+variable        boxx equal 20
+variable        boxy equal 20
+variable        boxz equal 30
 
-variable	xc1 equal 0.3*${boxx}
-variable	xc2 equal 0.7*${boxx}
-variable	yc equal 0.5*${boxy}
+variable        xc1 equal 0.3*${boxx}
+variable        xc2 equal 0.7*${boxx}
+variable        yc equal 0.5*${boxy}
 
 ###############################################
 # Particle-related parameters
 ###############################################
-variable	rlo equal 0.25
-variable	rhi equal 0.5
-variable	dlo equal 2.0*${rlo}
-variable	dhi equal 2.0*${rhi}
+variable        rlo equal 0.25
+variable        rhi equal 0.5
+variable        dlo equal 2.0*${rlo}
+variable        dhi equal 2.0*${rhi}
 
-variable	dens equal 1.0
+variable        dens equal 1.0
 
 variable skin equal 0.3*${rhi}
 
 #############
-processors	* * 1
-region 		boxreg block 0 ${boxx} 0 ${boxy} 0 ${boxz}
-create_box	2 boxreg
-change_box	all boundary p p f
+processors      * * 1
+region          boxreg block 0 ${boxx} 0 ${boxy} 0 ${boxz}
+create_box      2 boxreg
+change_box      all boundary p p f
 
-comm_modify 	vel yes
+pair_style      granular
+pair_coeff      1 * jkr 1000.0 50.0 0.3 10 tangential mindlin 800.0 1.0 0.5 rolling sds 500.0 200.0 0.5 twisting marshall
+pair_coeff      2 2 hertz 200.0 20.0 tangential linear_history 300.0 1.0 0.1 rolling sds 200.0 100.0 0.1 twisting marshall
 
-region		insreg1 cylinder z ${xc1} ${yc} 5 15 ${boxz}
-region		insreg2 cylinder z ${xc2} ${yc} 5 15 ${boxz}
+comm_modify     vel yes
 
-fix		1 all nve/sphere
-fix		grav all gravity 10.0 vector 0 0 -1
-fix		ins1 all pour 1500 1 3123 region insreg1 diam range ${dlo} ${dhi} dens ${dens} ${dens}
-fix		ins2 all pour 1500 2 3123 region insreg2 diam range ${dlo} ${dhi} dens ${dens} ${dens}
+region          insreg1 cylinder z ${xc1} ${yc} 5 15 ${boxz}
+region          insreg2 cylinder z ${xc2} ${yc} 5 15 ${boxz}
 
-comm_modify	vel yes
+fix             1 all nve/sphere
+fix             grav all gravity 10.0 vector 0 0 -1
+fix             ins1 all pour 1500 1 3123 region insreg1 diam range ${dlo} ${dhi} dens ${dens} ${dens}
+fix             ins2 all pour 1500 2 3123 region insreg2 diam range ${dlo} ${dhi} dens ${dens} ${dens}
 
-neighbor	${skin} bin
-neigh_modify	delay 0 every 1 check yes
+neighbor        ${skin} bin
+neigh_modify    delay 0 every 1 check yes
 
-pair_style	granular 
-pair_coeff 	1 * jkr 1000.0 50.0 0.3 10 tangential mindlin 800.0 1.0 0.5 rolling sds 500.0 200.0 0.5 twisting marshall
-pair_coeff 	2 2 hertz 200.0 20.0 tangential linear_history 300.0 1.0 0.1 rolling sds 200.0 100.0 0.1 twisting marshall 
+fix             3 all wall/gran granular hertz/material 1e5 1e3 0.3 tangential mindlin NULL 1.0 0.5 zplane 0 NULL
 
-fix		3 all wall/gran granular hertz/material 1e5 1e3 0.3 tangential mindlin NULL 1.0 0.5 zplane 0 NULL 
+thermo_style    custom step atoms ke
+thermo_modify   lost warn
+thermo          100
 
-thermo_style	custom step cpu atoms ke
-thermo_modify	lost warn
-thermo		100
+timestep        0.001
 
-timestep	0.001
+#dump           1 all custom 100 ${name}.dump id type radius mass x y z
 
-#dump		1 all custom 100 ${name}.dump id type radius mass x y z 
-
-run		5000
+run             5000
diff --git a/examples/granular/log.29Mar19.pour.drum.g++.1 b/examples/granular/log.29Oct20.pour.drum.g++.1
similarity index 69%
rename from examples/granular/log.29Mar19.pour.drum.g++.1
rename to examples/granular/log.29Oct20.pour.drum.g++.1
index 71232ed8dd..8c4b828488 100644
--- a/examples/granular/log.29Mar19.pour.drum.g++.1
+++ b/examples/granular/log.29Oct20.pour.drum.g++.1
@@ -1,5 +1,4 @@
-LAMMPS (29 Mar 2019)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:88)
+LAMMPS (29 Oct 2020)
   using 1 OpenMP thread(s) per MPI task
 # pour two types of particles (cohesive and non-cohesive) into cylinder
 # 'turn' cylinder by changing direction of gravity, then rotate it.
@@ -55,10 +54,14 @@ region		boxreg block 0 30 0 ${boxy} 0 ${boxz}
 region		boxreg block 0 30 0 30 0 ${boxz}
 region		boxreg block 0 30 0 30 0 50
 create_box	2 boxreg
-Created orthogonal box = (0 0 0) to (30 30 50)
+Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (30.000000 30.000000 50.000000)
   1 by 1 by 1 MPI processor grid
 change_box	all boundary p p f
-comm_modify	vel yes
+Changing box ...
+
+pair_style	granular
+pair_coeff	1 * hertz/material 1e5 0.2 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji
+pair_coeff	2 2 jkr 1e5 0.1 0.3 50 tangential mindlin NULL 1.0 0.5 rolling sds 1e3 1e3 0.1 twisting marshall damping tsuji
 
 variable	theta equal 0
 
@@ -104,10 +107,6 @@ neighbor	${skin} bin
 neighbor	0.2 bin
 neigh_modify	delay 0 every 1 check yes
 
-pair_style	granular
-pair_coeff	1 * hertz/material 1e5 0.2 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji
-pair_coeff	2 2 jkr 1e5 0.1 0.3 50 tangential mindlin NULL 1.0 0.5 rolling sds 1e3 1e3 0.1 twisting marshall damping tsuji
-
 fix		3 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region curved_wall
 fix		4 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region bottom_wall
 
@@ -117,8 +116,7 @@ thermo		100
 
 timestep	0.001
 
-dump		1 all custom 100 ${name}.dump id type radius mass x y z
-dump		1 all custom 100 rotating_drum_two_types.dump id type radius mass x y z
+#dump		1 all custom 100 ${name}.dump id type radius mass x y z
 
 #For removal later
 compute		1 all property/atom radius
@@ -140,7 +138,7 @@ Neighbor list info ...
       pair build: half/size/bin/newton
       stencil: half/bin/3d/newton
       bin: standard
-Per MPI rank memory allocation (min/avg/max) = 13.02 | 13.02 | 13.02 Mbytes
+Per MPI rank memory allocation (min/avg/max) = 13.03 | 13.03 | 13.03 Mbytes
 Step Atoms KinEng v_theta 
        0        0           -0            0 
      100     4000           -0            0 
@@ -163,32 +161,32 @@ Step Atoms KinEng v_theta
     1800     4000           -0            0 
     1900     4000           -0            0 
     2000     4000           -0            0 
-Loop time of 3.54461 on 1 procs for 2000 steps with 4000 atoms
+Loop time of 10.5178 on 1 procs for 2000 steps with 4000 atoms
 
-Performance: 48750.057 tau/day, 564.237 timesteps/s
-99.5% CPU use with 1 MPI tasks x 1 OpenMP threads
+Performance: 16429.309 tau/day, 190.154 timesteps/s
+99.8% CPU use with 1 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 0.61949    | 0.61949    | 0.61949    |   0.0 | 17.48
-Neigh   | 1.2492     | 1.2492     | 1.2492     |   0.0 | 35.24
-Comm    | 0.046404   | 0.046404   | 0.046404   |   0.0 |  1.31
-Output  | 0.15901    | 0.15901    | 0.15901    |   0.0 |  4.49
-Modify  | 1.4165     | 1.4165     | 1.4165     |   0.0 | 39.96
-Other   |            | 0.05391    |            |       |  1.52
+Pair    | 1.0701     | 1.0701     | 1.0701     |   0.0 | 10.17
+Neigh   | 4.2135     | 4.2135     | 4.2135     |   0.0 | 40.06
+Comm    | 0.38276    | 0.38276    | 0.38276    |   0.0 |  3.64
+Output  | 0.0013647  | 0.0013647  | 0.0013647  |   0.0 |  0.01
+Modify  | 4.7076     | 4.7076     | 4.7076     |   0.0 | 44.76
+Other   |            | 0.1424     |            |       |  1.35
 
-Nlocal:    4000 ave 4000 max 4000 min
+Nlocal:        4000.00 ave        4000 max        4000 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    166 ave 166 max 166 min
+Nghost:        171.000 ave         171 max         171 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    8195 ave 8195 max 8195 min
+Neighs:        8093.00 ave        8093 max        8093 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
 
-Total # of neighbors = 8195
-Ave neighs/atom = 2.04875
+Total # of neighbors = 8093
+Ave neighs/atom = 2.0232500
 Neighbor list builds = 1004
-Dangerous builds = 3
+Dangerous builds = 4
 
 #Remove any particles that are above z > 0.5*drum_height
 delete_atoms	group delgroup
@@ -205,67 +203,69 @@ fix		5 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindl
 
 # 'Turn' drum by switching the direction of gravity
 unfix		grav
+unfix		ins1
+unfix		ins2
 fix		grav all gravity 10 vector 0 -1 0
 
 variable	theta equal 2*PI*elapsed/20000.0
 run		3000
-Per MPI rank memory allocation (min/avg/max) = 24.81 | 24.81 | 24.81 Mbytes
+Per MPI rank memory allocation (min/avg/max) = 19.37 | 19.37 | 19.37 Mbytes
 Step Atoms KinEng v_theta 
-    2000     4000    64.333531            0 
-    2100     4000    106.69182  0.031415927 
-    2200     4000     121.8461  0.062831853 
-    2300     4000    88.767952   0.09424778 
-    2400     4000    82.850721   0.12566371 
-    2500     4000    91.683284   0.15707963 
-    2600     4000     31.56344   0.18849556 
-    2700     4000    4.5697672   0.21991149 
-    2800     4000    3.9879051   0.25132741 
-    2900     4000    4.4394235   0.28274334 
-    3000     4000    5.1212931   0.31415927 
-    3100     4000    5.8608892   0.34557519 
-    3200     4000     6.600714   0.37699112 
-    3300     4000    7.3497851   0.40840704 
-    3400     4000    8.0490988   0.43982297 
-    3500     4000    8.6712396    0.4712389 
-    3600     4000    9.1328667   0.50265482 
-    3700     4000    9.4683561   0.53407075 
-    3800     4000    9.5878145   0.56548668 
-    3900     4000     9.387745    0.5969026 
-    4000     4000    8.9117631   0.62831853 
-    4100     4000    8.2344368   0.65973446 
-    4200     4000    7.5335088   0.69115038 
-    4300     4000    6.8426179   0.72256631 
-    4400     4000    6.0567247   0.75398224 
-    4500     4000    5.4166132   0.78539816 
-    4600     4000    4.6012409   0.81681409 
-    4700     4000    3.8314982   0.84823002 
-    4800     4000    3.1916415   0.87964594 
-    4900     4000    2.7833964   0.91106187 
-    5000     4000    2.5051362    0.9424778 
-Loop time of 11.9545 on 1 procs for 3000 steps with 4000 atoms
+    2000     4000    65.647582            0 
+    2100     4000    105.60001  0.031415927 
+    2200     4000    112.27573  0.062831853 
+    2300     4000    92.758671   0.09424778 
+    2400     4000    88.925835   0.12566371 
+    2500     4000    81.369163   0.15707963 
+    2600     4000    32.046943   0.18849556 
+    2700     4000    4.1926368   0.21991149 
+    2800     4000    3.9933453   0.25132741 
+    2900     4000    4.5062193   0.28274334 
+    3000     4000    5.3409521   0.31415927 
+    3100     4000    6.0165991   0.34557519 
+    3200     4000     6.606767   0.37699112 
+    3300     4000    7.3997751   0.40840704 
+    3400     4000    8.1098807   0.43982297 
+    3500     4000    8.6552424    0.4712389 
+    3600     4000    9.8445204   0.50265482 
+    3700     4000    10.098753   0.53407075 
+    3800     4000    10.039489   0.56548668 
+    3900     4000    9.6376278    0.5969026 
+    4000     4000    9.2598836   0.62831853 
+    4100     4000    8.7116037   0.65973446 
+    4200     4000    8.1274117   0.69115038 
+    4300     4000    7.1487627   0.72256631 
+    4400     4000    6.2253778   0.75398224 
+    4500     4000    5.3061398   0.78539816 
+    4600     4000    4.4319316   0.81681409 
+    4700     4000     4.205607   0.84823002 
+    4800     4000    3.2112987   0.87964594 
+    4900     4000    2.6449777   0.91106187 
+    5000     4000    2.3475497    0.9424778 
+Loop time of 32.4926 on 1 procs for 3000 steps with 4000 atoms
 
-Performance: 21682.142 tau/day, 250.951 timesteps/s
-99.7% CPU use with 1 MPI tasks x 1 OpenMP threads
+Performance: 7977.205 tau/day, 92.329 timesteps/s
+99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 4.8291     | 4.8291     | 4.8291     |   0.0 | 40.40
-Neigh   | 2.7489     | 2.7489     | 2.7489     |   0.0 | 22.99
-Comm    | 0.071249   | 0.071249   | 0.071249   |   0.0 |  0.60
-Output  | 0.20547    | 0.20547    | 0.20547    |   0.0 |  1.72
-Modify  | 4.0179     | 4.0179     | 4.0179     |   0.0 | 33.61
-Other   |            | 0.0819     |            |       |  0.69
+Pair    | 8.0124     | 8.0124     | 8.0124     |   0.0 | 24.66
+Neigh   | 10.993     | 10.993     | 10.993     |   0.0 | 33.83
+Comm    | 0.86697    | 0.86697    | 0.86697    |   0.0 |  2.67
+Output  | 0.0021827  | 0.0021827  | 0.0021827  |   0.0 |  0.01
+Modify  | 12.367     | 12.367     | 12.367     |   0.0 | 38.06
+Other   |            | 0.2515     |            |       |  0.77
 
-Nlocal:    4000 ave 4000 max 4000 min
+Nlocal:        4000.00 ave        4000 max        4000 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
-Nghost:    322 ave 322 max 322 min
+Nghost:        318.000 ave         318 max         318 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
-Neighs:    14849 ave 14849 max 14849 min
+Neighs:        14807.0 ave       14807 max       14807 min
 Histogram: 1 0 0 0 0 0 0 0 0 0
 
-Total # of neighbors = 14849
-Ave neighs/atom = 3.71225
-Neighbor list builds = 1290
-Dangerous builds = 672
-Total wall time: 0:00:15
+Total # of neighbors = 14807
+Ave neighs/atom = 3.7017500
+Neighbor list builds = 2189
+Dangerous builds = 1536
+Total wall time: 0:00:43
diff --git a/examples/granular/log.29Mar19.pour.drum.g++.4 b/examples/granular/log.29Oct20.pour.drum.g++.4
similarity index 69%
rename from examples/granular/log.29Mar19.pour.drum.g++.4
rename to examples/granular/log.29Oct20.pour.drum.g++.4
index ccd4365a38..e53da50b9b 100644
--- a/examples/granular/log.29Mar19.pour.drum.g++.4
+++ b/examples/granular/log.29Oct20.pour.drum.g++.4
@@ -1,5 +1,4 @@
-LAMMPS (29 Mar 2019)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:88)
+LAMMPS (29 Oct 2020)
   using 1 OpenMP thread(s) per MPI task
 # pour two types of particles (cohesive and non-cohesive) into cylinder
 # 'turn' cylinder by changing direction of gravity, then rotate it.
@@ -55,10 +54,14 @@ region		boxreg block 0 30 0 ${boxy} 0 ${boxz}
 region		boxreg block 0 30 0 30 0 ${boxz}
 region		boxreg block 0 30 0 30 0 50
 create_box	2 boxreg
-Created orthogonal box = (0 0 0) to (30 30 50)
+Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (30.000000 30.000000 50.000000)
   2 by 2 by 1 MPI processor grid
 change_box	all boundary p p f
-comm_modify	vel yes
+Changing box ...
+
+pair_style	granular
+pair_coeff	1 * hertz/material 1e5 0.2 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji
+pair_coeff	2 2 jkr 1e5 0.1 0.3 50 tangential mindlin NULL 1.0 0.5 rolling sds 1e3 1e3 0.1 twisting marshall damping tsuji
 
 variable	theta equal 0
 
@@ -104,10 +107,6 @@ neighbor	${skin} bin
 neighbor	0.2 bin
 neigh_modify	delay 0 every 1 check yes
 
-pair_style	granular
-pair_coeff	1 * hertz/material 1e5 0.2 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji
-pair_coeff	2 2 jkr 1e5 0.1 0.3 50 tangential mindlin NULL 1.0 0.5 rolling sds 1e3 1e3 0.1 twisting marshall damping tsuji
-
 fix		3 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region curved_wall
 fix		4 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindlin NULL 1.0 0.5 damping tsuji region bottom_wall
 
@@ -117,8 +116,7 @@ thermo		100
 
 timestep	0.001
 
-dump		1 all custom 100 ${name}.dump id type radius mass x y z
-dump		1 all custom 100 rotating_drum_two_types.dump id type radius mass x y z
+#dump		1 all custom 100 ${name}.dump id type radius mass x y z
 
 #For removal later
 compute		1 all property/atom radius
@@ -163,32 +161,32 @@ Step Atoms KinEng v_theta
     1800     4000           -0            0 
     1900     4000           -0            0 
     2000     4000           -0            0 
-Loop time of 2.0709 on 4 procs for 2000 steps with 4000 atoms
+Loop time of 3.86825 on 4 procs for 2000 steps with 4000 atoms
 
-Performance: 83442.024 tau/day, 965.764 timesteps/s
-97.7% CPU use with 4 MPI tasks x 1 OpenMP threads
+Performance: 44671.398 tau/day, 517.030 timesteps/s
+96.7% CPU use with 4 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 0.24679    | 0.26336    | 0.28853    |   3.0 | 12.72
-Neigh   | 0.52279    | 0.5332     | 0.53858    |   0.9 | 25.75
-Comm    | 0.17418    | 0.20253    | 0.23266    |   4.7 |  9.78
-Output  | 0.092897   | 0.093531   | 0.09515    |   0.3 |  4.52
-Modify  | 0.88151    | 0.89571    | 0.90582    |   0.9 | 43.25
-Other   |            | 0.08257    |            |       |  3.99
+Pair    | 0.26114    | 0.27918    | 0.28728    |   2.0 |  7.22
+Neigh   | 1.2044     | 1.2414     | 1.3105     |   3.7 | 32.09
+Comm    | 0.38592    | 0.47065    | 0.51052    |   7.4 | 12.17
+Output  | 0.0007236  | 0.0013456  | 0.0024846  |   1.8 |  0.03
+Modify  | 1.6217     | 1.6723     | 1.7801     |   5.0 | 43.23
+Other   |            | 0.2034     |            |       |  5.26
 
-Nlocal:    1000 ave 1001 max 999 min
+Nlocal:        1000.00 ave        1012 max         988 min
 Histogram: 2 0 0 0 0 0 0 0 0 2
-Nghost:    267.75 ave 276 max 262 min
-Histogram: 1 0 1 0 1 0 0 0 0 1
-Neighs:    2031.5 ave 2091 max 1958 min
-Histogram: 1 0 0 0 1 0 0 1 0 1
+Nghost:        269.250 ave         278 max         256 min
+Histogram: 1 0 0 0 0 0 1 1 0 1
+Neighs:        2060.50 ave        2156 max        1921 min
+Histogram: 1 0 0 1 0 0 0 0 0 2
 
-Total # of neighbors = 8126
-Ave neighs/atom = 2.0315
+Total # of neighbors = 8242
+Ave neighs/atom = 2.0605000
 Neighbor list builds = 1004
-Dangerous builds = 3
+Dangerous builds = 4
 
 #Remove any particles that are above z > 0.5*drum_height
 delete_atoms	group delgroup
@@ -205,67 +203,69 @@ fix		5 all wall/gran/region granular hertz/material 1e5 0.1 0.3 tangential mindl
 
 # 'Turn' drum by switching the direction of gravity
 unfix		grav
+unfix		ins1
+unfix		ins2
 fix		grav all gravity 10 vector 0 -1 0
 
 variable	theta equal 2*PI*elapsed/20000.0
 run		3000
-Per MPI rank memory allocation (min/avg/max) = 21.6 | 22.6 | 23.82 Mbytes
+Per MPI rank memory allocation (min/avg/max) = 18.55 | 18.55 | 18.55 Mbytes
 Step Atoms KinEng v_theta 
-    2000     4000    64.255821            0 
-    2100     4000    106.47082  0.031415927 
-    2200     4000    121.52634  0.062831853 
-    2300     4000    87.748818   0.09424778 
-    2400     4000    82.712784   0.12566371 
-    2500     4000    90.618713   0.15707963 
-    2600     4000    30.096031   0.18849556 
-    2700     4000    4.0838611   0.21991149 
-    2800     4000    3.7485959   0.25132741 
-    2900     4000    4.2159774   0.28274334 
-    3000     4000    4.8730048   0.31415927 
-    3100     4000    5.6109465   0.34557519 
-    3200     4000    6.4290528   0.37699112 
-    3300     4000    7.2699677   0.40840704 
-    3400     4000    8.0895944   0.43982297 
-    3500     4000    8.7222781    0.4712389 
-    3600     4000     9.133205   0.50265482 
-    3700     4000    9.3404584   0.53407075 
-    3800     4000    9.3359844   0.56548668 
-    3900     4000    9.0916854    0.5969026 
-    4000     4000    8.5596424   0.62831853 
-    4100     4000    7.9734883   0.65973446 
-    4200     4000    7.2154383   0.69115038 
-    4300     4000    6.7039232   0.72256631 
-    4400     4000    6.1542738   0.75398224 
-    4500     4000    5.4049454   0.78539816 
-    4600     4000    4.4603192   0.81681409 
-    4700     4000    3.6197985   0.84823002 
-    4800     4000    2.9895571   0.87964594 
-    4900     4000    2.5314553   0.91106187 
-    5000     4000    2.2645533    0.9424778 
-Loop time of 6.64209 on 4 procs for 3000 steps with 4000 atoms
+    2000     4000    65.819213            0 
+    2100     4000    105.02389  0.031415927 
+    2200     4000    112.02469  0.062831853 
+    2300     4000    92.271262   0.09424778 
+    2400     4000    89.369506   0.12566371 
+    2500     4000    80.910925   0.15707963 
+    2600     4000    31.620722   0.18849556 
+    2700     4000    4.3019937   0.21991149 
+    2800     4000    3.9913967   0.25132741 
+    2900     4000    4.5203726   0.28274334 
+    3000     4000     5.484886   0.31415927 
+    3100     4000    6.1085958   0.34557519 
+    3200     4000    6.7085635   0.37699112 
+    3300     4000    7.4787777   0.40840704 
+    3400     4000    8.2116413   0.43982297 
+    3500     4000    8.7979302    0.4712389 
+    3600     4000     9.871649   0.50265482 
+    3700     4000    10.012426   0.53407075 
+    3800     4000    9.9067754   0.56548668 
+    3900     4000     9.725458    0.5969026 
+    4000     4000    9.3350056   0.62831853 
+    4100     4000    8.8337295   0.65973446 
+    4200     4000    8.2712493   0.69115038 
+    4300     4000    6.9609934   0.72256631 
+    4400     4000    6.0120294   0.75398224 
+    4500     4000    5.0490036   0.78539816 
+    4600     4000    4.2796544   0.81681409 
+    4700     4000    4.1736483   0.84823002 
+    4800     4000    3.0860106   0.87964594 
+    4900     4000    2.6670909   0.91106187 
+    5000     4000    2.2901814    0.9424778 
+Loop time of 10.7627 on 4 procs for 3000 steps with 4000 atoms
 
-Performance: 39023.861 tau/day, 451.665 timesteps/s
-96.6% CPU use with 4 MPI tasks x 1 OpenMP threads
+Performance: 24083.252 tau/day, 278.741 timesteps/s
+97.9% CPU use with 4 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 1.8376     | 2.126      | 2.3131     |  12.6 | 32.01
-Neigh   | 0.97762    | 1.0518     | 1.1337     |   5.4 | 15.84
-Comm    | 0.53699    | 0.84265    | 1.2325     |  27.6 | 12.69
-Output  | 0.13922    | 0.14159    | 0.14388    |   0.4 |  2.13
-Modify  | 1.8815     | 2.1026     | 2.3368     |  11.2 | 31.66
-Other   |            | 0.3774     |            |       |  5.68
+Pair    | 1.6731     | 2.0701     | 2.3327     |  18.9 | 19.23
+Neigh   | 2.7389     | 3.1706     | 3.5146     |  15.7 | 29.46
+Comm    | 0.93507    | 1.5441     | 2.1182     |  39.1 | 14.35
+Output  | 0.0021682  | 0.0044412  | 0.006026   |   2.2 |  0.04
+Modify  | 3.0031     | 3.4223     | 3.9262     |  18.3 | 31.80
+Other   |            | 0.5511     |            |       |  5.12
 
-Nlocal:    1000 ave 1256 max 744 min
+Nlocal:        1000.00 ave        1277 max         723 min
 Histogram: 2 0 0 0 0 0 0 0 0 2
-Nghost:    579.5 ave 789 max 498 min
-Histogram: 2 1 0 0 0 0 0 0 0 1
-Neighs:    3696.25 ave 4853 max 2590 min
-Histogram: 2 0 0 0 0 0 0 0 1 1
+Nghost:        569.750 ave         809 max         454 min
+Histogram: 1 2 0 0 0 0 0 0 0 1
+Neighs:        3690.50 ave        4937 max        2426 min
+Histogram: 1 1 0 0 0 0 0 0 0 2
 
-Total # of neighbors = 14785
-Ave neighs/atom = 3.69625
-Neighbor list builds = 1230
-Dangerous builds = 676
-Total wall time: 0:00:08
+Total # of neighbors = 14762
+Ave neighs/atom = 3.6905000
+Neighbor list builds = 2187
+Dangerous builds = 1610
+Total wall time: 0:00:14
diff --git a/examples/granular/log.29Mar19.pour.flatwall.g++.1 b/examples/granular/log.29Oct20.pour.flatwall.g++.1
similarity index 56%
rename from examples/granular/log.29Mar19.pour.flatwall.g++.1
rename to examples/granular/log.29Oct20.pour.flatwall.g++.1
index daab5efce3..00fc8e31a1 100644
--- a/examples/granular/log.29Mar19.pour.flatwall.g++.1
+++ b/examples/granular/log.29Oct20.pour.flatwall.g++.1
@@ -1,5 +1,4 @@
-LAMMPS (29 Mar 2019)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:88)
+LAMMPS (29 Oct 2020)
   using 1 OpenMP thread(s) per MPI task
 # pour two types of particles (cohesive and non-cohesive) on flat wall
 
@@ -45,9 +44,14 @@ region 		boxreg block 0 20 0 ${boxy} 0 ${boxz}
 region 		boxreg block 0 20 0 20 0 ${boxz}
 region 		boxreg block 0 20 0 20 0 30
 create_box	2 boxreg
-Created orthogonal box = (0 0 0) to (20 20 30)
+Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (20.000000 20.000000 30.000000)
   1 by 1 by 1 MPI processor grid
 change_box	all boundary p p f
+Changing box ...
+
+pair_style	granular
+pair_coeff 	1 * jkr 1000.0 50.0 0.3 10 tangential mindlin 800.0 1.0 0.5 rolling sds 500.0 200.0 0.5 twisting marshall
+pair_coeff 	2 2 hertz 200.0 20.0 tangential linear_history 300.0 1.0 0.1 rolling sds 200.0 100.0 0.1 twisting marshall
 
 comm_modify 	vel yes
 
@@ -75,19 +79,13 @@ fix		ins2 all pour 1500 2 3123 region insreg2 diam range 0.5 1 dens 1 ${dens}
 fix		ins2 all pour 1500 2 3123 region insreg2 diam range 0.5 1 dens 1 1
 Particle insertion: 562 every 346 steps, 1500 by step 693
 
-comm_modify	vel yes
-
 neighbor	${skin} bin
 neighbor	0.15 bin
 neigh_modify	delay 0 every 1 check yes
 
-pair_style	granular
-pair_coeff 	1 * jkr 1000.0 50.0 0.3 10 tangential mindlin 800.0 1.0 0.5 rolling sds 500.0 200.0 0.5 twisting marshall
-pair_coeff 	2 2 hertz 200.0 20.0 tangential linear_history 300.0 1.0 0.1 rolling sds 200.0 100.0 0.1 twisting marshall
-
 fix		3 all wall/gran granular hertz/material 1e5 1e3 0.3 tangential mindlin NULL 1.0 0.5 zplane 0 NULL
 
-thermo_style	custom step cpu atoms ke
+thermo_style	custom step atoms ke
 thermo_modify	lost warn
 thermo		100
 
@@ -109,26 +107,82 @@ Neighbor list info ...
       stencil: half/bin/3d/newton
       bin: standard
 Per MPI rank memory allocation (min/avg/max) = 12.22 | 12.22 | 12.22 Mbytes
-Step CPU Atoms KinEng 
-       0            0        0           -0 
-     100    3.8153191      855           -0 
-     200     4.195287      855           -0 
-     300    4.5890362      855           -0 
-     400    10.636087     1500           -0 
-     500    11.306909     1500           -0 
-     600    11.968198     1500           -0 
-     700    22.631892     2288           -0 
-     800    23.711387     2288           -0 
-     900    24.754344     2288           -0 
-    1000    25.811778     2288           -0 
-    1100    35.368869     2845           -0 
-    1200    37.149843     2845           -0 
-    1300    39.026458     2845           -0 
-    1400    41.757583     3000           -0 
-    1500    45.155503     3000           -0 
-    1600    48.570241     3000           -0 
-    1700    52.839322     3000           -0 
-    1800    59.772697     3000           -0 
-    1900    69.493305     3000           -0 
-    2000    114.61886     3000           -0 
-    2100    152.89232     3000           -0 
+Step Atoms KinEng 
+       0        0           -0 
+     100      926           -0 
+     200      926           -0 
+     300      926           -0 
+     400     1498           -0 
+     500     1498           -0 
+     600     1498           -0 
+     700     2275           -0 
+     800     2275           -0 
+     900     2275           -0 
+    1000     2275           -0 
+    1100     2954           -0 
+    1200     2954           -0 
+    1300     2954           -0 
+    1400     3000           -0 
+    1500     3000           -0 
+    1600     3000           -0 
+    1700     3000           -0 
+    1800     3000           -0 
+    1900     3000           -0 
+    2000     3000           -0 
+    2100     3000           -0 
+    2200     3000           -0 
+    2300     3000           -0 
+    2400     3000           -0 
+    2500     3000           -0 
+    2600     3000           -0 
+    2700     3000           -0 
+    2800     3000           -0 
+    2900     3000           -0 
+    3000     3000           -0 
+    3100     3000           -0 
+    3200     3000           -0 
+    3300     3000           -0 
+    3400     3000           -0 
+    3500     3000           -0 
+    3600     3000           -0 
+    3700     3000           -0 
+    3800     3000           -0 
+    3900     3000           -0 
+    4000     3000           -0 
+    4100     3000           -0 
+    4200     3000           -0 
+    4300     3000           -0 
+    4400     3000           -0 
+    4500     3000           -0 
+    4600     3000           -0 
+    4700     3000           -0 
+    4800     3000           -0 
+    4900     3000           -0 
+    5000     3000           -0 
+Loop time of 24.3889 on 1 procs for 5000 steps with 3000 atoms
+
+Performance: 17713.003 tau/day, 205.012 timesteps/s
+99.9% CPU use with 1 MPI tasks x 1 OpenMP threads
+
+MPI task timing breakdown:
+Section |  min time  |  avg time  |  max time  |%varavg| %total
+---------------------------------------------------------------
+Pair    | 14.362     | 14.362     | 14.362     |   0.0 | 58.89
+Neigh   | 3.3483     | 3.3483     | 3.3483     |   0.0 | 13.73
+Comm    | 0.42893    | 0.42893    | 0.42893    |   0.0 |  1.76
+Output  | 0.0025065  | 0.0025065  | 0.0025065  |   0.0 |  0.01
+Modify  | 6.059      | 6.059      | 6.059      |   0.0 | 24.84
+Other   |            | 0.1876     |            |       |  0.77
+
+Nlocal:        3000.00 ave        3000 max        3000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost:        462.000 ave         462 max         462 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs:        17694.0 ave       17694 max       17694 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 17694
+Ave neighs/atom = 5.8980000
+Neighbor list builds = 1133
+Dangerous builds = 0
+Total wall time: 0:00:24
diff --git a/examples/granular/log.29Mar19.pour.flatwall.g++.4 b/examples/granular/log.29Oct20.pour.flatwall.g++.4
similarity index 52%
rename from examples/granular/log.29Mar19.pour.flatwall.g++.4
rename to examples/granular/log.29Oct20.pour.flatwall.g++.4
index 62a8b96c05..1688e52b43 100644
--- a/examples/granular/log.29Mar19.pour.flatwall.g++.4
+++ b/examples/granular/log.29Oct20.pour.flatwall.g++.4
@@ -1,5 +1,4 @@
-LAMMPS (29 Mar 2019)
-OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (../comm.cpp:88)
+LAMMPS (29 Oct 2020)
   using 1 OpenMP thread(s) per MPI task
 # pour two types of particles (cohesive and non-cohesive) on flat wall
 
@@ -45,9 +44,14 @@ region 		boxreg block 0 20 0 ${boxy} 0 ${boxz}
 region 		boxreg block 0 20 0 20 0 ${boxz}
 region 		boxreg block 0 20 0 20 0 30
 create_box	2 boxreg
-Created orthogonal box = (0 0 0) to (20 20 30)
+Created orthogonal box = (0.0000000 0.0000000 0.0000000) to (20.000000 20.000000 30.000000)
   2 by 2 by 1 MPI processor grid
 change_box	all boundary p p f
+Changing box ...
+
+pair_style	granular
+pair_coeff 	1 * jkr 1000.0 50.0 0.3 10 tangential mindlin 800.0 1.0 0.5 rolling sds 500.0 200.0 0.5 twisting marshall
+pair_coeff 	2 2 hertz 200.0 20.0 tangential linear_history 300.0 1.0 0.1 rolling sds 200.0 100.0 0.1 twisting marshall
 
 comm_modify 	vel yes
 
@@ -75,26 +79,19 @@ fix		ins2 all pour 1500 2 3123 region insreg2 diam range 0.5 1 dens 1 ${dens}
 fix		ins2 all pour 1500 2 3123 region insreg2 diam range 0.5 1 dens 1 1
 Particle insertion: 562 every 346 steps, 1500 by step 693
 
-comm_modify	vel yes
-
 neighbor	${skin} bin
 neighbor	0.15 bin
 neigh_modify	delay 0 every 1 check yes
 
-pair_style	granular
-pair_coeff 	1 * jkr 1000.0 50.0 0.3 10 tangential mindlin 800.0 1.0 0.5 rolling sds 500.0 200.0 0.5 twisting marshall
-pair_coeff 	2 2 hertz 200.0 20.0 tangential linear_history 300.0 1.0 0.1 rolling sds 200.0 100.0 0.1 twisting marshall
-
 fix		3 all wall/gran granular hertz/material 1e5 1e3 0.3 tangential mindlin NULL 1.0 0.5 zplane 0 NULL
 
-thermo_style	custom step cpu atoms ke
+thermo_style	custom step atoms ke
 thermo_modify	lost warn
 thermo		100
 
 timestep	0.001
 
-dump		1 all custom 100 ${name}.dump id type radius mass x y z
-dump		1 all custom 100 pour_two_types.dump id type radius mass x y z
+#dump		1 all custom 100 ${name}.dump id type radius mass x y z
 
 run		5000
 Neighbor list info ...
@@ -110,82 +107,82 @@ Neighbor list info ...
       stencil: half/bin/3d/newton
       bin: standard
 Per MPI rank memory allocation (min/avg/max) = 11.98 | 11.98 | 11.98 Mbytes
-Step CPU Atoms KinEng 
-       0            0        0           -0 
-     100   0.11584234      855           -0 
-     200   0.12743592      855           -0 
-     300   0.13925815      855           -0 
-     400   0.35203671     1500           -0 
-     500   0.37055922     1500           -0 
-     600   0.38671875     1500           -0 
-     700   0.71736908     2288           -0 
-     800   0.74506783     2288           -0 
-     900   0.77112222     2288           -0 
-    1000   0.79632139     2288           -0 
-    1100    1.0384252     2845           -0 
-    1200      1.08093     2845           -0 
-    1300    1.1224561     2845           -0 
-    1400    1.1811485     3000           -0 
-    1500    1.2414908     3000           -0 
-    1600    1.3105879     3000           -0 
-    1700     1.390928     3000           -0 
-    1800    1.4869275     3000           -0 
-    1900    1.5958266     3000           -0 
-    2000    1.7172487     3000           -0 
-    2100     1.851155     3000           -0 
-    2200    1.9957182     3000           -0 
-    2300    2.1593764     3000           -0 
-    2400    2.3433132     3000           -0 
-    2500     2.532742     3000           -0 
-    2600    2.7376895     3000           -0 
-    2700    2.9463468     3000           -0 
-    2800    3.1645725     3000           -0 
-    2900    3.3879526     3000           -0 
-    3000    3.6152103     3000           -0 
-    3100    3.8467371     3000           -0 
-    3200    4.0787683     3000           -0 
-    3300    4.3097105     3000           -0 
-    3400    4.5423617     3000           -0 
-    3500    4.7773693     3000           -0 
-    3600    5.0127218     3000           -0 
-    3700    5.2519271     3000           -0 
-    3800    5.4951298     3000           -0 
-    3900    5.7210469     3000           -0 
-    4000    5.9432652     3000           -0 
-    4100    6.1687591     3000           -0 
-    4200    6.3942792     3000           -0 
-    4300    6.6331475     3000           -0 
-    4400    6.8632154     3000           -0 
-    4500    7.0979366     3000           -0 
-    4600    7.3305347     3000           -0 
-    4700    7.5670528     3000           -0 
-    4800    7.8086057     3000           -0 
-    4900    8.0407174     3000           -0 
-    5000    8.2765219     3000           -0 
-Loop time of 8.27669 on 4 procs for 5000 steps with 3000 atoms
+Step Atoms KinEng 
+       0        0           -0 
+     100      926           -0 
+     200      926           -0 
+     300      926           -0 
+     400     1498           -0 
+     500     1498           -0 
+     600     1498           -0 
+     700     2275           -0 
+     800     2275           -0 
+     900     2275           -0 
+    1000     2275           -0 
+    1100     2954           -0 
+    1200     2954           -0 
+    1300     2954           -0 
+    1400     3000           -0 
+    1500     3000           -0 
+    1600     3000           -0 
+    1700     3000           -0 
+    1800     3000           -0 
+    1900     3000           -0 
+    2000     3000           -0 
+    2100     3000           -0 
+    2200     3000           -0 
+    2300     3000           -0 
+    2400     3000           -0 
+    2500     3000           -0 
+    2600     3000           -0 
+    2700     3000           -0 
+    2800     3000           -0 
+    2900     3000           -0 
+    3000     3000           -0 
+    3100     3000           -0 
+    3200     3000           -0 
+    3300     3000           -0 
+    3400     3000           -0 
+    3500     3000           -0 
+    3600     3000           -0 
+    3700     3000           -0 
+    3800     3000           -0 
+    3900     3000           -0 
+    4000     3000           -0 
+    4100     3000           -0 
+    4200     3000           -0 
+    4300     3000           -0 
+    4400     3000           -0 
+    4500     3000           -0 
+    4600     3000           -0 
+    4700     3000           -0 
+    4800     3000           -0 
+    4900     3000           -0 
+    5000     3000           -0 
+Loop time of 12.1982 on 4 procs for 5000 steps with 3000 atoms
 
-Performance: 52194.788 tau/day, 604.106 timesteps/s
-97.7% CPU use with 4 MPI tasks x 1 OpenMP threads
+Performance: 35414.923 tau/day, 409.895 timesteps/s
+97.0% CPU use with 4 MPI tasks x 1 OpenMP threads
 
 MPI task timing breakdown:
 Section |  min time  |  avg time  |  max time  |%varavg| %total
 ---------------------------------------------------------------
-Pair    | 1.6106     | 3.4073     | 5.4191     |  95.7 | 41.17
-Neigh   | 0.51456    | 0.64572    | 0.81542    |  16.6 |  7.80
-Comm    | 0.2808     | 2.5222     | 4.4998     | 121.9 | 30.47
-Output  | 0.15695    | 0.15919    | 0.16502    |   0.8 |  1.92
-Modify  | 1.3517     | 1.4192     | 1.4904     |   4.9 | 17.15
-Other   |            | 0.123      |            |       |  1.49
+Pair    | 1.7141     | 3.8131     | 6.2143     | 107.3 | 31.26
+Neigh   | 0.77648    | 0.96585    | 1.1892     |  18.3 |  7.92
+Comm    | 0.7427     | 3.5566     | 5.9731     | 128.4 | 29.16
+Output  | 0.0067544  | 0.0086352  | 0.011408   |   1.8 |  0.07
+Modify  | 3.3476     | 3.5826     | 3.8235     |  11.5 | 29.37
+Other   |            | 0.2715     |            |       |  2.23
 
-Nlocal:    750 ave 1036 max 482 min
-Histogram: 2 0 0 0 0 0 0 0 1 1
-Nghost:    429.75 ave 475 max 386 min
+Nlocal:        750.000 ave        1033 max         463 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost:        435.000 ave         492 max         378 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs:        4434.50 ave        7028 max        1967 min
 Histogram: 2 0 0 0 0 0 0 0 0 2
-Neighs:    4051.75 ave 6274 max 2057 min
-Histogram: 2 0 0 0 0 0 0 0 1 1
 
-Total # of neighbors = 16207
-Ave neighs/atom = 5.40233
-Neighbor list builds = 1165
+Total # of neighbors = 17738
+Ave neighs/atom = 5.9126667
+Neighbor list builds = 1139
 Dangerous builds = 0
-Total wall time: 0:00:08
+Total wall time: 0:00:12

From 773a31a628e41a21f5379ab428dfab763c08853b Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Fri, 6 Nov 2020 17:30:08 -0500
Subject: [PATCH 45/64] improve read_dump and rerun documentation. mention that
 native binary dumps are not supported

---
 doc/src/read_dump.rst |  2 ++
 doc/src/rerun.rst     | 29 ++++++++++++++++++-----------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/doc/src/read_dump.rst b/doc/src/read_dump.rst
index c46c12c951..f9a8c87c86 100644
--- a/doc/src/read_dump.rst
+++ b/doc/src/read_dump.rst
@@ -370,6 +370,8 @@ needed to generate absolute, unscaled coordinates.
 Restrictions
 """"""""""""
 
+The *native* dump file reader does not support binary .bin dump files.
+
 To read gzipped dump files, you must compile LAMMPS with the
 -DLAMMPS_GZIP option.  See the :doc:`Build settings <Build_settings>`
 doc page for details.
diff --git a/doc/src/rerun.rst b/doc/src/rerun.rst
index 16b036b449..7d51fba868 100644
--- a/doc/src/rerun.rst
+++ b/doc/src/rerun.rst
@@ -99,14 +99,15 @@ files do not match the specified output frequency.
 ----------
 
 If more than one dump file is specified, the dump files are read one
-after the other.  It is assumed that snapshot timesteps will be in
-ascending order.  If a snapshot is encountered that is not in
-ascending order, it will skip the snapshot until it reads one that is.
+after the other in the order specified.  It is assumed that snapshot
+timesteps will be in ascending order.  If a snapshot is encountered that
+is not in ascending order, it will skip the snapshot until it reads one
+that is.
 This allows skipping of a duplicate snapshot (same timestep),
 e.g. that appeared at the end of one file and beginning of the next.
 However if you specify a series of dump files in an incorrect order
 (with respect to the timesteps they contain), you may skip large
-numbers of snapshots
+numbers of snapshots.
 
 Note that the dump files specified as part of the *dump* keyword can be
 parallel files, i.e. written as multiple files either per processor
@@ -118,17 +119,24 @@ and write parallel dump files.
 
 The *first*\ , *last*\ , *every*\ , *skip* keywords determine which
 snapshots are read from the dump file(s).  Snapshots are skipped until
-they have a timestamp >= *Nfirst*\ .  When a snapshot with a timestamp >
-*Nlast* is encountered, the rerun command finishes.  Note below that
+they have a timestep >= *Nfirst*\ .  When a snapshot with a timestep >
+*Nlast* is encountered, the rerun command finishes.  Note that
 the defaults for *first* and *last* are to read all snapshots.  If the
 *every* keyword is set to a value > 0, then only snapshots with
-timestamps that are a multiple of *Nevery* are read (the first
+timesteps that are a multiple of *Nevery* are read (the first
 snapshot is always read).  If *Nevery* = 0, then this criterion is
 ignored, i.e. every snapshot is read that meets the other criteria.
 If the *skip* keyword is used, then after the first snapshot is read,
 every Nth snapshot is read, where N = *Nskip*\ .  E.g. if *Nskip* = 3,
 then only 1 out of every 3 snapshots is read, assuming the snapshot
-timestamp is also consistent with the other criteria.
+timestep is also consistent with the other criteria.
+
+.. note::
+
+   Not all dump formats contain the timestep and not all dump readers
+   support reading it.  In that case individual snapshots are assigned
+   consecutive timestep numbers starting at 1.
+
 
 The *start* and *stop* keywords do not affect which snapshots are read
 from the dump file(s).  Rather, they have the same meaning that they
@@ -205,9 +213,8 @@ thermodynamic output or new dump file output.
 Restrictions
 """"""""""""
 
-To read gzipped dump files, you must compile LAMMPS with the
--DLAMMPS_GZIP option.  See the :doc:`Build settings <Build_settings>`
-doc page for details.
+The *rerun* command is subject to all restrictions of
+the :doc:`read_dump <read_dump>` command.
 
 Related commands
 """"""""""""""""

From d55eeefc323792f0bd1561a1b835ee68742f18b6 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Mon, 9 Nov 2020 13:47:27 -0500
Subject: [PATCH 46/64] Undo change in library interface breaking compatibility

---
 src/library.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/library.cpp b/src/library.cpp
index 2fd1486bc2..074cb3cffa 100644
--- a/src/library.cpp
+++ b/src/library.cpp
@@ -1976,7 +1976,7 @@ void lammps_gather_atoms(void *handle, char *name, int type, int count, void *da
     // use atom ID to insert each atom's values into copy
     // MPI_Allreduce with MPI_SUM to merge into data, ordered by atom ID
 
-    if (type == LAMMPS_INT) {
+    if (type == 0) {
       int *vector = nullptr;
       int **array = nullptr;
       const int imgunpack = (count == 3) && (strcmp(name,"image") == 0);
@@ -2015,7 +2015,7 @@ void lammps_gather_atoms(void *handle, char *name, int type, int count, void *da
       MPI_Allreduce(copy,data,count*natoms,MPI_INT,MPI_SUM,lmp->world);
       lmp->memory->destroy(copy);
 
-    } else if (type == LAMMPS_DOUBLE) {
+    } else if (type == 1) {
       double *vector = nullptr;
       double **array = nullptr;
       if (count == 1) vector = (double *) vptr;

From df672fe7d4f8e0d229ef2e72f12f38cb75ad005a Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Mon, 9 Nov 2020 12:42:12 -0700
Subject: [PATCH 47/64] Correcting indentation issue in pair_spin_dmi.cpp

---
 src/SPIN/pair_spin_dmi.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/SPIN/pair_spin_dmi.cpp b/src/SPIN/pair_spin_dmi.cpp
index d7b7d1b3d9..69a9873303 100644
--- a/src/SPIN/pair_spin_dmi.cpp
+++ b/src/SPIN/pair_spin_dmi.cpp
@@ -257,16 +257,15 @@ void PairSpinDmi::compute(int eflag, int vflag)
         f[i][0] += fi[0];
         f[i][1] += fi[1];
         f[i][2] += fi[2];
-          if (newton_pair || j < nlocal) {
-            f[j][0] -= fi[0];
-            f[j][1] -= fi[1];
-            f[j][2] -= fi[2];
-          }
+        if (newton_pair || j < nlocal) {
+          f[j][0] -= fi[0];
+          f[j][1] -= fi[1];
+          f[j][2] -= fi[2];
+        }
         fm[i][0] += fmi[0];
         fm[i][1] += fmi[1];
         fm[i][2] += fmi[2];
 
-
         if (evflag) ev_tally_xyz(i,j,nlocal,newton_pair,
             evdwl,ecoul,fi[0],fi[1],fi[2],delx,dely,delz);
       }

From 2acb0aaedd435390565c65093bc556f8aff7ad5f Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 10 Nov 2020 05:00:56 -0500
Subject: [PATCH 48/64] fix typo that has propagated to multiple pair style doc
 files

---
 doc/src/pair_atm.rst         | 2 +-
 doc/src/pair_buck_long.rst   | 2 +-
 doc/src/pair_gayberne.rst    | 2 +-
 doc/src/pair_lcbop.rst       | 9 +++++----
 doc/src/pair_polymorphic.rst | 2 +-
 doc/src/pair_resquared.rst   | 2 +-
 doc/src/pair_srp.rst         | 2 +-
 7 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/doc/src/pair_atm.rst b/doc/src/pair_atm.rst
index c0dfb64c50..0bdfecd517 100644
--- a/doc/src/pair_atm.rst
+++ b/doc/src/pair_atm.rst
@@ -143,7 +143,7 @@ combinations, else an error will result.
 Mixing, shift, table, tail correction, restart, rRESPA info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-This pair styles do not support the :doc:`pair_modify <pair_modify>`
+This pair style does not support the :doc:`pair_modify <pair_modify>`
 mix, shift, table, and tail options.
 
 This pair style writes its information to :doc:`binary restart files
diff --git a/doc/src/pair_buck_long.rst b/doc/src/pair_buck_long.rst
index 1883cee637..0e19873500 100644
--- a/doc/src/pair_buck_long.rst
+++ b/doc/src/pair_buck_long.rst
@@ -117,7 +117,7 @@ global Coulombic cutoff is allowed.
 Mixing, shift, table, tail correction, restart, rRESPA info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-This pair styles does not support mixing.  Thus, coefficients for all
+This pair style does not support mixing.  Thus, coefficients for all
 I,J pairs must be specified explicitly.
 
 This pair style supports the :doc:`pair_modify <pair_modify>` shift
diff --git a/doc/src/pair_gayberne.rst b/doc/src/pair_gayberne.rst
index 309e949f97..19597b9018 100644
--- a/doc/src/pair_gayberne.rst
+++ b/doc/src/pair_gayberne.rst
@@ -160,7 +160,7 @@ For atom type pairs I,J and I != J, the epsilon and sigma coefficients
 and cutoff distance for this pair style can be mixed.  The default mix
 value is *geometric*\ .  See the "pair_modify" command for details.
 
-This pair styles supports the :doc:`pair_modify <pair_modify>` shift
+This pair style supports the :doc:`pair_modify <pair_modify>` shift
 option for the energy of the Lennard-Jones portion of the pair
 interaction, but only for sphere-sphere interactions.  There is no
 shifting performed for ellipsoidal interactions due to the anisotropic
diff --git a/doc/src/pair_lcbop.rst b/doc/src/pair_lcbop.rst
index fa2d3c0609..c44ad1f8a4 100644
--- a/doc/src/pair_lcbop.rst
+++ b/doc/src/pair_lcbop.rst
@@ -75,14 +75,15 @@ This pair style can only be used via the *pair* keyword of the
 Restrictions
 """"""""""""
 
-This pair styles is part of the MANYBODY package.  It is only enabled
-if LAMMPS was built with that package.  See the :doc:`Build package <Build_package>` doc page for more info.
+This pair style is part of the MANYBODY package.  It is only enabled
+if LAMMPS was built with that package.
+See the :doc:`Build package <Build_package>` doc page for more info.
 
 This pair potential requires the :doc:`newton <newton>` setting to be
 "on" for pair interactions.
 
-The C.lcbop potential file provided with LAMMPS (see the potentials
-directory) is parameterized for metal :doc:`units <units>`.  You can use
+The ``C.lcbop`` potential file provided with LAMMPS (see the potentials
+directory) is parameterized for :doc:`metal units <units>`.  You can use
 the LCBOP potential with any LAMMPS units, but you would need to
 create your own LCBOP potential file with coefficients listed in the
 appropriate units if your simulation does not use "metal" units.
diff --git a/doc/src/pair_polymorphic.rst b/doc/src/pair_polymorphic.rst
index 04be107e02..6abe037581 100644
--- a/doc/src/pair_polymorphic.rst
+++ b/doc/src/pair_polymorphic.rst
@@ -298,7 +298,7 @@ described above.  For each of the F functions, nx values are listed.
 Mixing, shift, table, tail correction, restart, rRESPA info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-This pair styles does not support the :doc:`pair_modify <pair_modify>`
+This pair style does not support the :doc:`pair_modify <pair_modify>`
 shift, table, and tail options.
 
 This pair style does not write their information to :doc:`binary restart
diff --git a/doc/src/pair_resquared.rst b/doc/src/pair_resquared.rst
index d34588682c..0932730469 100644
--- a/doc/src/pair_resquared.rst
+++ b/doc/src/pair_resquared.rst
@@ -173,7 +173,7 @@ equation for the Hamaker constant presented here.  Mixing of sigma and
 epsilon followed by calculation of the energy prefactors using the
 equations above is recommended.
 
-This pair styles supports the :doc:`pair_modify <pair_modify>` shift
+This pair style supports the :doc:`pair_modify <pair_modify>` shift
 option for the energy of the Lennard-Jones portion of the pair
 interaction, but only for sphere-sphere interactions.  There is no
 shifting performed for ellipsoidal interactions due to the anisotropic
diff --git a/doc/src/pair_srp.rst b/doc/src/pair_srp.rst
index 620c74d515..59df8be1e2 100644
--- a/doc/src/pair_srp.rst
+++ b/doc/src/pair_srp.rst
@@ -124,7 +124,7 @@ at the cutoff distance :math:`r_c`.
 Mixing, shift, table, tail correction, restart, rRESPA info
 """""""""""""""""""""""""""""""""""""""""""""""""""""""""""
 
-This pair styles does not support mixing.
+This pair style does not support mixing.
 
 This pair style does not support the :doc:`pair_modify <pair_modify>`
 shift option for the energy of the pair interaction. Note that as

From 5aae2cb44ded9af4596ca10505e9da130747cc48 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Tue, 10 Nov 2020 14:03:16 +0100
Subject: [PATCH 49/64] Fix typo in Howto Walls

---
 doc/src/Howto_walls.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/src/Howto_walls.rst b/doc/src/Howto_walls.rst
index 4d35cd66b3..6e3e22a3f0 100644
--- a/doc/src/Howto_walls.rst
+++ b/doc/src/Howto_walls.rst
@@ -67,5 +67,5 @@ rotate.
 
 The only frictional idealized walls currently in LAMMPS are flat or
 curved surfaces specified by the :doc:`fix wall/gran <fix_wall_gran>`
-command.  At some point we plan to allow regoin surfaces to be used as
+command.  At some point we plan to allow region surfaces to be used as
 frictional walls, as well as triangulated surfaces.

From eae9fea02615b0aaba3e0b92350e78e70f302e94 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Tue, 10 Nov 2020 14:04:49 +0100
Subject: [PATCH 50/64] Consistently use timesteps instead of time-steps

---
 doc/src/atc_output.rst          | 2 +-
 doc/src/fix_filter_corotate.rst | 2 +-
 doc/src/fix_rx.rst              | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/src/atc_output.rst b/doc/src/atc_output.rst
index 3fb1491be1..5003817daa 100644
--- a/doc/src/atc_output.rst
+++ b/doc/src/atc_output.rst
@@ -14,7 +14,7 @@ Syntax
 * AtC fixID = ID of :doc:`fix atc <fix_atc>` instance
 * *output* or *output index* = name of the AtC sub-command
 * filename_prefix = prefix for data files (for *output*)
-* frequency = frequency of output in time-steps (for *output*)
+* frequency = frequency of output in timesteps (for *output*)
 * optional keywords for *output*:
 
   - text = creates text output of index, step and nodal variable values for unique nodes
diff --git a/doc/src/fix_filter_corotate.rst b/doc/src/fix_filter_corotate.rst
index ee608e5361..e33fc0ac4a 100644
--- a/doc/src/fix_filter_corotate.rst
+++ b/doc/src/fix_filter_corotate.rst
@@ -56,7 +56,7 @@ is slightly modified only for the computation of long-range forces. A
 good cluster decomposition constitutes in building clusters which
 contain the fastest covalent bonds inside clusters.
 
-If the clusters are chosen suitably, the :doc:`run_style respa <run_style>` is stable for outer time-steps of at least 8fs.
+If the clusters are chosen suitably, the :doc:`run_style respa <run_style>` is stable for outer timesteps of at least 8fs.
 
 ----------
 
diff --git a/doc/src/fix_rx.rst b/doc/src/fix_rx.rst
index c1a1d0950c..9eab06ffad 100644
--- a/doc/src/fix_rx.rst
+++ b/doc/src/fix_rx.rst
@@ -90,10 +90,10 @@ accepted, *h* is increased by a proportional amount, and the next ODE step is be
 Otherwise, *h* is shrunk and the ODE step is repeated.
 
 Run-time diagnostics are available for the rkf45 ODE solver. The frequency
-(in time-steps) that diagnostics are reported is controlled by the last (optional)
+(in timesteps) that diagnostics are reported is controlled by the last (optional)
 12th argument. A negative frequency means that diagnostics are reported once at the
 end of each run. A positive value N means that the diagnostics are reported once
-per N time-steps.
+per N timesteps.
 
 The diagnostics report the average # of integrator steps and RHS function evaluations
 and run-time per ODE as well as the average/RMS/min/max per process. If the

From ad56e0ca9ff75b7129c1386dc615e490aefcb6f6 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Tue, 10 Nov 2020 14:16:12 +0100
Subject: [PATCH 51/64] Fix casing of the word

---
 .github/CONTRIBUTING.md            | 2 +-
 doc/github-development-workflow.md | 2 +-
 doc/src/Howto_github.rst           | 4 ++--
 lib/kokkos/README.md               | 4 ++--
 lib/quip/README                    | 2 +-
 lib/scafacos/README                | 2 +-
 src/USER-PLUMED/README             | 2 +-
 tools/replica/reorder_remd_traj.py | 2 +-
 8 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 60fe82d86c..62e7186360 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -108,7 +108,7 @@ For bug reports, the next step is that one of the core LAMMPS developers will se
 
 For submitting pull requests, there is a [detailed tutorial](https://lammps.sandia.gov/doc/Howto_github.html) in the LAMMPS manual. Thus only a brief breakdown of the steps is presented here. Please note, that the LAMMPS developers are still reviewing and trying to improve the process. If you are unsure about something, do not hesitate to post a question on the lammps-users mailing list or contact one fo the core LAMMPS developers.
 Immediately after the submission, the LAMMPS continuing integration server at ci.lammps.org will download your submitted branch and perform a simple compilation test, i.e. will test whether your submitted code can be compiled under various conditions. It will also do a check on whether your included documentation translates cleanly. Whether these tests are successful or fail will be recorded. If a test fails, please inspect the corresponding output on the CI server and take the necessary steps, if needed, so that the code can compile cleanly again. The test will be re-run each the pull request is updated with a push to the remote branch on GitHub.
-Next a LAMMPS core developer will self-assign and do an overall technical assessment of the submission. If you are not yet registered as a LAMMPS collaborator, you will receive an invitation for that. As part of the assesment, the pull request will be categorized with labels. There are two special labels: `needs_work` (indicates that work from the submitter of the pull request is needed) and `work_in_progress` (indicates, that the assigned LAMMPS developer will make changes, if not done by the contributor who made the submit). 
+Next a LAMMPS core developer will self-assign and do an overall technical assessment of the submission. If you are not yet registered as a LAMMPS collaborator, you will receive an invitation for that. As part of the assessment, the pull request will be categorized with labels. There are two special labels: `needs_work` (indicates that work from the submitter of the pull request is needed) and `work_in_progress` (indicates, that the assigned LAMMPS developer will make changes, if not done by the contributor who made the submit). 
 You may also receive comments and suggestions on the overall submission or specific details and on occasion specific requests for changes as part of the review. If permitted, also additional changes may be pushed into your pull request branch or a pull request may be filed in your LAMMPS fork on GitHub to include those changes.
 The LAMMPS developer may then decide to assign the pull request to another developer (e.g. when that developer is more knowledgeable about the submitted feature or enhancement or has written the modified code). It may also happen, that additional developers are requested to provide a review and approve the changes. For submissions, that may change the general behavior of LAMMPS, or where a possibility of unwanted side effects exists, additional tests may be requested by the assigned developer.
 If the assigned developer is satisfied and considers the submission ready for inclusion into LAMMPS, the pull request will receive approvals and be merged into the master branch by one of the core LAMMPS developers. After the pull request is merged, you may delete the feature branch used for the pull request in your personal LAMMPS fork.
diff --git a/doc/github-development-workflow.md b/doc/github-development-workflow.md
index a7d41dd32a..503a33be4e 100644
--- a/doc/github-development-workflow.md
+++ b/doc/github-development-workflow.md
@@ -95,7 +95,7 @@ on the pull request discussion page on GitHub, so that other developers
 can later review the entire discussion after the fact and understand the
 rationale behind choices made.  Exceptions to this policy are technical
 discussions, that are centered on tools or policies themselves
-(git, github, c++) rather than on the content of the pull request.
+(git, GitHub, c++) rather than on the content of the pull request.
 
 ### Checklist for Pull Requests
 
diff --git a/doc/src/Howto_github.rst b/doc/src/Howto_github.rst
index 63cb8945e8..6303feb407 100644
--- a/doc/src/Howto_github.rst
+++ b/doc/src/Howto_github.rst
@@ -72,7 +72,7 @@ explained in more detail here: `feature branch workflow <https://www.atlassian.c
 
 **Feature branches**
 
-First of all, create a clone of your version on github on your local
+First of all, create a clone of your version on GitHub on your local
 machine via HTTPS:
 
 .. code-block:: bash
@@ -155,7 +155,7 @@ useful message that explains the change.
 
 .. code-block:: bash
 
-     $ git commit -m 'Finally updated the github tutorial'
+     $ git commit -m 'Finally updated the GitHub tutorial'
 
 After the commit, the changes can be pushed to the same branch on GitHub:
 
diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md
index a08d238e5d..f9facbe96d 100644
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@@ -18,7 +18,7 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
 
 A programming guide can be found on the Wiki, the API reference is under development.
 
-For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
+For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue.
 
 For non-public questions send an email to
 crtrott(at)sandia.gov
@@ -44,7 +44,7 @@ To learn more about Kokkos consider watching one of our presentations:
 We are open and try to encourage contributions from external developers.
 To do so please first open an issue describing the contribution and then issue
 a pull request against the develop branch. For larger features it may be good
-to get guidance from the core development team first through the github issue.
+to get guidance from the core development team first through the GitHub issue.
 
 Note that Kokkos Core is licensed under standard 3-clause BSD terms of use.
 Which means contributing to Kokkos allows anyone else to use your contributions
diff --git a/lib/quip/README b/lib/quip/README
index e6cc3903bd..bf316d036a 100644
--- a/lib/quip/README
+++ b/lib/quip/README
@@ -17,7 +17,7 @@ Building LAMMPS with QUIP support:
 1) Building QUIP
 1.1) Obtaining QUIP
 
-The most current release of QUIP can be obtained from github:
+The most current release of QUIP can be obtained from GitHub:
 
 $ git clone https://github.com/libAtoms/QUIP.git QUIP
 
diff --git a/lib/scafacos/README b/lib/scafacos/README
index 86335d9f98..9d202d704b 100644
--- a/lib/scafacos/README
+++ b/lib/scafacos/README
@@ -3,7 +3,7 @@ is required to use the KSPACE scafacos and its kspace_style
 scafacos command in a LAMMPS input script.
 
 The ScaFaCoS library is available at http://scafacos.de or
-on github at https://github.com/scafacos, the library was
+on GitHub at https://github.com/scafacos, the library was
 developed by a consortium of different universities in
 Germany (Bonn, Chemnitz, Stuttgart, Wuppertal) and
 the Research Centre Juelich (Juelich Supercomputing Centre).
diff --git a/src/USER-PLUMED/README b/src/USER-PLUMED/README
index f46b2cd9bd..ed166cda90 100644
--- a/src/USER-PLUMED/README
+++ b/src/USER-PLUMED/README
@@ -30,7 +30,7 @@ even if PLUMED is not in the path if as long as the input does not contain a fix
 plumed command.
 
 If you wish to statically link PLUMED you must download PLUMED to the /lib/plumed directory before compiling LAMMPS.  You can
-download a tar ball into that directory or you can clone the plumed2 repository from github there.  Once you have created a
+download a tar ball into that directory or you can clone the plumed2 repository from GitHub there.  Once you have created a
 directory containing a distribution of PLUMED within /lib/plumed you then must build PLUMED within that directory by issuing
 the usual commands.  It is worth noting that we have provided a script that will download and build PLUMED for you with
 a minimal set of options.  To run this script you need to issue the following command:
diff --git a/tools/replica/reorder_remd_traj.py b/tools/replica/reorder_remd_traj.py
index 5033ae1e53..ff525c06b2 100644
--- a/tools/replica/reorder_remd_traj.py
+++ b/tools/replica/reorder_remd_traj.py
@@ -325,7 +325,7 @@ def get_canonical_logw(enefn, frametuple_dict, temps, nprod, writefreq,
               pip install --user pymbar
               sudo pip install pymbar
 
-              To install the dev. version directly from github, use:
+              To install the dev. version directly from GitHub, use:
               pip install pip install git+https://github.com/choderalab/pymbar.git
               """)
 

From 4be2a99977bb4620b8494c3d1f59af902b9a57aa Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Tue, 10 Nov 2020 14:20:52 +0100
Subject: [PATCH 52/64] Fix casing of the word GitHub

---
 .github/CONTRIBUTING.md            |   2 +-
 doc/github-development-workflow.md |   2 +-
 doc/src/Howto_github.rst           |   4 +-
 lib/kokkos/README.md               |   4 +-
 lib/quip/README                    |   2 +-
 lib/scafacos/README                |   2 +-
 src/USER-PLUMED/README             |   2 +-
 tools/replica/reorder_remd_traj.py | 231 +++++++++++++++--------------
 8 files changed, 131 insertions(+), 118 deletions(-)

diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 60fe82d86c..62e7186360 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -108,7 +108,7 @@ For bug reports, the next step is that one of the core LAMMPS developers will se
 
 For submitting pull requests, there is a [detailed tutorial](https://lammps.sandia.gov/doc/Howto_github.html) in the LAMMPS manual. Thus only a brief breakdown of the steps is presented here. Please note, that the LAMMPS developers are still reviewing and trying to improve the process. If you are unsure about something, do not hesitate to post a question on the lammps-users mailing list or contact one fo the core LAMMPS developers.
 Immediately after the submission, the LAMMPS continuing integration server at ci.lammps.org will download your submitted branch and perform a simple compilation test, i.e. will test whether your submitted code can be compiled under various conditions. It will also do a check on whether your included documentation translates cleanly. Whether these tests are successful or fail will be recorded. If a test fails, please inspect the corresponding output on the CI server and take the necessary steps, if needed, so that the code can compile cleanly again. The test will be re-run each the pull request is updated with a push to the remote branch on GitHub.
-Next a LAMMPS core developer will self-assign and do an overall technical assessment of the submission. If you are not yet registered as a LAMMPS collaborator, you will receive an invitation for that. As part of the assesment, the pull request will be categorized with labels. There are two special labels: `needs_work` (indicates that work from the submitter of the pull request is needed) and `work_in_progress` (indicates, that the assigned LAMMPS developer will make changes, if not done by the contributor who made the submit). 
+Next a LAMMPS core developer will self-assign and do an overall technical assessment of the submission. If you are not yet registered as a LAMMPS collaborator, you will receive an invitation for that. As part of the assessment, the pull request will be categorized with labels. There are two special labels: `needs_work` (indicates that work from the submitter of the pull request is needed) and `work_in_progress` (indicates, that the assigned LAMMPS developer will make changes, if not done by the contributor who made the submit). 
 You may also receive comments and suggestions on the overall submission or specific details and on occasion specific requests for changes as part of the review. If permitted, also additional changes may be pushed into your pull request branch or a pull request may be filed in your LAMMPS fork on GitHub to include those changes.
 The LAMMPS developer may then decide to assign the pull request to another developer (e.g. when that developer is more knowledgeable about the submitted feature or enhancement or has written the modified code). It may also happen, that additional developers are requested to provide a review and approve the changes. For submissions, that may change the general behavior of LAMMPS, or where a possibility of unwanted side effects exists, additional tests may be requested by the assigned developer.
 If the assigned developer is satisfied and considers the submission ready for inclusion into LAMMPS, the pull request will receive approvals and be merged into the master branch by one of the core LAMMPS developers. After the pull request is merged, you may delete the feature branch used for the pull request in your personal LAMMPS fork.
diff --git a/doc/github-development-workflow.md b/doc/github-development-workflow.md
index a7d41dd32a..c34a67dfcf 100644
--- a/doc/github-development-workflow.md
+++ b/doc/github-development-workflow.md
@@ -95,7 +95,7 @@ on the pull request discussion page on GitHub, so that other developers
 can later review the entire discussion after the fact and understand the
 rationale behind choices made.  Exceptions to this policy are technical
 discussions, that are centered on tools or policies themselves
-(git, github, c++) rather than on the content of the pull request.
+(git, GitHub, c++) rather than on the content of the pull request.
 
 ### Checklist for Pull Requests
 
diff --git a/doc/src/Howto_github.rst b/doc/src/Howto_github.rst
index 63cb8945e8..311d716f18 100644
--- a/doc/src/Howto_github.rst
+++ b/doc/src/Howto_github.rst
@@ -72,7 +72,7 @@ explained in more detail here: `feature branch workflow <https://www.atlassian.c
 
 **Feature branches**
 
-First of all, create a clone of your version on github on your local
+First of all, create a clone of your version on GitHub on your local
 machine via HTTPS:
 
 .. code-block:: bash
@@ -155,7 +155,7 @@ useful message that explains the change.
 
 .. code-block:: bash
 
-     $ git commit -m 'Finally updated the github tutorial'
+     $ git commit -m 'Finally updated the GitHub tutorial'
 
 After the commit, the changes can be pushed to the same branch on GitHub:
 
diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md
index a08d238e5d..f820b7be10 100644
--- a/lib/kokkos/README.md
+++ b/lib/kokkos/README.md
@@ -18,7 +18,7 @@ profiling and debugging tools (https://github.com/kokkos/kokkos-tools).
 
 A programming guide can be found on the Wiki, the API reference is under development.
 
-For questions find us on Slack: https://kokkosteam.slack.com or open a github issue.
+For questions find us on Slack: https://kokkosteam.slack.com or open a GitHub issue.
 
 For non-public questions send an email to
 crtrott(at)sandia.gov
@@ -44,7 +44,7 @@ To learn more about Kokkos consider watching one of our presentations:
 We are open and try to encourage contributions from external developers.
 To do so please first open an issue describing the contribution and then issue
 a pull request against the develop branch. For larger features it may be good
-to get guidance from the core development team first through the github issue.
+to get guidance from the core development team first through the GitHub issue.
 
 Note that Kokkos Core is licensed under standard 3-clause BSD terms of use.
 Which means contributing to Kokkos allows anyone else to use your contributions
diff --git a/lib/quip/README b/lib/quip/README
index e6cc3903bd..5e737db11d 100644
--- a/lib/quip/README
+++ b/lib/quip/README
@@ -17,7 +17,7 @@ Building LAMMPS with QUIP support:
 1) Building QUIP
 1.1) Obtaining QUIP
 
-The most current release of QUIP can be obtained from github:
+The most current release of QUIP can be obtained from GitHub:
 
 $ git clone https://github.com/libAtoms/QUIP.git QUIP
 
diff --git a/lib/scafacos/README b/lib/scafacos/README
index 86335d9f98..c8181ac7ae 100644
--- a/lib/scafacos/README
+++ b/lib/scafacos/README
@@ -3,7 +3,7 @@ is required to use the KSPACE scafacos and its kspace_style
 scafacos command in a LAMMPS input script.
 
 The ScaFaCoS library is available at http://scafacos.de or
-on github at https://github.com/scafacos, the library was
+on GitHub at https://github.com/scafacos, the library was
 developed by a consortium of different universities in
 Germany (Bonn, Chemnitz, Stuttgart, Wuppertal) and
 the Research Centre Juelich (Juelich Supercomputing Centre).
diff --git a/src/USER-PLUMED/README b/src/USER-PLUMED/README
index f46b2cd9bd..31910bc951 100644
--- a/src/USER-PLUMED/README
+++ b/src/USER-PLUMED/README
@@ -30,7 +30,7 @@ even if PLUMED is not in the path if as long as the input does not contain a fix
 plumed command.
 
 If you wish to statically link PLUMED you must download PLUMED to the /lib/plumed directory before compiling LAMMPS.  You can
-download a tar ball into that directory or you can clone the plumed2 repository from github there.  Once you have created a
+download a tar ball into that directory or you can clone the plumed2 repository from GitHub there.  Once you have created a
 directory containing a distribution of PLUMED within /lib/plumed you then must build PLUMED within that directory by issuing
 the usual commands.  It is worth noting that we have provided a script that will download and build PLUMED for you with
 a minimal set of options.  To run this script you need to issue the following command:
diff --git a/tools/replica/reorder_remd_traj.py b/tools/replica/reorder_remd_traj.py
index 5033ae1e53..6eee4770ab 100644
--- a/tools/replica/reorder_remd_traj.py
+++ b/tools/replica/reorder_remd_traj.py
@@ -37,13 +37,17 @@ StringIO (or io if in Python 3.x)
 """
 
 
-
-import os, numpy as np, argparse, time, pickle
+import os
+import numpy as np
+import argparse
+import time
+import pickle
 from scipy.special import logsumexp
 from mpi4py import MPI
 
 from tqdm import tqdm
-import gzip, bz2
+import gzip
+import bz2
 try:
     # python-2
     from StringIO import StringIO as IOBuffer
@@ -52,12 +56,11 @@ except ImportError:
     from io import BytesIO as IOBuffer
 
 
-
 #### INITIALIZE MPI ####
 # (note that all output on screen will be printed only on the ROOT proc)
 ROOT = 0
 comm = MPI.COMM_WORLD
-me = comm.rank # my proc id
+me = comm.rank  # my proc id
 nproc = comm.size
 
 
@@ -77,7 +80,8 @@ def _get_nearest_temp(temps, query_temp):
     out_temp: nearest temp from the list
     """
 
-    if isinstance(temps, list): temps = np.array(temps)
+    if isinstance(temps, list):
+        temps = np.array(temps)
     return temps[np.argmin(np.abs(temps-query_temp))]
 
 
@@ -95,10 +99,10 @@ def readwrite(trajfn, mode):
 
     if trajfn.endswith(".gz"):
         of = gzip.open(trajfn, mode)
-        #return gzip.GzipFile(trajfn, mode)
+        # return gzip.GzipFile(trajfn, mode)
     elif trajfn.endswith(".bz2"):
         of = bz2.open(trajfn, mode)
-        #return bz2.BZ2File(trajfn, mode)
+        # return bz2.BZ2File(trajfn, mode)
     else:
         of = open(trajfn, mode)
     return of
@@ -123,8 +127,8 @@ def get_replica_frames(logfn, temps, nswap, writefreq):
     """
 
     n_rep = len(temps)
-    swap_history = np.loadtxt(logfn, skiprows = 3)
-    master_frametuple_dict = dict( (n, []) for n in range(n_rep) )
+    swap_history = np.loadtxt(logfn, skiprows=3)
+    master_frametuple_dict = dict((n, []) for n in range(n_rep))
 
     # walk through the replicas
     print("Getting frames from all replicas at temperature:")
@@ -136,15 +140,15 @@ def get_replica_frames(logfn, temps, nswap, writefreq):
         if writefreq <= nswap:
             for ii, i in enumerate(rep_inds[:-1]):
                 start = int(ii * nswap / writefreq)
-                stop = int( (ii+1) * nswap / writefreq)
-                [master_frametuple_dict[n].append( (i,x) ) \
-                                        for x in range(start, stop)]
+                stop = int((ii+1) * nswap / writefreq)
+                [master_frametuple_dict[n].append((i, x))
+                 for x in range(start, stop)]
 
         # case-2: when temps. are swapped faster than dumping frames
         else:
             nskip = int(writefreq / nswap)
-            [master_frametuple_dict[n].append( (i,ii) ) \
-            for ii, i in enumerate(rep_inds[0::nskip])]
+            [master_frametuple_dict[n].append((i, ii))
+             for ii, i in enumerate(rep_inds[0::nskip])]
 
     return master_frametuple_dict
 
@@ -161,11 +165,12 @@ def get_byte_index(rep_inds, byteindfns, intrajfns):
     """
     for n in rep_inds:
         # check if the byte indices for this traj has already been computed
-        if os.path.isfile(byteindfns[n]): continue
+        if os.path.isfile(byteindfns[n]):
+            continue
 
         # extract bytes
         fobj = readwrite(intrajfns[n], "rb")
-        byteinds = [ [0,0] ]
+        byteinds = [[0, 0]]
 
         # place file pointer at first line
         nframe = 0
@@ -175,33 +180,37 @@ def get_byte_index(rep_inds, byteindfns, intrajfns):
         # status printed only for replica read on root proc
         # this assumes that each proc takes roughly the same time
         if me == ROOT:
-            pb = tqdm(desc = "Reading replicas", leave = True,
-                  position = ROOT + 2*me,
-                  unit = "B/replica", unit_scale = True,
-                  unit_divisor = 1024)
+            pb = tqdm(desc="Reading replicas", leave=True,
+                      position=ROOT + 2*me,
+                      unit="B/replica", unit_scale=True,
+                      unit_divisor=1024)
 
         # start crawling through the bytes
         while True:
             next_line = fobj.readline()
-            if len(next_line) == 0: break
+            if len(next_line) == 0:
+                break
             # this will only work with lammpstrj traj format.
             # this condition essentially checks periodic recurrences
             # of the token TIMESTEP. Each time it is found,
             # we have crawled through a frame (snapshot)
             if next_line == first_line:
                 nframe += 1
-                byteinds.append( [nframe, cur_pos] )
-                if me == ROOT: pb.update()
+                byteinds.append([nframe, cur_pos])
+                if me == ROOT:
+                    pb.update()
             cur_pos = fobj.tell()
-            if me == ROOT: pb.update(0)
-        if me == ROOT: pb.close()
+            if me == ROOT:
+                pb.update(0)
+        if me == ROOT:
+            pb.close()
 
         # take care of the EOF
         cur_pos = fobj.tell()
-        byteinds.append( [nframe+1, cur_pos] ) # dummy index for the EOF
+        byteinds.append([nframe+1, cur_pos])  # dummy index for the EOF
 
         # write to file
-        np.savetxt(byteindfns[n], np.array(byteinds), fmt = "%d")
+        np.savetxt(byteindfns[n], np.array(byteinds), fmt="%d")
 
         # close the trajfile object
         fobj.close()
@@ -247,15 +256,15 @@ def write_reordered_traj(temp_inds, byte_inds, outtemps, temps,
         of = readwrite(outtrajfns[n], "wb")
 
         # get frames
-        abs_temp_ind = np.argmin( abs(temps - outtemps[n]) )
+        abs_temp_ind = np.argmin(abs(temps - outtemps[n]))
         frametuple = frametuple_dict[abs_temp_ind][-nframes:]
 
         # write frames to buffer
         if me == ROOT:
             pb = tqdm(frametuple,
-                  desc = ("Buffering trajectories for writing"),
-                  leave = True, position = ROOT + 2*me,
-                  unit = 'frame/replica', unit_scale = True)
+                      desc=("Buffering trajectories for writing"),
+                      leave=True, position=ROOT + 2*me,
+                      unit='frame/replica', unit_scale=True)
 
             iterable = pb
         else:
@@ -263,20 +272,23 @@ def write_reordered_traj(temp_inds, byte_inds, outtemps, temps,
 
         for i, (rep, frame) in enumerate(iterable):
             infobj = infobjs[rep]
-            start_ptr = int(byte_inds[rep][frame,1])
-            stop_ptr = int(byte_inds[rep][frame+1,1])
+            start_ptr = int(byte_inds[rep][frame, 1])
+            stop_ptr = int(byte_inds[rep][frame+1, 1])
             byte_len = stop_ptr - start_ptr
             infobj.seek(start_ptr)
             buf.write(infobj.read(byte_len))
-        if me == ROOT: pb.close()
+        if me == ROOT:
+            pb.close()
 
         # write buffer to disk
-        if me == ROOT: print("Writing buffer to file")
+        if me == ROOT:
+            print("Writing buffer to file")
         of.write(buf.getvalue())
         of.close()
         buf.close()
 
-    for i in infobjs: i.close()
+    for i in infobjs:
+        i.close()
 
     return
 
@@ -325,13 +337,13 @@ def get_canonical_logw(enefn, frametuple_dict, temps, nprod, writefreq,
               pip install --user pymbar
               sudo pip install pymbar
 
-              To install the dev. version directly from github, use:
+              To install the dev. version directly from GitHub, use:
               pip install pip install git+https://github.com/choderalab/pymbar.git
               """)
 
     u_rn = np.loadtxt(enefn)
-    ntemps = u_rn.shape[0] # number of temps.
-    nframes = int(nprod / writefreq) # number of frames at each temp.
+    ntemps = u_rn.shape[0]  # number of temps.
+    nframes = int(nprod / writefreq)  # number of frames at each temp.
 
     # reorder the temps
     u_kn = np.zeros([ntemps, nframes], float)
@@ -341,91 +353,90 @@ def get_canonical_logw(enefn, frametuple_dict, temps, nprod, writefreq,
             u_kn[k, i] = u_rn[rep, frame]
 
     # prep input for pymbar
-    #1) array of frames at each temp.
+    # 1) array of frames at each temp.
     nframes_k = nframes * np.ones(ntemps, np.uint8)
 
-    #2) inverse temps. for chosen energy scale
+    # 2) inverse temps. for chosen energy scale
     beta_k = 1.0 / (kB * temps)
 
-    #3) get reduced energies (*ONLY FOR THE CANONICAL ENSEMBLE*)
+    # 3) get reduced energies (*ONLY FOR THE CANONICAL ENSEMBLE*)
     u_kln = np.zeros([ntemps, ntemps, nframes], float)
     for k in range(ntemps):
         u_kln[k] = np.outer(beta_k, u_kn[k])
 
     # run pymbar and extract the free energies
     print("\nRunning pymbar...")
-    mbar = pymbar.mbar.MBAR(u_kln, nframes_k, verbose = True)
-    f_k = mbar.f_k # (1 x k array)
+    mbar = pymbar.mbar.MBAR(u_kln, nframes_k, verbose=True)
+    f_k = mbar.f_k  # (1 x k array)
 
     # calculate the log-weights
     print("\nExtracting log-weights...")
     log_nframes = np.log(nframes)
-    logw = dict( (k, np.zeros([ntemps, nframes], float)) for k in range(ntemps) )
+    logw = dict((k, np.zeros([ntemps, nframes], float)) for k in range(ntemps))
     # get log-weights to reweight to this temp.
     for k in range(ntemps):
         for n in range(nframes):
-            num = -beta_k[k] * u_kn[k,n]
-            denom = f_k - beta_k[k] * u_kn[k,n]
+            num = -beta_k[k] * u_kn[k, n]
+            denom = f_k - beta_k[k] * u_kn[k, n]
             for l in range(ntemps):
-                logw[l][k,n] = num - logsumexp(denom) - log_nframes
+                logw[l][k, n] = num - logsumexp(denom) - log_nframes
 
     return logw
 
 
-
 #### MAIN WORKFLOW ####
 if __name__ == "__main__":
     # accept user inputs
-    parser = argparse.ArgumentParser(description = __doc__,
-             formatter_class = argparse.RawDescriptionHelpFormatter)
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.RawDescriptionHelpFormatter)
 
     parser.add_argument("prefix",
-                        help = "Prefix of REMD LAMMPS trajectories.\
+                        help="Prefix of REMD LAMMPS trajectories.\
                         Supply full path. Trajectories assumed to be named as \
                         <prefix>.%%d.lammpstrj. \
                         Can be in compressed (.gz or .bz2) format. \
                         This is a required argument")
 
-    parser.add_argument("-logfn", "--logfn", default = "log.lammps",
-                        help = "LAMMPS log file that contains swap history \
+    parser.add_argument("-logfn", "--logfn", default="log.lammps",
+                        help="LAMMPS log file that contains swap history \
                         of temperatures among replicas. \
                         Default = 'lammps.log'")
 
-    parser.add_argument("-tfn", "--tempfn", default = "temps.txt",
-                        help = "ascii file (readable by numpy.loadtxt) with \
+    parser.add_argument("-tfn", "--tempfn", default="temps.txt",
+                        help="ascii file (readable by numpy.loadtxt) with \
                         the temperatures used in the REMD simulation.")
 
-    parser.add_argument("-ns", "--nswap", type = int,
-                        help = "Swap frequency used in LAMMPS temper command")
+    parser.add_argument("-ns", "--nswap", type=int,
+                        help="Swap frequency used in LAMMPS temper command")
 
-    parser.add_argument("-nw", "--nwrite", type = int, default = 1,
-                        help = "Trajectory writing frequency used \
+    parser.add_argument("-nw", "--nwrite", type=int, default=1,
+                        help="Trajectory writing frequency used \
                         in LAMMPS dump command")
 
-    parser.add_argument("-np", "--nprod", type = int, default = 0,
-                        help = "Number of timesteps to save in the reordered\
+    parser.add_argument("-np", "--nprod", type=int, default=0,
+                        help="Number of timesteps to save in the reordered\
                         trajectories.\
                         This should be in units of the LAMMPS timestep")
 
-    parser.add_argument("-logw", "--logw", action = 'store_true',
-                        help = "Supplying this flag \
+    parser.add_argument("-logw", "--logw", action='store_true',
+                        help="Supplying this flag \
                         calculates *canonical* (NVT ensemble) log weights")
 
     parser.add_argument("-e", "--enefn",
-                        help = "File that has n_replica x n_frames array\
+                        help="File that has n_replica x n_frames array\
                         of total potential energies")
 
     parser.add_argument("-kB", "--boltzmann_const",
-                        type = float, default = 0.001987,
-                        help = "Boltzmann constant in appropriate units. \
+                        type=float, default=0.001987,
+                        help="Boltzmann constant in appropriate units. \
                         Default is kcal/mol")
 
-    parser.add_argument("-ot", "--out_temps", nargs = '+', type = np.float64,
-                        help = "Reorder trajectories at these temperatures.\n \
+    parser.add_argument("-ot", "--out_temps", nargs='+', type=np.float64,
+                        help="Reorder trajectories at these temperatures.\n \
                         Default is all temperatures used in the simulation")
 
-    parser.add_argument("-od", "--outdir", default = ".",
-                        help = "All output will be saved to this directory")
+    parser.add_argument("-od", "--outdir", default=".",
+                        help="All output will be saved to this directory")
 
     # parse inputs
     args = parser.parse_args()
@@ -438,14 +449,16 @@ if __name__ == "__main__":
     nprod = args.nprod
 
     enefn = args.enefn
-    if not enefn is None: enefn = os.path.abspath(enefn)
+    if not enefn is None:
+        enefn = os.path.abspath(enefn)
     get_logw = args.logw
     kB = args.boltzmann_const
 
     out_temps = args.out_temps
     outdir = os.path.abspath(args.outdir)
     if not os.path.isdir(outdir):
-        if me == ROOT: os.mkdir(outdir)
+        if me == ROOT:
+            os.mkdir(outdir)
 
     # check that all input files are present (only on the ROOT proc)
     if me == ROOT:
@@ -465,7 +478,8 @@ if __name__ == "__main__":
     for i in range(ntemps):
         this_intrajfn = intrajfns[i]
         x = this_intrajfn + ".gz"
-        if os.path.isfile(this_intrajfn): continue
+        if os.path.isfile(this_intrajfn):
+            continue
         elif os.path.isfile(this_intrajfn + ".gz"):
             intrajfns[i] = this_intrajfn + ".gz"
         elif os.path.isfile(this_intrajfn + ".bz2"):
@@ -476,42 +490,41 @@ if __name__ == "__main__":
 
     # set output filenames
     outprefix = os.path.join(outdir, traj_prefix.split('/')[-1])
-    outtrajfns = ["%s.%3.2f.lammpstrj.gz" % \
-                 (outprefix, _get_nearest_temp(temps, t)) \
+    outtrajfns = ["%s.%3.2f.lammpstrj.gz" %
+                  (outprefix, _get_nearest_temp(temps, t))
                   for t in out_temps]
-    byteindfns = [os.path.join(outdir, ".byteind_%d.gz" % k) \
+    byteindfns = [os.path.join(outdir, ".byteind_%d.gz" % k)
                   for k in range(ntemps)]
     frametuplefn = outprefix + '.frametuple.pickle'
     if get_logw:
         logwfn = outprefix + ".logw.pickle"
 
-
     # get a list of all frames at a particular temp visited by each replica
     # this is fast so run only on ROOT proc.
     master_frametuple_dict = {}
     if me == ROOT:
-        master_frametuple_dict = get_replica_frames(logfn = logfn,
-                                                    temps = temps,
-                                                    nswap = nswap,
-                                                    writefreq = writefreq)
+        master_frametuple_dict = get_replica_frames(logfn=logfn,
+                                                    temps=temps,
+                                                    nswap=nswap,
+                                                    writefreq=writefreq)
         # save to a pickle from the ROOT proc
         with open(frametuplefn, 'wb') as of:
             pickle.dump(master_frametuple_dict, of)
 
     # broadcast to all procs
-    master_frametuple_dict = comm.bcast(master_frametuple_dict, root = ROOT)
+    master_frametuple_dict = comm.bcast(master_frametuple_dict, root=ROOT)
 
     # define a chunk of replicas  to process on each proc
     CHUNKSIZE_1 = int(ntemps/nproc)
     if me < nproc - 1:
-        my_rep_inds = range( (me*CHUNKSIZE_1), (me+1)*CHUNKSIZE_1 )
+        my_rep_inds = range((me*CHUNKSIZE_1), (me+1)*CHUNKSIZE_1)
     else:
-        my_rep_inds = range( (me*CHUNKSIZE_1), ntemps )
+        my_rep_inds = range((me*CHUNKSIZE_1), ntemps)
 
     # get byte indices from replica (un-ordered) trajs. in parallel
-    get_byte_index(rep_inds = my_rep_inds,
-                   byteindfns = byteindfns,
-                   intrajfns = intrajfns)
+    get_byte_index(rep_inds=my_rep_inds,
+                   byteindfns=byteindfns,
+                   intrajfns=intrajfns)
 
     # block until all procs have finished
     comm.barrier()
@@ -520,7 +533,7 @@ if __name__ == "__main__":
     infobjs = [readwrite(i, "rb") for i in intrajfns]
 
     # open all byteindex files
-    byte_inds = dict( (i, np.loadtxt(fn)) for i, fn in enumerate(byteindfns) )
+    byte_inds = dict((i, np.loadtxt(fn)) for i, fn in enumerate(byteindfns))
 
     # define a chunk of output trajs. to process for each proc.
     # # of reordered trajs. to write may be less than the total # of replicas
@@ -536,38 +549,38 @@ if __name__ == "__main__":
     else:
         nproc_active = nproc
     if me < nproc_active-1:
-        my_temp_inds = range( (me*CHUNKSIZE_2), (me+1)*CHUNKSIZE_1 )
+        my_temp_inds = range((me*CHUNKSIZE_2), (me+1)*CHUNKSIZE_1)
     else:
-        my_temp_inds = range( (me*CHUNKSIZE_2), n_out_temps)
+        my_temp_inds = range((me*CHUNKSIZE_2), n_out_temps)
 
     # retire the excess procs
     # dont' forget to close any open file objects
     if me >= nproc_active:
-        for fobj in infobjs: fobj.close()
+        for fobj in infobjs:
+            fobj.close()
         exit()
 
     # write reordered trajectories to disk from active procs in parallel
-    write_reordered_traj(temp_inds = my_temp_inds,
-                         byte_inds = byte_inds,
-                         outtemps = out_temps, temps = temps,
-                         frametuple_dict = master_frametuple_dict,
-                         nprod = nprod, writefreq = writefreq,
-                         outtrajfns = outtrajfns,
-                         infobjs = infobjs)
+    write_reordered_traj(temp_inds=my_temp_inds,
+                         byte_inds=byte_inds,
+                         outtemps=out_temps, temps=temps,
+                         frametuple_dict=master_frametuple_dict,
+                         nprod=nprod, writefreq=writefreq,
+                         outtrajfns=outtrajfns,
+                         infobjs=infobjs)
 
     # calculate canonical log-weights if requested
     # usually this is very fast so retire all but the ROOT proc
-    if not get_logw: exit()
-    if not me == ROOT: exit()
-
-    logw = get_canonical_logw(enefn = enefn, temps = temps,
-                              frametuple_dict = master_frametuple_dict,
-                              nprod = nprod, writefreq = writefreq,
-                              kB = kB)
+    if not get_logw:
+        exit()
+    if not me == ROOT:
+        exit()
 
+    logw = get_canonical_logw(enefn=enefn, temps=temps,
+                              frametuple_dict=master_frametuple_dict,
+                              nprod=nprod, writefreq=writefreq,
+                              kB=kB)
 
     # save the logweights to a pickle
     with open(logwfn, 'wb') as of:
         pickle.dump(logw, of)
-
-

From 2c65df1bc2efd9c39aae3a3ceeca06fecf25b698 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Tue, 10 Nov 2020 16:29:02 +0100
Subject: [PATCH 53/64] Revert typo fix in python due to auto-formatter
 changing too much

---
 tools/replica/reorder_remd_traj.py | 231 ++++++++++++++---------------
 1 file changed, 109 insertions(+), 122 deletions(-)

diff --git a/tools/replica/reorder_remd_traj.py b/tools/replica/reorder_remd_traj.py
index 6eee4770ab..5033ae1e53 100644
--- a/tools/replica/reorder_remd_traj.py
+++ b/tools/replica/reorder_remd_traj.py
@@ -37,17 +37,13 @@ StringIO (or io if in Python 3.x)
 """
 
 
-import os
-import numpy as np
-import argparse
-import time
-import pickle
+
+import os, numpy as np, argparse, time, pickle
 from scipy.special import logsumexp
 from mpi4py import MPI
 
 from tqdm import tqdm
-import gzip
-import bz2
+import gzip, bz2
 try:
     # python-2
     from StringIO import StringIO as IOBuffer
@@ -56,11 +52,12 @@ except ImportError:
     from io import BytesIO as IOBuffer
 
 
+
 #### INITIALIZE MPI ####
 # (note that all output on screen will be printed only on the ROOT proc)
 ROOT = 0
 comm = MPI.COMM_WORLD
-me = comm.rank  # my proc id
+me = comm.rank # my proc id
 nproc = comm.size
 
 
@@ -80,8 +77,7 @@ def _get_nearest_temp(temps, query_temp):
     out_temp: nearest temp from the list
     """
 
-    if isinstance(temps, list):
-        temps = np.array(temps)
+    if isinstance(temps, list): temps = np.array(temps)
     return temps[np.argmin(np.abs(temps-query_temp))]
 
 
@@ -99,10 +95,10 @@ def readwrite(trajfn, mode):
 
     if trajfn.endswith(".gz"):
         of = gzip.open(trajfn, mode)
-        # return gzip.GzipFile(trajfn, mode)
+        #return gzip.GzipFile(trajfn, mode)
     elif trajfn.endswith(".bz2"):
         of = bz2.open(trajfn, mode)
-        # return bz2.BZ2File(trajfn, mode)
+        #return bz2.BZ2File(trajfn, mode)
     else:
         of = open(trajfn, mode)
     return of
@@ -127,8 +123,8 @@ def get_replica_frames(logfn, temps, nswap, writefreq):
     """
 
     n_rep = len(temps)
-    swap_history = np.loadtxt(logfn, skiprows=3)
-    master_frametuple_dict = dict((n, []) for n in range(n_rep))
+    swap_history = np.loadtxt(logfn, skiprows = 3)
+    master_frametuple_dict = dict( (n, []) for n in range(n_rep) )
 
     # walk through the replicas
     print("Getting frames from all replicas at temperature:")
@@ -140,15 +136,15 @@ def get_replica_frames(logfn, temps, nswap, writefreq):
         if writefreq <= nswap:
             for ii, i in enumerate(rep_inds[:-1]):
                 start = int(ii * nswap / writefreq)
-                stop = int((ii+1) * nswap / writefreq)
-                [master_frametuple_dict[n].append((i, x))
-                 for x in range(start, stop)]
+                stop = int( (ii+1) * nswap / writefreq)
+                [master_frametuple_dict[n].append( (i,x) ) \
+                                        for x in range(start, stop)]
 
         # case-2: when temps. are swapped faster than dumping frames
         else:
             nskip = int(writefreq / nswap)
-            [master_frametuple_dict[n].append((i, ii))
-             for ii, i in enumerate(rep_inds[0::nskip])]
+            [master_frametuple_dict[n].append( (i,ii) ) \
+            for ii, i in enumerate(rep_inds[0::nskip])]
 
     return master_frametuple_dict
 
@@ -165,12 +161,11 @@ def get_byte_index(rep_inds, byteindfns, intrajfns):
     """
     for n in rep_inds:
         # check if the byte indices for this traj has already been computed
-        if os.path.isfile(byteindfns[n]):
-            continue
+        if os.path.isfile(byteindfns[n]): continue
 
         # extract bytes
         fobj = readwrite(intrajfns[n], "rb")
-        byteinds = [[0, 0]]
+        byteinds = [ [0,0] ]
 
         # place file pointer at first line
         nframe = 0
@@ -180,37 +175,33 @@ def get_byte_index(rep_inds, byteindfns, intrajfns):
         # status printed only for replica read on root proc
         # this assumes that each proc takes roughly the same time
         if me == ROOT:
-            pb = tqdm(desc="Reading replicas", leave=True,
-                      position=ROOT + 2*me,
-                      unit="B/replica", unit_scale=True,
-                      unit_divisor=1024)
+            pb = tqdm(desc = "Reading replicas", leave = True,
+                  position = ROOT + 2*me,
+                  unit = "B/replica", unit_scale = True,
+                  unit_divisor = 1024)
 
         # start crawling through the bytes
         while True:
             next_line = fobj.readline()
-            if len(next_line) == 0:
-                break
+            if len(next_line) == 0: break
             # this will only work with lammpstrj traj format.
             # this condition essentially checks periodic recurrences
             # of the token TIMESTEP. Each time it is found,
             # we have crawled through a frame (snapshot)
             if next_line == first_line:
                 nframe += 1
-                byteinds.append([nframe, cur_pos])
-                if me == ROOT:
-                    pb.update()
+                byteinds.append( [nframe, cur_pos] )
+                if me == ROOT: pb.update()
             cur_pos = fobj.tell()
-            if me == ROOT:
-                pb.update(0)
-        if me == ROOT:
-            pb.close()
+            if me == ROOT: pb.update(0)
+        if me == ROOT: pb.close()
 
         # take care of the EOF
         cur_pos = fobj.tell()
-        byteinds.append([nframe+1, cur_pos])  # dummy index for the EOF
+        byteinds.append( [nframe+1, cur_pos] ) # dummy index for the EOF
 
         # write to file
-        np.savetxt(byteindfns[n], np.array(byteinds), fmt="%d")
+        np.savetxt(byteindfns[n], np.array(byteinds), fmt = "%d")
 
         # close the trajfile object
         fobj.close()
@@ -256,15 +247,15 @@ def write_reordered_traj(temp_inds, byte_inds, outtemps, temps,
         of = readwrite(outtrajfns[n], "wb")
 
         # get frames
-        abs_temp_ind = np.argmin(abs(temps - outtemps[n]))
+        abs_temp_ind = np.argmin( abs(temps - outtemps[n]) )
         frametuple = frametuple_dict[abs_temp_ind][-nframes:]
 
         # write frames to buffer
         if me == ROOT:
             pb = tqdm(frametuple,
-                      desc=("Buffering trajectories for writing"),
-                      leave=True, position=ROOT + 2*me,
-                      unit='frame/replica', unit_scale=True)
+                  desc = ("Buffering trajectories for writing"),
+                  leave = True, position = ROOT + 2*me,
+                  unit = 'frame/replica', unit_scale = True)
 
             iterable = pb
         else:
@@ -272,23 +263,20 @@ def write_reordered_traj(temp_inds, byte_inds, outtemps, temps,
 
         for i, (rep, frame) in enumerate(iterable):
             infobj = infobjs[rep]
-            start_ptr = int(byte_inds[rep][frame, 1])
-            stop_ptr = int(byte_inds[rep][frame+1, 1])
+            start_ptr = int(byte_inds[rep][frame,1])
+            stop_ptr = int(byte_inds[rep][frame+1,1])
             byte_len = stop_ptr - start_ptr
             infobj.seek(start_ptr)
             buf.write(infobj.read(byte_len))
-        if me == ROOT:
-            pb.close()
+        if me == ROOT: pb.close()
 
         # write buffer to disk
-        if me == ROOT:
-            print("Writing buffer to file")
+        if me == ROOT: print("Writing buffer to file")
         of.write(buf.getvalue())
         of.close()
         buf.close()
 
-    for i in infobjs:
-        i.close()
+    for i in infobjs: i.close()
 
     return
 
@@ -337,13 +325,13 @@ def get_canonical_logw(enefn, frametuple_dict, temps, nprod, writefreq,
               pip install --user pymbar
               sudo pip install pymbar
 
-              To install the dev. version directly from GitHub, use:
+              To install the dev. version directly from github, use:
               pip install pip install git+https://github.com/choderalab/pymbar.git
               """)
 
     u_rn = np.loadtxt(enefn)
-    ntemps = u_rn.shape[0]  # number of temps.
-    nframes = int(nprod / writefreq)  # number of frames at each temp.
+    ntemps = u_rn.shape[0] # number of temps.
+    nframes = int(nprod / writefreq) # number of frames at each temp.
 
     # reorder the temps
     u_kn = np.zeros([ntemps, nframes], float)
@@ -353,90 +341,91 @@ def get_canonical_logw(enefn, frametuple_dict, temps, nprod, writefreq,
             u_kn[k, i] = u_rn[rep, frame]
 
     # prep input for pymbar
-    # 1) array of frames at each temp.
+    #1) array of frames at each temp.
     nframes_k = nframes * np.ones(ntemps, np.uint8)
 
-    # 2) inverse temps. for chosen energy scale
+    #2) inverse temps. for chosen energy scale
     beta_k = 1.0 / (kB * temps)
 
-    # 3) get reduced energies (*ONLY FOR THE CANONICAL ENSEMBLE*)
+    #3) get reduced energies (*ONLY FOR THE CANONICAL ENSEMBLE*)
     u_kln = np.zeros([ntemps, ntemps, nframes], float)
     for k in range(ntemps):
         u_kln[k] = np.outer(beta_k, u_kn[k])
 
     # run pymbar and extract the free energies
     print("\nRunning pymbar...")
-    mbar = pymbar.mbar.MBAR(u_kln, nframes_k, verbose=True)
-    f_k = mbar.f_k  # (1 x k array)
+    mbar = pymbar.mbar.MBAR(u_kln, nframes_k, verbose = True)
+    f_k = mbar.f_k # (1 x k array)
 
     # calculate the log-weights
     print("\nExtracting log-weights...")
     log_nframes = np.log(nframes)
-    logw = dict((k, np.zeros([ntemps, nframes], float)) for k in range(ntemps))
+    logw = dict( (k, np.zeros([ntemps, nframes], float)) for k in range(ntemps) )
     # get log-weights to reweight to this temp.
     for k in range(ntemps):
         for n in range(nframes):
-            num = -beta_k[k] * u_kn[k, n]
-            denom = f_k - beta_k[k] * u_kn[k, n]
+            num = -beta_k[k] * u_kn[k,n]
+            denom = f_k - beta_k[k] * u_kn[k,n]
             for l in range(ntemps):
-                logw[l][k, n] = num - logsumexp(denom) - log_nframes
+                logw[l][k,n] = num - logsumexp(denom) - log_nframes
 
     return logw
 
 
+
 #### MAIN WORKFLOW ####
 if __name__ == "__main__":
     # accept user inputs
-    parser = argparse.ArgumentParser(description=__doc__,
-                                     formatter_class=argparse.RawDescriptionHelpFormatter)
+    parser = argparse.ArgumentParser(description = __doc__,
+             formatter_class = argparse.RawDescriptionHelpFormatter)
 
     parser.add_argument("prefix",
-                        help="Prefix of REMD LAMMPS trajectories.\
+                        help = "Prefix of REMD LAMMPS trajectories.\
                         Supply full path. Trajectories assumed to be named as \
                         <prefix>.%%d.lammpstrj. \
                         Can be in compressed (.gz or .bz2) format. \
                         This is a required argument")
 
-    parser.add_argument("-logfn", "--logfn", default="log.lammps",
-                        help="LAMMPS log file that contains swap history \
+    parser.add_argument("-logfn", "--logfn", default = "log.lammps",
+                        help = "LAMMPS log file that contains swap history \
                         of temperatures among replicas. \
                         Default = 'lammps.log'")
 
-    parser.add_argument("-tfn", "--tempfn", default="temps.txt",
-                        help="ascii file (readable by numpy.loadtxt) with \
+    parser.add_argument("-tfn", "--tempfn", default = "temps.txt",
+                        help = "ascii file (readable by numpy.loadtxt) with \
                         the temperatures used in the REMD simulation.")
 
-    parser.add_argument("-ns", "--nswap", type=int,
-                        help="Swap frequency used in LAMMPS temper command")
+    parser.add_argument("-ns", "--nswap", type = int,
+                        help = "Swap frequency used in LAMMPS temper command")
 
-    parser.add_argument("-nw", "--nwrite", type=int, default=1,
-                        help="Trajectory writing frequency used \
+    parser.add_argument("-nw", "--nwrite", type = int, default = 1,
+                        help = "Trajectory writing frequency used \
                         in LAMMPS dump command")
 
-    parser.add_argument("-np", "--nprod", type=int, default=0,
-                        help="Number of timesteps to save in the reordered\
+    parser.add_argument("-np", "--nprod", type = int, default = 0,
+                        help = "Number of timesteps to save in the reordered\
                         trajectories.\
                         This should be in units of the LAMMPS timestep")
 
-    parser.add_argument("-logw", "--logw", action='store_true',
-                        help="Supplying this flag \
+    parser.add_argument("-logw", "--logw", action = 'store_true',
+                        help = "Supplying this flag \
                         calculates *canonical* (NVT ensemble) log weights")
 
     parser.add_argument("-e", "--enefn",
-                        help="File that has n_replica x n_frames array\
+                        help = "File that has n_replica x n_frames array\
                         of total potential energies")
 
     parser.add_argument("-kB", "--boltzmann_const",
-                        type=float, default=0.001987,
-                        help="Boltzmann constant in appropriate units. \
+                        type = float, default = 0.001987,
+                        help = "Boltzmann constant in appropriate units. \
                         Default is kcal/mol")
 
-    parser.add_argument("-ot", "--out_temps", nargs='+', type=np.float64,
-                        help="Reorder trajectories at these temperatures.\n \
+    parser.add_argument("-ot", "--out_temps", nargs = '+', type = np.float64,
+                        help = "Reorder trajectories at these temperatures.\n \
                         Default is all temperatures used in the simulation")
 
-    parser.add_argument("-od", "--outdir", default=".",
-                        help="All output will be saved to this directory")
+    parser.add_argument("-od", "--outdir", default = ".",
+                        help = "All output will be saved to this directory")
 
     # parse inputs
     args = parser.parse_args()
@@ -449,16 +438,14 @@ if __name__ == "__main__":
     nprod = args.nprod
 
     enefn = args.enefn
-    if not enefn is None:
-        enefn = os.path.abspath(enefn)
+    if not enefn is None: enefn = os.path.abspath(enefn)
     get_logw = args.logw
     kB = args.boltzmann_const
 
     out_temps = args.out_temps
     outdir = os.path.abspath(args.outdir)
     if not os.path.isdir(outdir):
-        if me == ROOT:
-            os.mkdir(outdir)
+        if me == ROOT: os.mkdir(outdir)
 
     # check that all input files are present (only on the ROOT proc)
     if me == ROOT:
@@ -478,8 +465,7 @@ if __name__ == "__main__":
     for i in range(ntemps):
         this_intrajfn = intrajfns[i]
         x = this_intrajfn + ".gz"
-        if os.path.isfile(this_intrajfn):
-            continue
+        if os.path.isfile(this_intrajfn): continue
         elif os.path.isfile(this_intrajfn + ".gz"):
             intrajfns[i] = this_intrajfn + ".gz"
         elif os.path.isfile(this_intrajfn + ".bz2"):
@@ -490,41 +476,42 @@ if __name__ == "__main__":
 
     # set output filenames
     outprefix = os.path.join(outdir, traj_prefix.split('/')[-1])
-    outtrajfns = ["%s.%3.2f.lammpstrj.gz" %
-                  (outprefix, _get_nearest_temp(temps, t))
+    outtrajfns = ["%s.%3.2f.lammpstrj.gz" % \
+                 (outprefix, _get_nearest_temp(temps, t)) \
                   for t in out_temps]
-    byteindfns = [os.path.join(outdir, ".byteind_%d.gz" % k)
+    byteindfns = [os.path.join(outdir, ".byteind_%d.gz" % k) \
                   for k in range(ntemps)]
     frametuplefn = outprefix + '.frametuple.pickle'
     if get_logw:
         logwfn = outprefix + ".logw.pickle"
 
+
     # get a list of all frames at a particular temp visited by each replica
     # this is fast so run only on ROOT proc.
     master_frametuple_dict = {}
     if me == ROOT:
-        master_frametuple_dict = get_replica_frames(logfn=logfn,
-                                                    temps=temps,
-                                                    nswap=nswap,
-                                                    writefreq=writefreq)
+        master_frametuple_dict = get_replica_frames(logfn = logfn,
+                                                    temps = temps,
+                                                    nswap = nswap,
+                                                    writefreq = writefreq)
         # save to a pickle from the ROOT proc
         with open(frametuplefn, 'wb') as of:
             pickle.dump(master_frametuple_dict, of)
 
     # broadcast to all procs
-    master_frametuple_dict = comm.bcast(master_frametuple_dict, root=ROOT)
+    master_frametuple_dict = comm.bcast(master_frametuple_dict, root = ROOT)
 
     # define a chunk of replicas  to process on each proc
     CHUNKSIZE_1 = int(ntemps/nproc)
     if me < nproc - 1:
-        my_rep_inds = range((me*CHUNKSIZE_1), (me+1)*CHUNKSIZE_1)
+        my_rep_inds = range( (me*CHUNKSIZE_1), (me+1)*CHUNKSIZE_1 )
     else:
-        my_rep_inds = range((me*CHUNKSIZE_1), ntemps)
+        my_rep_inds = range( (me*CHUNKSIZE_1), ntemps )
 
     # get byte indices from replica (un-ordered) trajs. in parallel
-    get_byte_index(rep_inds=my_rep_inds,
-                   byteindfns=byteindfns,
-                   intrajfns=intrajfns)
+    get_byte_index(rep_inds = my_rep_inds,
+                   byteindfns = byteindfns,
+                   intrajfns = intrajfns)
 
     # block until all procs have finished
     comm.barrier()
@@ -533,7 +520,7 @@ if __name__ == "__main__":
     infobjs = [readwrite(i, "rb") for i in intrajfns]
 
     # open all byteindex files
-    byte_inds = dict((i, np.loadtxt(fn)) for i, fn in enumerate(byteindfns))
+    byte_inds = dict( (i, np.loadtxt(fn)) for i, fn in enumerate(byteindfns) )
 
     # define a chunk of output trajs. to process for each proc.
     # # of reordered trajs. to write may be less than the total # of replicas
@@ -549,38 +536,38 @@ if __name__ == "__main__":
     else:
         nproc_active = nproc
     if me < nproc_active-1:
-        my_temp_inds = range((me*CHUNKSIZE_2), (me+1)*CHUNKSIZE_1)
+        my_temp_inds = range( (me*CHUNKSIZE_2), (me+1)*CHUNKSIZE_1 )
     else:
-        my_temp_inds = range((me*CHUNKSIZE_2), n_out_temps)
+        my_temp_inds = range( (me*CHUNKSIZE_2), n_out_temps)
 
     # retire the excess procs
     # dont' forget to close any open file objects
     if me >= nproc_active:
-        for fobj in infobjs:
-            fobj.close()
+        for fobj in infobjs: fobj.close()
         exit()
 
     # write reordered trajectories to disk from active procs in parallel
-    write_reordered_traj(temp_inds=my_temp_inds,
-                         byte_inds=byte_inds,
-                         outtemps=out_temps, temps=temps,
-                         frametuple_dict=master_frametuple_dict,
-                         nprod=nprod, writefreq=writefreq,
-                         outtrajfns=outtrajfns,
-                         infobjs=infobjs)
+    write_reordered_traj(temp_inds = my_temp_inds,
+                         byte_inds = byte_inds,
+                         outtemps = out_temps, temps = temps,
+                         frametuple_dict = master_frametuple_dict,
+                         nprod = nprod, writefreq = writefreq,
+                         outtrajfns = outtrajfns,
+                         infobjs = infobjs)
 
     # calculate canonical log-weights if requested
     # usually this is very fast so retire all but the ROOT proc
-    if not get_logw:
-        exit()
-    if not me == ROOT:
-        exit()
+    if not get_logw: exit()
+    if not me == ROOT: exit()
+
+    logw = get_canonical_logw(enefn = enefn, temps = temps,
+                              frametuple_dict = master_frametuple_dict,
+                              nprod = nprod, writefreq = writefreq,
+                              kB = kB)
 
-    logw = get_canonical_logw(enefn=enefn, temps=temps,
-                              frametuple_dict=master_frametuple_dict,
-                              nprod=nprod, writefreq=writefreq,
-                              kB=kB)
 
     # save the logweights to a pickle
     with open(logwfn, 'wb') as of:
         pickle.dump(logw, of)
+
+

From d1ce362fca80f5240ad9e36c6bd5d65e0c76fea4 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Tue, 10 Nov 2020 17:15:42 +0100
Subject: [PATCH 54/64] Remove wrong word 'regoin' from false positive list

---
 doc/utils/sphinx-config/false_positives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt
index 3ef0b904eb..6843118686 100644
--- a/doc/utils/sphinx-config/false_positives.txt
+++ b/doc/utils/sphinx-config/false_positives.txt
@@ -2614,7 +2614,6 @@ Ree
 refactored
 refactoring
 reflectionstyle
-regoin
 Reinders
 reinit
 relaxbox

From 2f3cbfed1304d9c263ed52698fa2ea263f776a40 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 10 Nov 2020 17:58:26 -0500
Subject: [PATCH 55/64] add CMake code to download and compile libyaml if not
 found locally

---
 cmake/Modules/YAML.cmake             | 32 ++++++++++++++++++++++++++++
 unittest/force-styles/CMakeLists.txt |  4 ++--
 2 files changed, 34 insertions(+), 2 deletions(-)
 create mode 100644 cmake/Modules/YAML.cmake

diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake
new file mode 100644
index 0000000000..05163675df
--- /dev/null
+++ b/cmake/Modules/YAML.cmake
@@ -0,0 +1,32 @@
+message(STATUS "Downloading and building YAML library")
+
+include(ExternalProject)
+set(YAML_URL "https://pyyaml.org/download/libyaml/yaml-0.2.5.tar.gz" CACHE STRING "URL for libyaml tarball")
+mark_as_advanced(YAML_URL)
+ExternalProject_Add(libyaml
+                    URL               ${YAML_URL}
+                    URL_MD5           bb15429d8fb787e7d3f1c83ae129a999  
+                    SOURCE_DIR        "${CMAKE_BINARY_DIR}/yaml-src"
+                    BINARY_DIR        "${CMAKE_BINARY_DIR}/yaml-build"
+                    CONFIGURE_COMMAND <SOURCE_DIR>/configure ${CONFIGURE_REQUEST_PIC}
+                                      CXX=${CMAKE_CXX_COMPILER}
+                                      CC=${CMAKE_C_COMPILER}
+                                      --prefix=<INSTALL_DIR> --disable-shared
+                    BUILD_BYPRODUCTS  <INSTALL_DIR>/lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a
+                    TEST_COMMAND      "")
+
+ExternalProject_Get_Property(libyaml INSTALL_DIR)
+set(YAML_INCLUDE_DIR ${INSTALL_DIR}/include)
+set(YAML_LIBRARY_DIR ${INSTALL_DIR}/lib)
+
+# workaround for CMake 3.10 on ubuntu 18.04
+file(MAKE_DIRECTORY ${YAML_INCLUDE_DIR})
+file(MAKE_DIRECTORY ${YAML_LIBRARY_DIR})
+
+set(YAML_LIBRARY_PATH ${INSTALL_DIR}/lib/${CMAKE_FIND_LIBRARY_PREFIXES}yaml.a)
+
+add_library(Yaml::Yaml UNKNOWN IMPORTED)
+set_target_properties(Yaml::Yaml PROPERTIES
+        IMPORTED_LOCATION ${YAML_LIBRARY_PATH}
+        INTERFACE_INCLUDE_DIRECTORIES ${YAML_INCLUDE_DIR})
+add_dependencies(Yaml::Yaml libyaml)
diff --git a/unittest/force-styles/CMakeLists.txt b/unittest/force-styles/CMakeLists.txt
index 128dc62cff..1d7dc937eb 100644
--- a/unittest/force-styles/CMakeLists.txt
+++ b/unittest/force-styles/CMakeLists.txt
@@ -1,8 +1,8 @@
 
 find_package(YAML)
 if(NOT YAML_FOUND)
-  message(STATUS "Skipping tests because libyaml is not found")
-  return()
+  # download and build a local copy of libyaml
+  include(YAML)
 endif()
 
 if(CMAKE_VERSION VERSION_LESS 3.12)

From 2c6ccf0d0f0da1b63221dc34f34457c3480c8223 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 10 Nov 2020 18:04:00 -0500
Subject: [PATCH 56/64] update docs for download and compilation of yaml
 sources

---
 doc/src/Build_development.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/src/Build_development.rst b/doc/src/Build_development.rst
index cf3e2fb750..1b076caac0 100644
--- a/doc/src/Build_development.rst
+++ b/doc/src/Build_development.rst
@@ -111,8 +111,10 @@ error margin).  The status of this automated testing can be viewed on
 The unit testing facility is integrated into the CMake build process
 of the LAMMPS source code distribution itself.  It can be enabled by
 setting ``-D ENABLE_TESTING=on`` during the CMake configuration step.
-It requires the `PyYAML <http://pyyaml.org/>`_ library and development
-headers to compile and will download and compile a recent version of the
+It requires the `YAML <http://pyyaml.org/>`_ library and development
+headers to compile; if they are not found locally, a recent version
+is downloaded and compiled transparently.  It will also download and
+compile a specific recent version of the
 `Googletest <https://github.com/google/googletest/>`_ C++ test framework
 for implementing the tests.
 

From 552dc7fba90af230c811abaedd96f180e95a2f02 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Tue, 10 Nov 2020 18:05:06 -0500
Subject: [PATCH 57/64] whitespace

---
 cmake/Modules/YAML.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/Modules/YAML.cmake b/cmake/Modules/YAML.cmake
index 05163675df..a080b566be 100644
--- a/cmake/Modules/YAML.cmake
+++ b/cmake/Modules/YAML.cmake
@@ -5,7 +5,7 @@ set(YAML_URL "https://pyyaml.org/download/libyaml/yaml-0.2.5.tar.gz" CACHE STRIN
 mark_as_advanced(YAML_URL)
 ExternalProject_Add(libyaml
                     URL               ${YAML_URL}
-                    URL_MD5           bb15429d8fb787e7d3f1c83ae129a999  
+                    URL_MD5           bb15429d8fb787e7d3f1c83ae129a999
                     SOURCE_DIR        "${CMAKE_BINARY_DIR}/yaml-src"
                     BINARY_DIR        "${CMAKE_BINARY_DIR}/yaml-build"
                     CONFIGURE_COMMAND <SOURCE_DIR>/configure ${CONFIGURE_REQUEST_PIC}

From 39bc47a4da261b5d7a5db1057cf806d1aae62664 Mon Sep 17 00:00:00 2001
From: Tim Bernhard <tim@bernhard-webstudio.ch>
Date: Thu, 12 Nov 2020 13:35:04 +0100
Subject: [PATCH 58/64] Fix inconsistent formatting in Error & Warning doc

---
 doc/src/Errors_warnings.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/src/Errors_warnings.rst b/doc/src/Errors_warnings.rst
index 306c9b7b31..4f29fad9dd 100644
--- a/doc/src/Errors_warnings.rst
+++ b/doc/src/Errors_warnings.rst
@@ -119,7 +119,6 @@ Doc page with :doc:`ERROR messages <Errors_messages>`
    :doc:`pair style zero <pair_zero>` with a suitable cutoff or use :doc:`comm_modify cutoff <comm_modify>`.
 
 *Communication cutoff is shorter than a bond length based estimate. This may lead to errors.*
-
    Since LAMMPS stores topology data with individual atoms, all atoms
    comprising a bond, angle, dihedral or improper must be present on any
    sub-domain that "owns" the atom with the information, either as a

From 3991f704e1990b827d0bfa69ef5fc425430799e0 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 12 Nov 2020 10:42:09 -0500
Subject: [PATCH 59/64] Fix whitespace errors

---
 doc/src/pair_spin_exchange.rst              | 78 ++++++++++-----------
 src/SPIN/compute_spin.cpp                   |  6 +-
 src/SPIN/pair_spin_dipole_cut.cpp           |  8 +--
 src/SPIN/pair_spin_dipole_long.cpp          |  2 +-
 src/SPIN/pair_spin_dmi.cpp                  |  4 +-
 src/SPIN/pair_spin_exchange.cpp             | 34 ++++-----
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 40 +++++------
 src/SPIN/pair_spin_exchange_biquadratic.h   |  2 +-
 src/SPIN/pair_spin_magelec.cpp              |  4 +-
 9 files changed, 89 insertions(+), 89 deletions(-)

diff --git a/doc/src/pair_spin_exchange.rst b/doc/src/pair_spin_exchange.rst
index 72c416ac72..9e6e534280 100644
--- a/doc/src/pair_spin_exchange.rst
+++ b/doc/src/pair_spin_exchange.rst
@@ -40,53 +40,53 @@ pairs of magnetic spins:
    H_{ex} = -\sum_{i,j}^N J_{ij} (r_{ij}) \,\vec{s}_i \cdot \vec{s}_j
 
 where :math:`\vec{s}_i` and :math:`\vec{s}_j` are two unit vectors representing
-the magnetic spins of two particles (usually atoms), and 
-:math:`r_{ij} = \vert \vec{r}_i - \vec{r}_j \vert` is the inter-atomic distance 
-between those two particles. The summation is over pairs of nearest neighbors. 
-:math:`J(r_{ij})` is a function defining the intensity and the sign of the 
-exchange interaction for different neighboring shells. 
+the magnetic spins of two particles (usually atoms), and
+:math:`r_{ij} = \vert \vec{r}_i - \vec{r}_j \vert` is the inter-atomic distance
+between those two particles. The summation is over pairs of nearest neighbors.
+:math:`J(r_{ij})` is a function defining the intensity and the sign of the
+exchange interaction for different neighboring shells.
 
-Style *spin/exchange/biquadratic* computes a biquadratic exchange interaction 
+Style *spin/exchange/biquadratic* computes a biquadratic exchange interaction
 between pairs of magnetic spins:
 
 .. math::
-  
+
    H_{bi} = -\sum_{i, j}^{N} {J}_{ij} \left(r_{ij} \right)\,
-                      \vec{s}_{i}\cdot \vec{s}_{j} 
+                      \vec{s}_{i}\cdot \vec{s}_{j}
                       -\sum_{i, j}^{N} {K}_{ij} \left(r_{ij} \right)\,
-                      \left(\vec{s}_{i}\cdot 
+                      \left(\vec{s}_{i}\cdot
                       \vec{s}_{j}\right)^2
 
-where :math:`\vec{s}_i`,  :math:`\vec{s}_j`,  :math:`r_{ij}` and 
-:math:`J(r_{ij})` have the same definitions as above, and :math:`K(r_{ij})` is 
+where :math:`\vec{s}_i`,  :math:`\vec{s}_j`,  :math:`r_{ij}` and
+:math:`J(r_{ij})` have the same definitions as above, and :math:`K(r_{ij})` is
 a second function, defining the intensity and the sign of the biquadratic term.
 
-The interatomic dependence of :math:`J(r_{ij})` and :math:`K(r_{ij})` in both 
+The interatomic dependence of :math:`J(r_{ij})` and :math:`K(r_{ij})` in both
 interactions above is defined by the following function:
 
 .. math::
 
-    {f}\left( r_{ij} \right) = 4 a \left( \frac{r_{ij}}{d}  \right)^2 
-    \left( 1 - b \left( \frac{r_{ij}}{d}  \right)^2 \right) 
+    {f}\left( r_{ij} \right) = 4 a \left( \frac{r_{ij}}{d}  \right)^2
+    \left( 1 - b \left( \frac{r_{ij}}{d}  \right)^2 \right)
     e^{-\left( \frac{r_{ij}}{d} \right)^2 }\Theta (R_c - r_{ij})
 
-where :math:`a`, :math:`b` and :math:`d` are the three constant coefficients 
-defined in the associated "pair_coeff" command, and :math:`R_c` is the radius 
+where :math:`a`, :math:`b` and :math:`d` are the three constant coefficients
+defined in the associated "pair_coeff" command, and :math:`R_c` is the radius
 cutoff associated to the pair interaction (see below for more explanations).
 
-The coefficients :math:`a`, :math:`b`, and :math:`d` need to be fitted so that 
-the function above matches with the value of the exchange interaction for the 
+The coefficients :math:`a`, :math:`b`, and :math:`d` need to be fitted so that
+the function above matches with the value of the exchange interaction for the
 :math:`N` neighbor shells taken into account.
-Examples and more explanations about this function and its parameterization 
+Examples and more explanations about this function and its parameterization
 are reported in :ref:`(Tranchida) <Tranchida3>`.
 
-When a *spin/exchange/biquadratic* pair style is defined, six coefficients 
-(three for :math:`J(r_{ij})`, and three for :math:`K(r_{ij})`) have to be 
+When a *spin/exchange/biquadratic* pair style is defined, six coefficients
+(three for :math:`J(r_{ij})`, and three for :math:`K(r_{ij})`) have to be
 fitted.
 
 From this exchange interaction, each spin :math:`i` will be submitted
-to a magnetic torque :math:`\vec{\omega}_{i}`, and its associated atom can be 
-submitted to a force :math:`\vec{F}_{i}` for spin-lattice calculations (see 
+to a magnetic torque :math:`\vec{\omega}_{i}`, and its associated atom can be
+submitted to a force :math:`\vec{F}_{i}` for spin-lattice calculations (see
 :doc:`fix nve/spin <fix_nve_spin>`), such as:
 
 .. math::
@@ -94,22 +94,22 @@ submitted to a force :math:`\vec{F}_{i}` for spin-lattice calculations (see
    \vec{\omega}_{i} = \frac{1}{\hbar} \sum_{j}^{Neighb} {J}
    \left(r_{ij} \right)\,\vec{s}_{j}
    ~~{\rm and}~~
-   \vec{F}_{i} = \sum_{j}^{Neighb} \frac{\partial {J} \left(r_{ij} \right)}{ 
+   \vec{F}_{i} = \sum_{j}^{Neighb} \frac{\partial {J} \left(r_{ij} \right)}{
    \partial r_{ij}} \left( \vec{s}_{i}\cdot \vec{s}_{j} \right) \vec{e}_{ij}
 
 with :math:`\hbar` the Planck constant (in metal units), and :math:`\vec{e}_{ij}
 = \frac{\vec{r}_i - \vec{r}_j}{\vert \vec{r}_i-\vec{r}_j \vert}` the unit
 vector between sites :math:`i` and :math:`j`.
-Equivalent forces and magnetic torques are generated for the biquadratic term 
+Equivalent forces and magnetic torques are generated for the biquadratic term
 when a *spin/exchange/biquadratic* pair style is defined.
 
 More details about the derivation of these torques/forces are reported in
 :ref:`(Tranchida) <Tranchida3>`.
 
-For the *spin/exchange* and *spin/exchange/biquadratic* pair styles, the 
-following coefficients must be defined for each pair of atoms types via the 
-:doc:`pair_coeff <pair_coeff>` command as in the examples above, or in the data 
-file or restart files read by the :doc:`read_data <read_data>` or 
+For the *spin/exchange* and *spin/exchange/biquadratic* pair styles, the
+following coefficients must be defined for each pair of atoms types via the
+:doc:`pair_coeff <pair_coeff>` command as in the examples above, or in the data
+file or restart files read by the :doc:`read_data <read_data>` or
 :doc:`read_restart <read_restart>` commands, and set in the following order:
 
 * :math:`R_c` (distance units)
@@ -129,10 +129,10 @@ for the *spin/exchange* pair style, and:
 
 for the *spin/exchange/biquadratic* pair style.
 
-Note that :math:`R_c` is the radius cutoff of the considered exchange 
-interaction, and :math:`a`, :math:`b` and :math:`d` are the three coefficients 
-performing the parameterization of the function :math:`J(r_{ij})` defined 
-above (in the *biquadratic* style, :math:`a_j`, :math:`b_j`, :math:`d_j` and 
+Note that :math:`R_c` is the radius cutoff of the considered exchange
+interaction, and :math:`a`, :math:`b` and :math:`d` are the three coefficients
+performing the parameterization of the function :math:`J(r_{ij})` defined
+above (in the *biquadratic* style, :math:`a_j`, :math:`b_j`, :math:`d_j` and
 :math:`a_k`, :math:`b_k`, :math:`d_k` are the coefficients of :math:`J(r_{ij})`
 and :math:`K(r_{ij})` respectively).
 
@@ -147,7 +147,7 @@ None of those coefficients is optional. If not specified, the
 For spin-lattice simulation, it can be useful to offset the
 mechanical forces and energies generated by the exchange
 interaction.
-The *offset* keyword allows to apply this offset. 
+The *offset* keyword allows to apply this offset.
 By setting *offset* to *yes*, the energy definitions above are
 replaced by:
 
@@ -155,14 +155,14 @@ replaced by:
 
    H_{ex} = -\sum_{i,j}^N J_{ij} (r_{ij}) \,[ \vec{s}_i \cdot \vec{s}_j-1 ]
 
-for the *spin/exchange* pair style, and:  
+for the *spin/exchange* pair style, and:
 
 .. math::
-  
+
    H_{bi} = -\sum_{i, j}^{N} {J}_{ij} \left(r_{ij} \right)\,
                       [ \vec{s}_{i}\cdot \vec{s}_{j} -1 ]
                       -\sum_{i, j}^{N} {K}_{ij} \left(r_{ij} \right)\,
-                      [ \left(\vec{s}_{i}\cdot 
+                      [ \left(\vec{s}_{i}\cdot
                       \vec{s}_{j}\right)^2 -1]
 
 for the *spin/exchange/biquadratic* pair style.
@@ -173,7 +173,7 @@ precession vectors (and thus does no impact the purely magnetic
 properties).
 This ensures that when all spins are aligned, the magnetic energy
 and the associated mechanical forces (and thus the pressure
-generated by the magnetic potential) are null. 
+generated by the magnetic potential) are null.
 
 .. note::
   This offset term can be very important when calculations such as
@@ -194,7 +194,7 @@ Restrictions
 
 All the *pair/spin* styles are part of the SPIN package.  These styles
 are only enabled if LAMMPS was built with this package, and if the
-atom_style "spin" was declared.  
+atom_style "spin" was declared.
 See the :doc:`Build package <Build_package>` doc page for more info.
 
 Related commands
diff --git a/src/SPIN/compute_spin.cpp b/src/SPIN/compute_spin.cpp
index 3e4970a62b..8e44ea7b84 100644
--- a/src/SPIN/compute_spin.cpp
+++ b/src/SPIN/compute_spin.cpp
@@ -178,7 +178,7 @@ void ComputeSpin::compute_vector()
   for (i = 0; i < nlocal; i++) {
     if (mask[i] & groupbit) {
       if (atom->sp_flag) {
-        
+
         // compute first moment
 
         mag[0] += sp[i][0];
@@ -223,9 +223,9 @@ void ComputeSpin::compute_vector()
   magtot[1] *= scale;
   magtot[2] *= scale;
   magtot[3] = sqrt((magtot[0]*magtot[0])+(magtot[1]*magtot[1])+(magtot[2]*magtot[2]));
-  
+
   // compute spin temperature
-  
+
   spintemperature = hbar*tempnumtot;
   spintemperature /= (2.0*kb*tempdenomtot);
 
diff --git a/src/SPIN/pair_spin_dipole_cut.cpp b/src/SPIN/pair_spin_dipole_cut.cpp
index b4355fd640..7ba81d93f8 100644
--- a/src/SPIN/pair_spin_dipole_cut.cpp
+++ b/src/SPIN/pair_spin_dipole_cut.cpp
@@ -234,14 +234,14 @@ void PairSpinDipoleCut::compute(int eflag, int vflag)
       local_cut2 = cut_spin_long[itype][jtype]*cut_spin_long[itype][jtype];
 
       // compute dipolar interaction
-      
+
       if (rsq < local_cut2) {
         r2inv = 1.0/rsq;
         r3inv = r2inv*rinv;
 
         compute_dipolar(i,j,eij,fmi,spi,spj,r3inv);
-        
-        if (lattice_flag) 
+
+        if (lattice_flag)
           compute_dipolar_mech(i,j,eij,fi,spi,spj,r2inv);
 
         if (eflag) {
@@ -269,7 +269,7 @@ void PairSpinDipoleCut::compute(int eflag, int vflag)
       }
     }
   }
-  
+
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
diff --git a/src/SPIN/pair_spin_dipole_long.cpp b/src/SPIN/pair_spin_dipole_long.cpp
index 836b889513..3b4c861e0c 100644
--- a/src/SPIN/pair_spin_dipole_long.cpp
+++ b/src/SPIN/pair_spin_dipole_long.cpp
@@ -310,7 +310,7 @@ void PairSpinDipoleLong::compute(int eflag, int vflag)
       }
     }
   }
-  
+
   if (vflag_fdotr) virial_fdotr_compute();
 }
 
diff --git a/src/SPIN/pair_spin_dmi.cpp b/src/SPIN/pair_spin_dmi.cpp
index 69a9873303..e6ed5e4609 100644
--- a/src/SPIN/pair_spin_dmi.cpp
+++ b/src/SPIN/pair_spin_dmi.cpp
@@ -244,7 +244,7 @@ void PairSpinDmi::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_dmi(i,j,eij,fmi,spj);
-        
+
         if (lattice_flag)
           compute_dmi_mech(i,j,rsq,eij,fi,spi,spj);
 
@@ -253,7 +253,7 @@ void PairSpinDmi::compute(int eflag, int vflag)
           evdwl *= 0.5*hbar;
           emag[i] += evdwl;
         } else evdwl = 0.0;
-        
+
         f[i][0] += fi[0];
         f[i][1] += fi[1];
         f[i][2] += fi[2];
diff --git a/src/SPIN/pair_spin_exchange.cpp b/src/SPIN/pair_spin_exchange.cpp
index bccde3f66b..b7dd6ffc17 100644
--- a/src/SPIN/pair_spin_exchange.cpp
+++ b/src/SPIN/pair_spin_exchange.cpp
@@ -37,8 +37,8 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-PairSpinExchange::PairSpinExchange(LAMMPS *lmp) : 
-  PairSpin(lmp) 
+PairSpinExchange::PairSpinExchange(LAMMPS *lmp) :
+  PairSpin(lmp)
 {
   e_offset = 0;
 }
@@ -66,7 +66,7 @@ PairSpinExchange::~PairSpinExchange()
 void PairSpinExchange::settings(int narg, char **arg)
 {
   PairSpin::settings(narg,arg);
-  
+
   if (narg != 1) error->all(FLERR,"Illegal pair_style command");
 
   cut_spin_exchange_global = utils::numeric(FLERR,arg[0],false,lmp);
@@ -112,17 +112,17 @@ void PairSpinExchange::coeff(int narg, char **arg)
 
   // read energy offset flag if specified
 
-  while (iarg < narg) { 
-    if (strcmp(arg[7],"offset") == 0) { 
+  while (iarg < narg) {
+    if (strcmp(arg[7],"offset") == 0) {
       if (strcmp(arg[8],"yes") == 0) {
         e_offset = 1;
       } else if  (strcmp(arg[8],"no") == 0) {
         e_offset = 0;
       } else error->all(FLERR,"Incorrect args for pair coefficients");
-      iarg += 2; 
+      iarg += 2;
     } else error->all(FLERR,"Incorrect args for pair coefficients");
   }
-  
+
   int count = 0;
   for (int i = ilo; i <= ihi; i++) {
     for (int j = MAX(jlo,i); j <= jhi; j++) {
@@ -252,10 +252,10 @@ void PairSpinExchange::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_exchange(i,j,rsq,fmi,spj);
-        
+
         if (lattice_flag)
           compute_exchange_mech(i,j,rsq,eij,fi,spi,spj);
-        
+
         if (eflag) {
           evdwl -= compute_energy(i,j,rsq,spi,spj);
           emag[i] += evdwl;
@@ -388,7 +388,7 @@ void PairSpinExchange::compute_exchange(int i, int j, double rsq, double fmi[3],
    compute the mechanical force due to the exchange interaction between atom i and atom j
 ------------------------------------------------------------------------- */
 
-void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq, 
+void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq,
     double eij[3], double fi[3],  double spi[3], double spj[3])
 {
   int *type = atom->type;
@@ -407,11 +407,11 @@ void PairSpinExchange::compute_exchange_mech(int i, int j, double rsq,
 
   Jex_mech = 1.0-ra-J2[itype][jtype]*ra*(2.0-ra);
   Jex_mech *= 8.0*Jex*rr*exp(-ra);
-  
+
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
-  
+
   // apply or not energy and force offset
-  
+
   fx = fy = fz = 0.0;
   if (e_offset == 1) { // set offset
     fx = Jex_mech*(sdots-1.0)*eij[0];
@@ -446,17 +446,17 @@ double PairSpinExchange::compute_energy(int i, int j, double rsq, double spi[3],
   Jex = 4.0*Jex*ra;
   Jex *= (1.0-J2[itype][jtype]*ra);
   Jex *= exp(-ra);
-  
-  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
+
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
   // apply or not energy and force offset
-  
+
   if (e_offset == 1) { // set offset
     energy = 0.5*Jex*(sdots-1.0);
   } else if (e_offset == 0) { // no offset ("normal" calculation)
     energy = 0.5*Jex*sdots;
   } else error->all(FLERR,"Illegal option in pair exchange/biquadratic command");
-  
+
   return energy;
 }
 
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 36f3dbcf5e..59b959f4cc 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -37,8 +37,8 @@ using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-PairSpinExchangeBiquadratic::PairSpinExchangeBiquadratic(LAMMPS *lmp) : 
-  PairSpin(lmp) 
+PairSpinExchangeBiquadratic::PairSpinExchangeBiquadratic(LAMMPS *lmp) :
+  PairSpin(lmp)
 {
   e_offset = 0;
 }
@@ -119,14 +119,14 @@ void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
 
   // read energy offset flag if specified
 
-  while (iarg < narg) { 
-    if (strcmp(arg[10],"offset") == 0) { 
+  while (iarg < narg) {
+    if (strcmp(arg[10],"offset") == 0) {
       if (strcmp(arg[11],"yes") == 0) {
         e_offset = 1;
       } else if  (strcmp(arg[11],"no") == 0) {
         e_offset = 0;
       } else error->all(FLERR,"Incorrect args for pair coefficients");
-      iarg += 2; 
+      iarg += 2;
     } else error->all(FLERR,"Incorrect args for pair coefficients");
   }
 
@@ -267,10 +267,10 @@ void PairSpinExchangeBiquadratic::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_exchange(i,j,rsq,fmi,spi,spj);
-        
+
         if (lattice_flag)
           compute_exchange_mech(i,j,rsq,eij,fi,spi,spj);
-      
+
         if (eflag) {
           evdwl -= compute_energy(i,j,rsq,spi,spj);
           emag[i] += evdwl;
@@ -384,7 +384,7 @@ void PairSpinExchangeBiquadratic::compute_single_pair(int ii, double fmi[3])
    compute exchange interaction between spins i and j
 ------------------------------------------------------------------------- */
 
-void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq, 
+void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
     double fmi[3], double spi[3], double spj[3])
 {
   int *type = atom->type;
@@ -395,7 +395,7 @@ void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
 
   r2j = rsq/J3[itype][jtype]/J3[itype][jtype];
   r2k = rsq/J3[itype][jtype]/J3[itype][jtype];
- 
+
   Jex = 4.0*J1_mag[itype][jtype]*r2j;
   Jex *= (1.0-J2[itype][jtype]*r2j);
   Jex *= exp(-r2j);
@@ -403,7 +403,7 @@ void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
   Kex = 4.0*K1_mag[itype][jtype]*r2k;
   Kex *= (1.0-K2[itype][jtype]*r2k);
   Kex *= exp(-r2k);
-  
+
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
   fmi[0] += (Jex*spj[0] + 2.0*Kex*spj[0]*sdots);
@@ -415,7 +415,7 @@ void PairSpinExchangeBiquadratic::compute_exchange(int i, int j, double rsq,
    compute the mechanical force due to the exchange interaction between atom i and atom j
 ------------------------------------------------------------------------- */
 
-void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j, 
+void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j,
     double rsq, double eij[3], double fi[3],  double spi[3], double spj[3])
 {
   int *type = atom->type;
@@ -430,22 +430,22 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j,
   iJ3 = 1.0/(J3[itype][jtype]*J3[itype][jtype]);
   Kex = K1_mech[itype][jtype];
   iK3 = 1.0/(K3[itype][jtype]*K3[itype][jtype]);
-  
+
   rja = rsq*iJ3;
   rjr = sqrt(rsq)*iJ3;
   rka = rsq*iK3;
   rkr = sqrt(rsq)*iK3;
- 
+
   Jex_mech = 1.0-rja-J2[itype][jtype]*rja*(2.0-rja);
   Jex_mech *= 8.0*Jex*rjr*exp(-rja);
-  
+
   Kex_mech = 1.0-rka-K2[itype][jtype]*rka*(2.0-rka);
   Kex_mech *= 8.0*Kex*rkr*exp(-rka);
 
   sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
   // apply or not energy and force offset
-  
+
   fx = fy = fz = 0.0;
   if (e_offset == 1) { // set offset
     fx = (Jex_mech*(sdots-1.0) + Kex_mech*(sdots*sdots-1.0))*eij[0];
@@ -469,7 +469,7 @@ void PairSpinExchangeBiquadratic::compute_exchange_mech(int i, int j,
    compute energy of spin pair i and j
 ------------------------------------------------------------------------- */
 
-double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq, 
+double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
     double spi[3], double spj[3])
 {
   int *type = atom->type;
@@ -487,7 +487,7 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   rk = ra/K3[itype][jtype];
   r2k = rsq/K3[itype][jtype]/K3[itype][jtype];
   ir3k = 1.0/(rk*rk*rk);
- 
+
   Jex = 4.0*J1_mech[itype][jtype]*r2j;
   Jex *= (1.0-J2[itype][jtype]*r2j);
   Jex *= exp(-r2j);
@@ -496,16 +496,16 @@ double PairSpinExchangeBiquadratic::compute_energy(int i, int j, double rsq,
   Kex *= (1.0-K2[itype][jtype]*r2k);
   Kex *= exp(-r2k);
 
-  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);  
+  sdots = (spi[0]*spj[0]+spi[1]*spj[1]+spi[2]*spj[2]);
 
   // apply or not energy and force offset
-  
+
   if (e_offset == 1) { // set offset
     energy = 0.5*(Jex*(sdots-1.0) + Kex*(sdots*sdots-1.0));
   } else if (e_offset == 0) { // no offset ("normal" calculation)
     energy = 0.5*(Jex*sdots + Kex*sdots*sdots);
   } else error->all(FLERR,"Illegal option in pair exchange/biquadratic command");
-  
+
   return energy;
 }
 
diff --git a/src/SPIN/pair_spin_exchange_biquadratic.h b/src/SPIN/pair_spin_exchange_biquadratic.h
index 1074b50f7b..9619416f2e 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.h
+++ b/src/SPIN/pair_spin_exchange_biquadratic.h
@@ -48,7 +48,7 @@ class PairSpinExchangeBiquadratic : public PairSpin {
   double cut_spin_exchange_global;      // global exchange cutoff distance
 
  protected:
-  
+
   int e_offset;                         // apply energy offset
   double **J1_mag;                      // H exchange coeffs in eV
   double **J1_mech;                     // mech exchange coeffs in
diff --git a/src/SPIN/pair_spin_magelec.cpp b/src/SPIN/pair_spin_magelec.cpp
index 72a52c1340..33ad364aaa 100644
--- a/src/SPIN/pair_spin_magelec.cpp
+++ b/src/SPIN/pair_spin_magelec.cpp
@@ -237,7 +237,7 @@ void PairSpinMagelec::compute(int eflag, int vflag)
 
       if (rsq <= local_cut2) {
         compute_magelec(i,j,eij,fmi,spj);
-        
+
         if (lattice_flag)
           compute_magelec_mech(i,j,fi,spi,spj);
 
@@ -246,7 +246,7 @@ void PairSpinMagelec::compute(int eflag, int vflag)
           evdwl *= 0.5*hbar;
           emag[i] += evdwl;
         } else evdwl = 0.0;
-        
+
         f[i][0] += fi[0];
         f[i][1] += fi[1];
         f[i][2] += fi[2];

From e7ccbd0ce61fa6bfea47333fc17361b1c753bab3 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 12 Nov 2020 10:44:04 -0500
Subject: [PATCH 60/64] Replace NULL with nullptr

---
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 30 ++++++++++-----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 59b959f4cc..7cdd8d0c19 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -180,7 +180,7 @@ void *PairSpinExchangeBiquadratic::extract(const char *str, int &dim)
 {
   dim = 0;
   if (strcmp(str,"cut") == 0) return (void *) &cut_spin_exchange_global;
-  return NULL;
+  return nullptr;
 }
 
 /* ---------------------------------------------------------------------- */
@@ -576,19 +576,19 @@ void PairSpinExchangeBiquadratic::read_restart(FILE *fp)
   int me = comm->me;
   for (i = 1; i <= atom->ntypes; i++) {
     for (j = i; j <= atom->ntypes; j++) {
-      if (me == 0) utils::sfread(FLERR,&setflag[i][j],sizeof(int),1,fp,NULL,error);
+      if (me == 0) utils::sfread(FLERR,&setflag[i][j],sizeof(int),1,fp,nullptr,error);
       MPI_Bcast(&setflag[i][j],1,MPI_INT,0,world);
       if (setflag[i][j]) {
         if (me == 0) {
-          utils::sfread(FLERR,&J1_mag[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&J1_mech[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&J2[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&J3[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&K1_mag[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&K1_mech[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&K2[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&K3[i][j],sizeof(double),1,fp,NULL,error);
-          utils::sfread(FLERR,&cut_spin_exchange[i][j],sizeof(double),1,fp,NULL,error);
+          utils::sfread(FLERR,&J1_mag[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&J1_mech[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&J2[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&J3[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&K1_mag[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&K1_mech[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&K2[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&K3[i][j],sizeof(double),1,fp,nullptr,error);
+          utils::sfread(FLERR,&cut_spin_exchange[i][j],sizeof(double),1,fp,nullptr,error);
         }
         MPI_Bcast(&J1_mag[i][j],1,MPI_DOUBLE,0,world);
         MPI_Bcast(&J1_mech[i][j],1,MPI_DOUBLE,0,world);
@@ -624,10 +624,10 @@ void PairSpinExchangeBiquadratic::write_restart_settings(FILE *fp)
 void PairSpinExchangeBiquadratic::read_restart_settings(FILE *fp)
 {
   if (comm->me == 0) {
-    utils::sfread(FLERR,&cut_spin_exchange_global,sizeof(double),1,fp,NULL,error);
-    utils::sfread(FLERR,&e_offset,sizeof(int),1,fp,NULL,error);
-    utils::sfread(FLERR,&offset_flag,sizeof(int),1,fp,NULL,error);
-    utils::sfread(FLERR,&mix_flag,sizeof(int),1,fp,NULL,error);
+    utils::sfread(FLERR,&cut_spin_exchange_global,sizeof(double),1,fp,nullptr,error);
+    utils::sfread(FLERR,&e_offset,sizeof(int),1,fp,nullptr,error);
+    utils::sfread(FLERR,&offset_flag,sizeof(int),1,fp,nullptr,error);
+    utils::sfread(FLERR,&mix_flag,sizeof(int),1,fp,nullptr,error);
   }
   MPI_Bcast(&cut_spin_exchange_global,1,MPI_DOUBLE,0,world);
   MPI_Bcast(&e_offset,1,MPI_INT,0,world);

From c407d547cd702f63a1b8d0d59a0231bebb13b82e Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 12 Nov 2020 10:54:20 -0500
Subject: [PATCH 61/64] Whitespace

---
 src/SPIN/pair_spin_exchange_biquadratic.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/SPIN/pair_spin_exchange_biquadratic.cpp b/src/SPIN/pair_spin_exchange_biquadratic.cpp
index 7cdd8d0c19..f2baf1333b 100644
--- a/src/SPIN/pair_spin_exchange_biquadratic.cpp
+++ b/src/SPIN/pair_spin_exchange_biquadratic.cpp
@@ -156,8 +156,7 @@ void PairSpinExchangeBiquadratic::coeff(int narg, char **arg)
 
 double PairSpinExchangeBiquadratic::init_one(int i, int j)
 {
-
-   if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
+  if (setflag[i][j] == 0) error->all(FLERR,"All pair coeffs are not set");
 
   J1_mag[j][i] = J1_mag[i][j];
   J1_mech[j][i] = J1_mech[i][j];

From aadc66877120af9fac50d6e647bbae3dd72e9525 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Thu, 12 Nov 2020 10:58:59 -0500
Subject: [PATCH 62/64] Fix pair_spin_exchange doc page title

---
 doc/src/pair_spin_exchange.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/doc/src/pair_spin_exchange.rst b/doc/src/pair_spin_exchange.rst
index 9e6e534280..630ec6608e 100644
--- a/doc/src/pair_spin_exchange.rst
+++ b/doc/src/pair_spin_exchange.rst
@@ -5,7 +5,7 @@ pair_style spin/exchange command
 ================================
 
 pair_style spin/exchange/biquadratic command
-================================
+============================================
 
 Syntax
 """"""
@@ -25,6 +25,7 @@ Examples
    pair_style spin/exchange 4.0
    pair_coeff * * exchange 4.0 0.0446928 0.003496 1.4885
    pair_coeff 1 2 exchange 6.0 -0.01575 0.0 1.965 offset yes
+
    pair_style spin/exchange/biquadratic 4.0
    pair_coeff * * biquadratic 4.0 0.05 0.03 1.48 0.05 0.03 1.48 offset no
    pair_coeff 1 2 biquadratic 6.0 -0.01 0.0 1.9 0.0 0.1 19

From 497f0dd59358093e11e157e960a6d238ae02df37 Mon Sep 17 00:00:00 2001
From: julient31 <julien.tranchida1@gmail.com>
Date: Thu, 12 Nov 2020 09:43:38 -0700
Subject: [PATCH 63/64] Removing binder and m2,m4 declarations from
 compute/spin

---
 src/SPIN/compute_spin.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/SPIN/compute_spin.cpp b/src/SPIN/compute_spin.cpp
index 3e4970a62b..c92f24f1ae 100644
--- a/src/SPIN/compute_spin.cpp
+++ b/src/SPIN/compute_spin.cpp
@@ -148,12 +148,10 @@ void ComputeSpin::compute_vector()
   int i;
   int countsp, countsptot;
   double mag[4], magtot[4];
-  double m2, m2tot;
-  double m4, m4tot;
   double magenergy, magenergytot;
   double tempnum, tempnumtot;
   double tempdenom, tempdenomtot;
-  double spintemperature,binder;
+  double spintemperature;
 
   invoked_vector = update->ntimestep;
 

From a48f463faf26cdf9af2f4af589f05138ea30b46a Mon Sep 17 00:00:00 2001
From: Stan Gerald Moore <stamoor@sandia.gov>
Date: Fri, 13 Nov 2020 13:12:50 -0700
Subject: [PATCH 64/64] Fix memory bug in Kokkos KISS FFT

---
 src/KOKKOS/fft3d_kokkos.cpp | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/fft3d_kokkos.cpp b/src/KOKKOS/fft3d_kokkos.cpp
index 02f55e11fa..04a5512cc7 100644
--- a/src/KOKKOS/fft3d_kokkos.cpp
+++ b/src/KOKKOS/fft3d_kokkos.cpp
@@ -228,7 +228,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
     cufftExec(plan->plan_fast,d_data.data(),d_data.data(),flag);
   #else
     typename FFT_AT::t_FFT_DATA_1d d_tmp =
-     typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.extent(0));
+     typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_data.extent(0));
     kiss_fft_functor<DeviceType> f;
     if (flag == -1)
       f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length);
@@ -236,7 +236,6 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
       f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_backward,length);
     Kokkos::parallel_for(total/length,f);
     d_data = d_tmp;
-    d_tmp = typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.extent(0));
   #endif
 
 
@@ -273,13 +272,13 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
   #elif defined(FFT_CUFFT)
     cufftExec(plan->plan_mid,d_data.data(),d_data.data(),flag);
   #else
+    d_tmp = typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_data.extent(0));
     if (flag == -1)
       f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_forward,length);
     else
       f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_mid_backward,length);
     Kokkos::parallel_for(total/length,f);
     d_data = d_tmp;
-    d_tmp = typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_in.extent(0));
   #endif
 
   // 2nd mid-remap to prepare for 3rd FFTs
@@ -315,6 +314,7 @@ void FFT3dKokkos<DeviceType>::fft_3d_kokkos(typename FFT_AT::t_FFT_DATA_1d d_in,
   #elif defined(FFT_CUFFT)
     cufftExec(plan->plan_slow,d_data.data(),d_data.data(),flag);
   #else
+    d_tmp = typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_data.extent(0));
     if (flag == -1)
       f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_slow_forward,length);
     else
@@ -866,7 +866,8 @@ void FFT3dKokkos<DeviceType>::fft_3d_1d_only_kokkos(typename FFT_AT::t_FFT_DATA_
   cufftExec(plan->plan_slow,d_data.data(),d_data.data(),flag);
 #else
   kiss_fft_functor<DeviceType> f;
-  typename FFT_AT::t_FFT_DATA_1d d_tmp = typename FFT_AT::t_FFT_DATA_1d("fft_3d:tmp",d_data.extent(0));
+    typename FFT_AT::t_FFT_DATA_1d d_tmp =
+     typename FFT_AT::t_FFT_DATA_1d(Kokkos::view_alloc("fft_3d:tmp",Kokkos::WithoutInitializing),d_data.extent(0));
   if (flag == -1) {
     f = kiss_fft_functor<DeviceType>(d_data,d_tmp,plan->cfg_fast_forward,length1);
     Kokkos::parallel_for(total1/length1,f);