From 8fef6a10dd7d97af95787bd48dc40bff4d2013b8 Mon Sep 17 00:00:00 2001
From: Vsevak <thevsevak@gmail.com>
Date: Fri, 18 Jun 2021 00:52:23 +0300
Subject: [PATCH 01/15] Fix atom types handling in the tip4p/gpu kernels

---
 lib/gpu/lal_lj_tip4p_long.cu | 38 +++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu
index 782ae43662..ba552e9df8 100644
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@@ -140,7 +140,8 @@ __kernel void k_lj_tip4p_long_distrib(const __global numtyp4 *restrict x_,
           engv[inum*engv_iter + i] += vM.z * (acctyp)0.5 * alpha;
         }
       }
-    } else {
+    }
+    if (itype == typeO) {
       fM = ansO[i];
       int iH1 = hneigh[i*4  ];
       int iH2 = hneigh[i*4+1];
@@ -202,7 +203,8 @@ __kernel void k_lj_tip4p_reneigh(const __global numtyp4 *restrict x_,
         hneigh[i*4+1] = iH2;
         hneigh[i*4+2] = -1;
       }
-    } else {
+    }
+    if (itype == typeH) {
       if (hneigh[i*4+2] != -1) {
         int iI, iH;
         iI = atom_mapping(map,tag[i] - 1);
@@ -301,12 +303,13 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
     int non_local_oxy = 0;
     int iH1, iH2, iO;
 
-    if(itype == typeO) {
+    if (itype == typeO) {
       iO  = i;
       iH1 = hneigh[i*4  ];
       iH2 = hneigh[i*4+1];
       x1 = m[iO];
-    } else {
+    }
+    if (itype == typeH) {
       iO  = hneigh[i *4  ];
       iH1 = hneigh[iO*4  ];
       iH2 = hneigh[iO*4+1];
@@ -390,7 +393,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
             f.y += dely * force_coul;
             f.z += delz * force_coul;
             f.w += 0;
-          } else {
+          }
+          if (itype == typeO) {
             fO.x += delx * force_coul;
             fO.y += dely * force_coul;
             fO.z += delz * force_coul;
@@ -412,7 +416,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
                 virial[3] += delx*fd.y;
                 virial[4] += delx*fd.z;
                 virial[5] += dely*fd.z;
-              } else {
+              }
+              if (jtype == typeO) {
                 numtyp cO = 1 - alpha, cH = 0.5*alpha;
                 numtyp4 vdj;
                 numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
@@ -429,7 +434,8 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
                 virial[4] += (ix.x - vdj.x)*fd.z;
                 virial[5] += (ix.y - vdj.y)*fd.z;
               }
-            } else {
+            }
+            if (itype == typeO) {
               numtyp cO = 1 - alpha, cH = 0.5*alpha;
               numtyp4 vdi, vdj;
               numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
@@ -439,7 +445,7 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
               vdi.y = xO.y*cO + xH1.y*cH + xH2.y*cH;
               vdi.z = xO.z*cO + xH1.z*cH + xH2.z*cH;
               //vdi.w = vdi.w;
-              if (jtype != typeH) {
+              if (jtype == typeO) {
                 numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
                 numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
                 numtyp4 xjO; fetch4(xjO,jO,pos_tex);
@@ -625,12 +631,13 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
     int non_local_oxy = 0;
     int iH1, iH2, iO;
 
-    if(itype == typeO) {
+    if (itype == typeO) {
       iO  = i;
       iH1 = hneigh[i*4  ];
       iH2 = hneigh[i*4+1];
       x1 = m[iO];
-    } else {
+    }
+    if (itype == typeH) {
       iO  = hneigh[i *4  ];
       iH1 = hneigh[iO*4  ];
       iH2 = hneigh[iO*4+1];
@@ -714,7 +721,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
             f.y += dely * force_coul;
             f.z += delz * force_coul;
             f.w += 0;
-          } else {
+          }
+          if (itype == typeO) {
             fO.x += delx * force_coul;
             fO.y += dely * force_coul;
             fO.z += delz * force_coul;
@@ -736,7 +744,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
                 virial[3] += delx*fd.y;
                 virial[4] += delx*fd.z;
                 virial[5] += dely*fd.z;
-              } else {
+              }
+              if (jtype == typeO) {
                 numtyp cO = 1 - alpha, cH = 0.5*alpha;
                 numtyp4 vdj;
                 numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
@@ -753,7 +762,8 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
                 virial[4] += (ix.x - vdj.x)*fd.z;
                 virial[5] += (ix.y - vdj.y)*fd.z;
               }
-            } else {
+            }
+            if (itype == typeO) {
               numtyp cO = 1 - alpha, cH = 0.5*alpha;
               numtyp4 vdi, vdj;
               numtyp4 xH1; fetch4(xH1,iH1,pos_tex);
@@ -763,7 +773,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
               vdi.y = xO.y*cO + xH1.y*cH + xH2.y*cH;
               vdi.z = xO.z*cO + xH1.z*cH + xH2.z*cH;
               //vdi.w = vdi.w;
-              if (jtype != typeH) {
+              if (jtype == typeO) {
                 numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
                 numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
                 numtyp4 xjO; fetch4(xjO,jO,pos_tex);

From d982d153f870b412941ea50eda3771b7bcf75969 Mon Sep 17 00:00:00 2001
From: Vsevak <thevsevak@gmail.com>
Date: Fri, 18 Jun 2021 18:26:53 +0300
Subject: [PATCH 02/15] Fix conditions for correct results on other types

---
 lib/gpu/lal_lj_tip4p_long.cu | 128 +++++++++++++++++------------------
 1 file changed, 61 insertions(+), 67 deletions(-)

diff --git a/lib/gpu/lal_lj_tip4p_long.cu b/lib/gpu/lal_lj_tip4p_long.cu
index ba552e9df8..026f08b3f9 100644
--- a/lib/gpu/lal_lj_tip4p_long.cu
+++ b/lib/gpu/lal_lj_tip4p_long.cu
@@ -388,17 +388,16 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
           prefactor *= qqrd2e*qtmp/r;
           numtyp force_coul = r2inv*prefactor * (_erfc + EWALD_F*grij*expm2 - factor_coul);
 
-          if (itype == typeH) {
-            f.x += delx * force_coul;
-            f.y += dely * force_coul;
-            f.z += delz * force_coul;
-            f.w += 0;
-          }
           if (itype == typeO) {
             fO.x += delx * force_coul;
             fO.y += dely * force_coul;
             fO.z += delz * force_coul;
             fO.w += 0;
+          } else {
+            f.x += delx * force_coul;
+            f.y += dely * force_coul;
+            f.z += delz * force_coul;
+            f.w += 0;
           }
           if (eflag>0) {
             e_coul += prefactor*(_erfc-factor_coul);
@@ -408,33 +407,6 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
             fd.x = delx*force_coul;
             fd.y = dely*force_coul;
             fd.z = delz*force_coul;
-            if (itype == typeH) {
-              if (jtype == typeH) {
-                virial[0] += delx*fd.x;
-                virial[1] += dely*fd.y;
-                virial[2] += delz*fd.z;
-                virial[3] += delx*fd.y;
-                virial[4] += delx*fd.z;
-                virial[5] += dely*fd.z;
-              }
-              if (jtype == typeO) {
-                numtyp cO = 1 - alpha, cH = 0.5*alpha;
-                numtyp4 vdj;
-                numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
-                numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
-                numtyp4 xjO; fetch4(xjO,jO,pos_tex);
-                vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
-                vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
-                vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
-                //vdj.w = vdj.w;
-                virial[0] += (ix.x - vdj.x)*fd.x;
-                virial[1] += (ix.y - vdj.y)*fd.y;
-                virial[2] += (ix.z - vdj.z)*fd.z;
-                virial[3] += (ix.x - vdj.x)*fd.y;
-                virial[4] += (ix.x - vdj.x)*fd.z;
-                virial[5] += (ix.y - vdj.y)*fd.z;
-              }
-            }
             if (itype == typeO) {
               numtyp cO = 1 - alpha, cH = 0.5*alpha;
               numtyp4 vdi, vdj;
@@ -460,6 +432,31 @@ __kernel void k_lj_tip4p_long(const __global numtyp4 *restrict x_,
               vO[3] += 0.5*(vdi.x - vdj.x)*fd.y;
               vO[4] += 0.5*(vdi.x - vdj.x)*fd.z;
               vO[5] += 0.5*(vdi.y - vdj.y)*fd.z;
+            } else {
+              if (jtype == typeO) {
+                numtyp cO = 1 - alpha, cH = 0.5*alpha;
+                numtyp4 vdj;
+                numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
+                numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
+                numtyp4 xjO; fetch4(xjO,jO,pos_tex);
+                vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
+                vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
+                vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
+                //vdj.w = vdj.w;
+                virial[0] += (ix.x - vdj.x)*fd.x;
+                virial[1] += (ix.y - vdj.y)*fd.y;
+                virial[2] += (ix.z - vdj.z)*fd.z;
+                virial[3] += (ix.x - vdj.x)*fd.y;
+                virial[4] += (ix.x - vdj.x)*fd.z;
+                virial[5] += (ix.y - vdj.y)*fd.z;
+              } else {
+                virial[0] += delx*fd.x;
+                virial[1] += dely*fd.y;
+                virial[2] += delz*fd.z;
+                virial[3] += delx*fd.y;
+                virial[4] += delx*fd.z;
+                virial[5] += dely*fd.z;
+              }
             }
           }
         }
@@ -617,7 +614,7 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
   }
 
   __syncthreads();
-  if (ii<inum) {
+   if (ii<inum) {
     int i, numj, nbor, nbor_end;
     __local int n_stride;
     nbor_info(dev_nbor,dev_packed,nbor_pitch,t_per_atom,ii,offset,i,numj,
@@ -716,17 +713,16 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
           prefactor *= qqrd2e*qtmp/r;
           numtyp force_coul = r2inv*prefactor * (_erfc + EWALD_F*grij*expm2 - factor_coul);
 
-          if (itype == typeH) {
-            f.x += delx * force_coul;
-            f.y += dely * force_coul;
-            f.z += delz * force_coul;
-            f.w += 0;
-          }
           if (itype == typeO) {
             fO.x += delx * force_coul;
             fO.y += dely * force_coul;
             fO.z += delz * force_coul;
             fO.w += 0;
+          } else {
+            f.x += delx * force_coul;
+            f.y += dely * force_coul;
+            f.z += delz * force_coul;
+            f.w += 0;
           }
           if (eflag>0) {
             e_coul += prefactor*(_erfc-factor_coul);
@@ -736,33 +732,6 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
             fd.x = delx*force_coul;
             fd.y = dely*force_coul;
             fd.z = delz*force_coul;
-            if (itype == typeH) {
-              if (jtype == typeH) {
-                virial[0] += delx*fd.x;
-                virial[1] += dely*fd.y;
-                virial[2] += delz*fd.z;
-                virial[3] += delx*fd.y;
-                virial[4] += delx*fd.z;
-                virial[5] += dely*fd.z;
-              }
-              if (jtype == typeO) {
-                numtyp cO = 1 - alpha, cH = 0.5*alpha;
-                numtyp4 vdj;
-                numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
-                numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
-                numtyp4 xjO; fetch4(xjO,jO,pos_tex);
-                vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
-                vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
-                vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
-                //vdj.w = vdj.w;
-                virial[0] += (ix.x - vdj.x)*fd.x;
-                virial[1] += (ix.y - vdj.y)*fd.y;
-                virial[2] += (ix.z - vdj.z)*fd.z;
-                virial[3] += (ix.x - vdj.x)*fd.y;
-                virial[4] += (ix.x - vdj.x)*fd.z;
-                virial[5] += (ix.y - vdj.y)*fd.z;
-              }
-            }
             if (itype == typeO) {
               numtyp cO = 1 - alpha, cH = 0.5*alpha;
               numtyp4 vdi, vdj;
@@ -788,6 +757,31 @@ __kernel void k_lj_tip4p_long_fast(const __global numtyp4 *restrict x_,
               vO[3] += 0.5*(vdi.x - vdj.x)*fd.y;
               vO[4] += 0.5*(vdi.x - vdj.x)*fd.z;
               vO[5] += 0.5*(vdi.y - vdj.y)*fd.z;
+            } else {
+              if (jtype == typeO) {
+                numtyp cO = 1 - alpha, cH = 0.5*alpha;
+                numtyp4 vdj;
+                numtyp4 xjH1; fetch4(xjH1,jH1,pos_tex);
+                numtyp4 xjH2; fetch4(xjH2,jH2,pos_tex);
+                numtyp4 xjO; fetch4(xjO,jO,pos_tex);
+                vdj.x = xjO.x*cO + xjH1.x*cH + xjH2.x*cH;
+                vdj.y = xjO.y*cO + xjH1.y*cH + xjH2.y*cH;
+                vdj.z = xjO.z*cO + xjH1.z*cH + xjH2.z*cH;
+                //vdj.w = vdj.w;
+                virial[0] += (ix.x - vdj.x)*fd.x;
+                virial[1] += (ix.y - vdj.y)*fd.y;
+                virial[2] += (ix.z - vdj.z)*fd.z;
+                virial[3] += (ix.x - vdj.x)*fd.y;
+                virial[4] += (ix.x - vdj.x)*fd.z;
+                virial[5] += (ix.y - vdj.y)*fd.z;
+              } else {
+                virial[0] += delx*fd.x;
+                virial[1] += dely*fd.y;
+                virial[2] += delz*fd.z;
+                virial[3] += delx*fd.y;
+                virial[4] += delx*fd.z;
+                virial[5] += dely*fd.z;
+              }
             }
           }
         }

From cfd9cf625dd90697d9b7e40623f123d032683690 Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Fri, 25 Jun 2021 09:47:54 +1000
Subject: [PATCH 03/15] Initial draft of Kokkos acclerated PPPM routines for
 triclinic cells.

---
 src/KOKKOS/pppm_kokkos.cpp | 553 +++++++++++++++++++++++--------------
 src/KOKKOS/pppm_kokkos.h   |  41 ++-
 2 files changed, 381 insertions(+), 213 deletions(-)

diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index dbda29d807..98b6492e48 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -1,7 +1,6 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
+   https://lammps.sandia.gov/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -31,6 +30,7 @@
 #include "memory_kokkos.h"
 #include "pair.h"
 #include "remap_kokkos.h"
+#include "kokkos_few.h"
 
 #include <cmath>
 
@@ -67,7 +67,7 @@ PPPMKokkos<DeviceType>::PPPMKokkos(LAMMPS *lmp) : PPPM(lmp)
 
   pppmflag = 1;
   group_group_enable = 0;
-  triclinic_support = 0;
+  triclinic_support = 1;
 
   nfactors = 3;
   //factors = new int[nfactors];
@@ -455,80 +455,92 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup4, const int &n) const
 template<class DeviceType>
 void PPPMKokkos<DeviceType>::setup_triclinic()
 {
-//  int i,j,k,n;
-//  double *prd;
-//
-//  // volume-dependent factors
-//  // adjust z dimension for 2d slab PPPM
-//  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
-//
-//  prd = domain->prd;
-//
-//  double xprd = prd[0];
-//  double yprd = prd[1];
-//  double zprd = prd[2];
-//  double zprd_slab = zprd*slab_volfactor;
-//  volume = xprd * yprd * zprd_slab;
-//
-//  // use lamda (0-1) coordinates
-//
-//  delxinv = nx_pppm;
-//  delyinv = ny_pppm;
-//  delzinv = nz_pppm;
-//  delvolinv = delxinv*delyinv*delzinv/volume;
-//
-//  // d_fkx,d_fky,d_fkz for my FFT grid pts
-//
-//  double per_i,per_j,per_k;
-//
-//  n = 0;
-//  for (k = nzlo_fft; k <= nzhi_fft; k++) { // parallel_for
-//    per_k = k - nz_pppm*(2*k/nz_pppm);
-//    for (j = nylo_fft; j <= nyhi_fft; j++) {
-//      per_j = j - ny_pppm*(2*j/ny_pppm);
-//      for (i = nxlo_fft; i <= nxhi_fft; i++) {
-//        per_i = i - nx_pppm*(2*i/nx_pppm);
-//
-//        double unitk_lamda[3];
-//        unitk_lamda[0] = 2.0*MY_PI*per_i;
-//        unitk_lamda[1] = 2.0*MY_PI*per_j;
-//        unitk_lamda[2] = 2.0*MY_PI*per_k;
-//        x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
-//        d_fkx[n] = unitk_lamda[0];
-//        d_fky[n] = unitk_lamda[1];
-//        d_fkz[n] = unitk_lamda[2];
-//        n++;
-//      }
-//    }
-//  }
-//
-//  // virial coefficients
-//
-//  double sqk,vterm;
-//
-//  for (n = 0; n < nfft; n++) { // parallel_for
-//    sqk = d_fkx[n]*d_fkx[n] + d_fky[n]*d_fky[n] + d_fkz[n]*d_fkz[n];
-//    if (sqk == 0.0) {
-//      d_vg(n,0) = 0.0;
-//      d_vg(n,1) = 0.0;
-//      d_vg(n,2) = 0.0;
-//      d_vg(n,3) = 0.0;
-//      d_vg(n,4) = 0.0;
-//      d_vg(n,5) = 0.0;
-//    } else {
-//      vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
-//      d_vg(n,0) = 1.0 + vterm*d_fkx[n]*d_fkx[n];
-//      d_vg(n,1) = 1.0 + vterm*d_fky[n]*d_fky[n];
-//      d_vg(n,2) = 1.0 + vterm*d_fkz[n]*d_fkz[n];
-//      d_vg(n,3) = vterm*d_fkx[n]*d_fky[n];
-//      d_vg(n,4) = vterm*d_fkx[n]*d_fkz[n];
-//      d_vg(n,5) = vterm*d_fky[n]*d_fkz[n];
-//    }
-//  }
-//
-//  compute_gf_ik_triclinic();
+  int i,j,k,n;
+  double *prd;
+
+  // volume-dependent factors
+  // adjust z dimension for 2d slab PPPM
+  // z dimension for 3d PPPM is zprd since slab_volfactor = 1.0
+
+  prd = domain->prd;
+  // Update simulation box parameters
+  h = Few<double, 6>(domain->h);
+  h_inv = Few<double, 6>(domain->h_inv);
+
+  double xprd = prd[0];
+  double yprd = prd[1];
+  double zprd = prd[2];
+  double zprd_slab = zprd*slab_volfactor;
+  volume = xprd * yprd * zprd_slab;
+
+  // use lamda (0-1) coordinates
+
+  delxinv = nx_pppm;
+  delyinv = ny_pppm;
+  delzinv = nz_pppm;
+  delvolinv = delxinv*delyinv*delzinv/volume;
+
+  // merge three outer loops into one for better threading
+  numz_fft = nzhi_fft-nzlo_fft + 1;
+  numy_fft = nyhi_fft-nylo_fft + 1;
+  numx_fft = nxhi_fft-nxlo_fft + 1;
+  const int inum_fft = numx_fft*numy_fft*numz_fft;
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup_triclinic1>(0,inum_fft),*this);
+  copymode = 0;
+
+  // virial coefficients
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup_triclinic2>(0,nfft),*this);
+  copymode = 0;
+
+  compute_gf_ik_triclinic();
 }
 
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic1, const int &n) const
+{
+  const int k = n/(numy_fft*numx_fft);
+  const int j = (n - k*numy_fft*numx_fft) / numx_fft;
+  const int i = n - k*numy_fft*numx_fft - j*numx_fft;
+  double per_k = k - nz_pppm*(2*k/nz_pppm);
+  double per_j = j - ny_pppm*(2*j/ny_pppm);
+  double per_i = i - nx_pppm*(2*i/nx_pppm);
+
+  double unitk_lamda[3];
+  unitk_lamda[0] = 2.0*MY_PI*per_i;
+  unitk_lamda[1] = 2.0*MY_PI*per_j;
+  unitk_lamda[2] = 2.0*MY_PI*per_k;
+  x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+  d_fkx[n] = unitk_lamda[0];
+  d_fky[n] = unitk_lamda[1];
+  d_fkz[n] = unitk_lamda[2];
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic2, const int &n) const
+{
+    const double sqk = d_fkx[n]*d_fkx[n] + d_fky[n]*d_fky[n] + d_fkz[n]*d_fkz[n];
+    if (sqk == 0.0) {
+      d_vg(n,0) = 0.0;
+      d_vg(n,1) = 0.0;
+      d_vg(n,2) = 0.0;
+      d_vg(n,3) = 0.0;
+      d_vg(n,4) = 0.0;
+      d_vg(n,5) = 0.0;
+    } else {
+      const double vterm = -2.0 * (1.0/sqk + 0.25/(g_ewald*g_ewald));
+      d_vg(n,0) = 1.0 + vterm*d_fkx[n]*d_fkx[n];
+      d_vg(n,1) = 1.0 + vterm*d_fky[n]*d_fky[n];
+      d_vg(n,2) = 1.0 + vterm*d_fkz[n]*d_fkz[n];
+      d_vg(n,3) = vterm*d_fkx[n]*d_fky[n];
+      d_vg(n,4) = vterm*d_fkx[n]*d_fkz[n];
+      d_vg(n,5) = vterm*d_fky[n]*d_fkz[n];
+    }
+}
 /* ----------------------------------------------------------------------
    reset local grid arrays and communication stencils
    called by fix balance b/c it changed sizes of processor sub-domains
@@ -1004,7 +1016,7 @@ void PPPMKokkos<DeviceType>::set_grid_global()
     tmp[0] = nx_pppm;
     tmp[1] = ny_pppm;
     tmp[2] = nz_pppm;
-    x2lamdaT(&tmp[0],&tmp[0]);
+    KSpace::x2lamdaT(&tmp[0],&tmp[0]);
     h_x = 1.0/tmp[0];
     h_y = 1.0/tmp[1];
     h_z = 1.0/tmp[2];
@@ -1311,6 +1323,10 @@ void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic()
   nby = static_cast<int> (tmp[1]);
   nbz = static_cast<int> (tmp[2]);
 
+  // Update the local copy of the domain box tilt 
+  h = Few<double, 6>(domain->h);
+  h_inv = Few<double, 6>(domain->h_inv);
+
   twoorder = 2*order;
 
   copymode = 1;
@@ -1320,71 +1336,71 @@ void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic()
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &/*m*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &m) const
 {
-  //int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft);
-  //
-  //const int mper = m - nz_pppm*(2*m/nz_pppm);
-  //const double snz = square(sin(MY_PI*mper/nz_pppm));
-  //
-  //for (int l = nylo_fft; l <= nyhi_fft; l++) {
-  //  const int lper = l - ny_pppm*(2*l/ny_pppm);
-  //  const double sny = square(sin(MY_PI*lper/ny_pppm));
-  //
-  //  for (int k = nxlo_fft; k <= nxhi_fft; k++) {
-  //    const int kper = k - nx_pppm*(2*k/nx_pppm);
-  //    const double snx = square(sin(MY_PI*kper/nx_pppm));
-  //
-  //    double unitk_lamda[3];
-  //    unitk_lamda[0] = 2.0*MY_PI*kper;
-  //    unitk_lamda[1] = 2.0*MY_PI*lper;
-  //    unitk_lamda[2] = 2.0*MY_PI*mper;
-  //    x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
-  //
-  //    const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
-  //
-  //    if (sqk != 0.0) {
-  //      const double numerator = 12.5663706/sqk;
-  //      const double denominator = gf_denom(snx,sny,snz);
-  //      double sum1 = 0.0;
-  //
-  //      for (int nx = -nbx; nx <= nbx; nx++) {
-  //        const double argx = MY_PI*kper/nx_pppm + MY_PI*nx;
-  //        const double wx = powsinxx(argx,twoorder);
-  //
-  //        for (int ny = -nby; ny <= nby; ny++) {
-  //          const double argy = MY_PI*lper/ny_pppm + MY_PI*ny;
-  //          const double wy = powsinxx(argy,twoorder);
-  //
-  //          for (int nz = -nbz; nz <= nbz; nz++) {
-  //            const double argz = MY_PI*mper/nz_pppm + MY_PI*nz;
-  //            const double wz = powsinxx(argz,twoorder);
-  //
-  //            double b[3];
-  //            b[0] = 2.0*MY_PI*nx_pppm*nx;
-  //            b[1] = 2.0*MY_PI*ny_pppm*ny;
-  //            b[2] = 2.0*MY_PI*nz_pppm*nz;
-  //            x2lamdaT(&b[0],&b[0]);
-  //
-  //            const double qx = unitk_lamda[0]+b[0];
-  //            const double sx = exp(-0.25*square(qx/g_ewald));
-  //
-  //            const double qy = unitk_lamda[1]+b[1];
-  //            const double sy = exp(-0.25*square(qy/g_ewald));
-  //
-  //            const double qz = unitk_lamda[2]+b[2];
-  //            const double sz = exp(-0.25*square(qz/g_ewald));
-  //
-  //            const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
-  //            const double dot2 = qx*qx+qy*qy+qz*qz;
-  //            sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
-  //          }
-  //        }
-  //      }
-  //      d_greensfn[n++] = numerator*sum1/denominator;
-  //    } else d_greensfn[n++] = 0.0;
-  //  }
-  //}
+  int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft);
+  
+  const int mper = m - nz_pppm*(2*m/nz_pppm);
+  const double snz = square(sin(MY_PI*mper/nz_pppm));
+  
+  for (int l = nylo_fft; l <= nyhi_fft; l++) {
+    const int lper = l - ny_pppm*(2*l/ny_pppm);
+    const double sny = square(sin(MY_PI*lper/ny_pppm));
+  
+    for (int k = nxlo_fft; k <= nxhi_fft; k++) {
+      const int kper = k - nx_pppm*(2*k/nx_pppm);
+      const double snx = square(sin(MY_PI*kper/nx_pppm));
+  
+      double unitk_lamda[3];
+      unitk_lamda[0] = 2.0*MY_PI*kper;
+      unitk_lamda[1] = 2.0*MY_PI*lper;
+      unitk_lamda[2] = 2.0*MY_PI*mper;
+      x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
+  
+      const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
+  
+      if (sqk != 0.0) {
+        const double numerator = 12.5663706/sqk;
+        const double denominator = gf_denom(snx,sny,snz);
+        double sum1 = 0.0;
+  
+        for (int nx = -nbx; nx <= nbx; nx++) {
+          const double argx = MY_PI*kper/nx_pppm + MY_PI*nx;
+          const double wx = powsinxx(argx,twoorder);
+  
+          for (int ny = -nby; ny <= nby; ny++) {
+            const double argy = MY_PI*lper/ny_pppm + MY_PI*ny;
+            const double wy = powsinxx(argy,twoorder);
+  
+            for (int nz = -nbz; nz <= nbz; nz++) {
+              const double argz = MY_PI*mper/nz_pppm + MY_PI*nz;
+              const double wz = powsinxx(argz,twoorder);
+  
+              double b[3];
+              b[0] = 2.0*MY_PI*nx_pppm*nx;
+              b[1] = 2.0*MY_PI*ny_pppm*ny;
+              b[2] = 2.0*MY_PI*nz_pppm*nz;
+              x2lamdaT(&b[0],&b[0]);
+  
+              const double qx = unitk_lamda[0]+b[0];
+              const double sx = exp(-0.25*square(qx/g_ewald));
+  
+              const double qy = unitk_lamda[1]+b[1];
+              const double sy = exp(-0.25*square(qy/g_ewald));
+  
+              const double qz = unitk_lamda[2]+b[2];
+              const double sz = exp(-0.25*square(qz/g_ewald));
+  
+              const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
+              const double dot2 = qx*qx+qy*qy+qz*qz;
+              sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
+            }
+          }
+        }
+        d_greensfn[n++] = numerator*sum1/denominator;
+      } else d_greensfn[n++] = 0.0;
+    }
+  }
 }
 
 /* ----------------------------------------------------------------------
@@ -1867,107 +1883,224 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik10, const int &ii) con
 template<class DeviceType>
 void PPPMKokkos<DeviceType>::poisson_ik_triclinic()
 {
-//  int i,j,k,n;
-//
-//  // compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
-//  // FFT leaves data in 3d brick decomposition
-//  // copy it into inner portion of vdx,vdy,vdz arrays
-//
-//  // x direction gradient
-//
-//  n = 0;
-//  for (i = 0; i < nfft; i++) { // parallel_for1
-//    d_work2[n] = -d_fkx[i]*d_work1[n+1];
-//    d_work2[n+1] = d_fkx[i]*d_work1[n];
-//    n += 2;
-//  }
-//
-//  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
-//
-//  n = 0;
-//  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2
-//
-//
-//  // y direction gradient
-//
-//  n = 0;
-//  for (i = 0; i < nfft; i++) { // parallel_for3
-//    d_work2[n] = -d_fky[i]*d_work1[n+1];
-//    d_work2[n+1] = d_fky[i]*d_work1[n];
-//    n += 2;
-//  }
-//
-//  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
-//
-//  n = 0;
-//  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4
-//    for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
-//      for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
-//        d_vdy_brick(k,j,i) = d_work2[n];
-//        n += 2;
-//      }
-//
-//  // z direction gradient
-//
-//  n = 0;
-//  for (i = 0; i < nfft; i++) { // parallel_for5
-//    d_work2[n] = -d_fkz[i]*d_work1[n+1];
-//    d_work2[n+1] = d_fkz[i]*d_work1[n];
-//    n += 2;
-//  }
-//
-//  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
-//
-//  n = 0;
-//  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6
-//
+/**************************************************
+  int i,j,k,n;
+
+  // compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of vdx,vdy,vdz arrays
+
+  // x direction gradient
+
+  n = 0;
+  for (i = 0; i < nfft; i++) { // parallel_for1
+    d_work2[n] = -d_fkx[i]*d_work1[n+1];
+    d_work2[n+1] = d_fkx[i]*d_work1[n];
+    n += 2;
+  }
+
+  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
+
+  n = 0;
+  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2
+
+
+  // y direction gradient
+
+  n = 0;
+  for (i = 0; i < nfft; i++) { // parallel_for3
+    d_work2[n] = -d_fky[i]*d_work1[n+1];
+    d_work2[n+1] = d_fky[i]*d_work1[n];
+    n += 2;
+  }
+
+  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
+
+  n = 0;
+  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4
+    for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
+      for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
+        d_vdy_brick(k,j,i) = d_work2[n];
+        n += 2;
+      }
+
+  // z direction gradient
+
+  n = 0;
+  for (i = 0; i < nfft; i++) { // parallel_for5
+    d_work2[n] = -d_fkz[i]*d_work1[n+1];
+    d_work2[n+1] = d_fkz[i]*d_work1[n];
+    n += 2;
+  }
+
+  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
+
+  n = 0;
+  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6
+******************************/
+  int i,j,k,n;
+
+  // compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
+  // FFT leaves data in 3d brick decomposition
+  // copy it into inner portion of vdx,vdy,vdz arrays
+
+  // x direction gradient
+
+  //n = 0;
+  //for (i = 0; i < nfft; i++) {
+  //  d_work2[n] = -d_fkx[i]*d_work1[n+1];
+  //  d_work2[n+1] = d_fkx[i]*d_work1[n];
+  //  n += 2;
+  //}
+
+  // merge three outer loops into one for better threading
+
+  numz_fft = nzhi_fft-nzlo_fft + 1;
+  numy_fft = nyhi_fft-nylo_fft + 1;
+  numx_fft = nxhi_fft-nxlo_fft + 1;
+  const int inum_fft = numz_fft*numy_fft*numx_fft;
+
+  numz_inout = (nzhi_in-nzlo_out)-(nzlo_in-nzlo_out) + 1;
+  numy_inout = (nyhi_in-nylo_out)-(nylo_in-nylo_out) + 1;
+  numx_inout = (nxhi_in-nxlo_out)-(nxlo_in-nxlo_out) + 1;
+  const int inum_inout = numz_inout*numy_inout*numx_inout;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic1>(0,nfft),*this);
+  copymode = 0;
+
+  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
+
+  //n = 0;
+  //for (k = nzlo_in; k <= nzhi_in; k++)
+  //  for (j = nylo_in; j <= nyhi_in; j++)
+  //    for (i = nxlo_in; i <= nxhi_in; i++) {
+  //      d_vdx_brick(k,j,i) = d_work2[n];
+  //      n += 2;
+  //    }
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic2>(0,inum_inout),*this);
+  copymode = 0;
+
+  // y direction gradient
+
+  //n = 0;
+  //for (i = 0; i < nfft; i++) {
+  //  d_work2[n] = -d_fky[i]*d_work1[n+1];
+  //  d_work2[n+1] = d_fky[i]*d_work1[n];
+  //  n += 2;
+  //}
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic3>(0,nfft),*this);
+  copymode = 0;
+
+  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
+
+  //n = 0;
+  //for (k = nzlo_in; k <= nzhi_in; k++)
+  //  for (j = nylo_in; j <= nyhi_in; j++)
+  //    for (i = nxlo_in; i <= nxhi_in; i++) {
+  //      d_vdy_brick(k,j,i) = d_work2[n];
+  //      n += 2;
+  //    }
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic4>(0,inum_inout),*this);
+  copymode = 0;
+
+  // z direction gradient
+
+  //n = 0;
+  //for (i = 0; i < nfft; i++) {
+  //  d_work2[n] = -d_fkz[i]*d_work1[n+1];
+  //  d_work2[n+1] = d_fkz[i]*d_work1[n];
+  //  n += 2;
+  //}
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic5>(0,nfft),*this);
+  copymode = 0;
+
+  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
+
+  //n = 0;
+  //for (k = nzlo_in; k <= nzhi_in; k++)
+  //  for (j = nylo_in; j <= nyhi_in; j++)
+  //    for (i = nxlo_in; i <= nxhi_in; i++) {
+  //      d_vdz_brick(k,j,i) = d_work2[n];
+  //      n += 2;
+  //    }
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic6>(0,inum_inout),*this);
+  copymode = 0;
+
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic1, const int &/*k*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic1, const int &ii) const
 {
-
+  d_work2[2*ii] = -d_fkx[ii]*d_work1[2*ii+1];
+  d_work2[2*ii+1] = d_fkx[ii]*d_work1[2*ii];
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic2, const int &/*k*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic2, const int &ii) const
 {
-//    for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
-//      for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
-//        d_vdx_brick(k,j,i) = d_work2[n];
-//        n += 2;
-//      }
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_vdx_brick(k,j,i) = d_work2[n];
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic3, const int &/*k*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic3, const int &ii) const
 {
 //  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
+  d_work2[2*ii] = -d_fky[ii]*d_work1[2*ii+1];
+  d_work2[2*ii+1] = d_fky[ii]*d_work1[2*ii];
 
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic4, const int &/*k*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic4, const int &ii) const
 {
 //  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
 //
+
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_vdy_brick(k,j,i) = d_work2[n];
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic5, const int &/*k*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic5, const int &ii) const
 {
 //  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
 //
+  d_work2[2*ii] = -d_fkz[ii]*d_work1[2*ii+1];
+  d_work2[2*ii+1] = d_fkz[ii]*d_work1[2*ii];
 }
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int &/*k*/) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int &ii) const
 {
 //  int n = (k - (nzlo_in-nzlo_out))*((nyhi_in-nylo_out) - (nylo_in-nylo_out) + 1)*((nxhi_in-nxlo_out) - (nxlo_in-nxlo_out) + 1)*2;
 //
@@ -1976,6 +2109,14 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik_triclinic6, const int
 //      d_vdz_brick(k,j,i) = d_work2[n];
 //      n += 2;
 //    }
+  const int n = ii*2;
+  int k = ii/(numy_inout*numx_inout);
+  int j = (ii - k*numy_inout*numx_inout) / numx_inout;
+  int i = ii - k*numy_inout*numx_inout - j*numx_inout;
+  k += nzlo_in-nzlo_out;
+  j += nylo_in-nylo_out;
+  i += nxlo_in-nxlo_out;
+  d_vdz_brick(k,j,i) = d_work2[n];
 }
 
 /* ----------------------------------------------------------------------
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
index 6b7b7f1f89..c92210b748 100644
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -1,7 +1,6 @@
-// clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://www.lammps.org/, Sandia National Laboratories
+   https://lammps.sandia.gov/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -13,11 +12,11 @@
 ------------------------------------------------------------------------- */
 
 #ifdef KSPACE_CLASS
-// clang-format off
-KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>);
-KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>);
-KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>);
-// clang-format on
+
+KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>)
+KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>)
+KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
+
 #else
 
 #ifndef LMP_PPPM_KOKKOS_H
@@ -29,6 +28,7 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>);
 #include "kokkos_base_fft.h"
 #include "fftdata_kokkos.h"
 #include "kokkos_type.h"
+#include "kokkos_few.h"
 
 // fix up FFT defines for KOKKOS with CUDA
 
@@ -55,6 +55,8 @@ struct TagPPPM_setup1{};
 struct TagPPPM_setup2{};
 struct TagPPPM_setup3{};
 struct TagPPPM_setup4{};
+struct TagPPPM_setup_triclinic1{};
+struct TagPPPM_setup_triclinic2{};
 struct TagPPPM_compute_gf_ik{};
 struct TagPPPM_compute_gf_ik_triclinic{};
 struct TagPPPM_self1{};
@@ -138,6 +140,12 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPPPM_setup4, const int&) const;
 
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_setup_triclinic1, const int&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagPPPM_setup_triclinic2, const int&) const;
+
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPPPM_compute_gf_ik, const int&) const;
 
@@ -309,6 +317,25 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
   int numx_out,numy_out,numz_out;
   int ix,iy,nlocal;
 
+  // Local copies of the domain box tilt etc. 
+  // TODO: Will need to put in a less
+  // hacky solution later, since this needs to be manually updated
+  Few<double,6> h, h_inv;
+
+  KOKKOS_INLINE_FUNCTION
+  void x2lamdaT(double* v, double* lamda) const
+  {
+    double lamda_tmp[3];                                                         
+                                                                                
+    lamda_tmp[0] = h_inv[0]*v[0];                                                
+    lamda_tmp[1] = h_inv[5]*v[0] + h_inv[1]*v[1];                                
+    lamda_tmp[2] = h_inv[4]*v[0] + h_inv[3]*v[1] + h_inv[2]*v[2];                
+                                                                                
+    lamda[0] = lamda_tmp[0];                                                     
+    lamda[1] = lamda_tmp[1];                                                     
+    lamda[2] = lamda_tmp[2];
+  }
+
   int nx,ny,nz;
   typename AT::t_int_1d_um d_list_index;
   typename FFT_AT::t_FFT_SCALAR_1d_um d_buf;

From e400e5b6f7be6530db7ca63ca4f422d6dc4e8fee Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Wed, 14 Jul 2021 14:46:54 +1000
Subject: [PATCH 04/15] Fixed bug in PPPMKokkos::setup_triclinic for MPI
 calculations.

This fix should probably be considered a temporary fix - it relies on a
3-dimensional Kokkos range which seems to be disfavoured in the rest of
LAMMPS' codebase.
---
 src/KOKKOS/pppm_kokkos.cpp | 122 +++----------------------------------
 src/KOKKOS/pppm_kokkos.h   |   3 +-
 2 files changed, 10 insertions(+), 115 deletions(-)

diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index 98b6492e48..3fe93f6825 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -455,7 +455,6 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup4, const int &n) const
 template<class DeviceType>
 void PPPMKokkos<DeviceType>::setup_triclinic()
 {
-  int i,j,k,n;
   double *prd;
 
   // volume-dependent factors
@@ -480,13 +479,9 @@ void PPPMKokkos<DeviceType>::setup_triclinic()
   delzinv = nz_pppm;
   delvolinv = delxinv*delyinv*delzinv/volume;
 
-  // merge three outer loops into one for better threading
-  numz_fft = nzhi_fft-nzlo_fft + 1;
-  numy_fft = nyhi_fft-nylo_fft + 1;
-  numx_fft = nxhi_fft-nxlo_fft + 1;
-  const int inum_fft = numx_fft*numy_fft*numz_fft;
   copymode = 1;
-  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup_triclinic1>(0,inum_fft),*this);
+  Kokkos::parallel_for(Kokkos::MDRangePolicy<Kokkos::Rank<3>, DeviceType, TagPPPM_setup_triclinic1>\
+      ({nzlo_fft, nylo_fft, nxlo_fft}, {nzhi_fft+1, nyhi_fft+1, nxhi_fft+1}),*this);
   copymode = 0;
 
   // virial coefficients
@@ -500,14 +495,14 @@ void PPPMKokkos<DeviceType>::setup_triclinic()
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic1, const int &n) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic1, const int &k, const int &j, const int& i) const
 {
-  const int k = n/(numy_fft*numx_fft);
-  const int j = (n - k*numy_fft*numx_fft) / numx_fft;
-  const int i = n - k*numy_fft*numx_fft - j*numx_fft;
   double per_k = k - nz_pppm*(2*k/nz_pppm);
   double per_j = j - ny_pppm*(2*j/ny_pppm);
   double per_i = i - nx_pppm*(2*i/nx_pppm);
+  // n corresponds to the "number" of this iteration if we were to execute the loop monotonically and in serial
+  int n = (nxhi_fft - nxlo_fft + 1)*(nyhi_fft - nylo_fft + 1)*(k - nzlo_fft)+
+          (nxhi_fft - nxlo_fft + 1)*(j - nylo_fft) + (i - nxlo_fft);
 
   double unitk_lamda[3];
   unitk_lamda[0] = 2.0*MY_PI*per_i;
@@ -1244,7 +1239,7 @@ void PPPMKokkos<DeviceType>::compute_gf_ik()
   numz_fft = nzhi_fft-nzlo_fft + 1;
   numy_fft = nyhi_fft-nylo_fft + 1;
   numx_fft = nxhi_fft-nxlo_fft + 1;
-  const int inum_fft = numz_fft*numy_fft*numx_fft;
+  const int inum_fft = numx_fft*numy_fft*numz_fft;
 
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_compute_gf_ik>(0,inum_fft),*this);
@@ -1883,76 +1878,12 @@ void PPPMKokkos<DeviceType>::operator()(TagPPPM_poisson_ik10, const int &ii) con
 template<class DeviceType>
 void PPPMKokkos<DeviceType>::poisson_ik_triclinic()
 {
-/**************************************************
-  int i,j,k,n;
-
   // compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
   // FFT leaves data in 3d brick decomposition
   // copy it into inner portion of vdx,vdy,vdz arrays
 
   // x direction gradient
 
-  n = 0;
-  for (i = 0; i < nfft; i++) { // parallel_for1
-    d_work2[n] = -d_fkx[i]*d_work1[n+1];
-    d_work2[n+1] = d_fkx[i]*d_work1[n];
-    n += 2;
-  }
-
-  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
-
-  n = 0;
-  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for2
-
-
-  // y direction gradient
-
-  n = 0;
-  for (i = 0; i < nfft; i++) { // parallel_for3
-    d_work2[n] = -d_fky[i]*d_work1[n+1];
-    d_work2[n+1] = d_fky[i]*d_work1[n];
-    n += 2;
-  }
-
-  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
-
-  n = 0;
-  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for4
-    for (j = nylo_in-nylo_out; j <= nyhi_in-nylo_out; j++)
-      for (i = nxlo_in-nxlo_out; i <= nxhi_in-nxlo_out; i++) {
-        d_vdy_brick(k,j,i) = d_work2[n];
-        n += 2;
-      }
-
-  // z direction gradient
-
-  n = 0;
-  for (i = 0; i < nfft; i++) { // parallel_for5
-    d_work2[n] = -d_fkz[i]*d_work1[n+1];
-    d_work2[n+1] = d_fkz[i]*d_work1[n];
-    n += 2;
-  }
-
-  fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
-
-  n = 0;
-  for (k = nzlo_in-nzlo_out; k <= nzhi_in-nzlo_out; k++) // parallel_for6
-******************************/
-  int i,j,k,n;
-
-  // compute gradients of V(r) in each of 3 dims by transforming ik*V(k)
-  // FFT leaves data in 3d brick decomposition
-  // copy it into inner portion of vdx,vdy,vdz arrays
-
-  // x direction gradient
-
-  //n = 0;
-  //for (i = 0; i < nfft; i++) {
-  //  d_work2[n] = -d_fkx[i]*d_work1[n+1];
-  //  d_work2[n+1] = d_fkx[i]*d_work1[n];
-  //  n += 2;
-  //}
-
   // merge three outer loops into one for better threading
 
   numz_fft = nzhi_fft-nzlo_fft + 1;
@@ -1971,68 +1902,30 @@ void PPPMKokkos<DeviceType>::poisson_ik_triclinic()
 
   fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
 
-  //n = 0;
-  //for (k = nzlo_in; k <= nzhi_in; k++)
-  //  for (j = nylo_in; j <= nyhi_in; j++)
-  //    for (i = nxlo_in; i <= nxhi_in; i++) {
-  //      d_vdx_brick(k,j,i) = d_work2[n];
-  //      n += 2;
-  //    }
-
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic2>(0,inum_inout),*this);
   copymode = 0;
 
   // y direction gradient
 
-  //n = 0;
-  //for (i = 0; i < nfft; i++) {
-  //  d_work2[n] = -d_fky[i]*d_work1[n+1];
-  //  d_work2[n+1] = d_fky[i]*d_work1[n];
-  //  n += 2;
-  //}
-
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic3>(0,nfft),*this);
   copymode = 0;
 
   fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
 
-  //n = 0;
-  //for (k = nzlo_in; k <= nzhi_in; k++)
-  //  for (j = nylo_in; j <= nyhi_in; j++)
-  //    for (i = nxlo_in; i <= nxhi_in; i++) {
-  //      d_vdy_brick(k,j,i) = d_work2[n];
-  //      n += 2;
-  //    }
-
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic4>(0,inum_inout),*this);
   copymode = 0;
 
   // z direction gradient
 
-  //n = 0;
-  //for (i = 0; i < nfft; i++) {
-  //  d_work2[n] = -d_fkz[i]*d_work1[n+1];
-  //  d_work2[n+1] = d_fkz[i]*d_work1[n];
-  //  n += 2;
-  //}
-
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic5>(0,nfft),*this);
   copymode = 0;
 
   fft2->compute(d_work2,d_work2,FFT3dKokkos<DeviceType>::BACKWARD);
 
-  //n = 0;
-  //for (k = nzlo_in; k <= nzhi_in; k++)
-  //  for (j = nylo_in; j <= nyhi_in; j++)
-  //    for (i = nxlo_in; i <= nxhi_in; i++) {
-  //      d_vdz_brick(k,j,i) = d_work2[n];
-  //      n += 2;
-  //    }
-
   copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_poisson_ik_triclinic6>(0,inum_inout),*this);
   copymode = 0;
@@ -3040,3 +2933,4 @@ template class PPPMKokkos<LMPDeviceType>;
 template class PPPMKokkos<LMPHostType>;
 #endif
 }
+
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
index c92210b748..e7c55b1726 100644
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -141,7 +141,7 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
   void operator()(TagPPPM_setup4, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPPPM_setup_triclinic1, const int&) const;
+  void operator()(TagPPPM_setup_triclinic1, const int&, const int&, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPPPM_setup_triclinic2, const int&) const;
@@ -623,3 +623,4 @@ accuracy.  This error should not occur for typical problems.  Please
 send an email to the developers.
 
 */
+

From e7ba4179a75ef9b05eababbe4ebf00d60f4fde0e Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Thu, 29 Jul 2021 09:53:08 +1000
Subject: [PATCH 05/15] Added Kokkos-enabled version of compute temp/deform.

---
 src/KOKKOS/compute_temp_deform_kokkos.cpp | 285 ++++++++++++++++++++++
 src/KOKKOS/compute_temp_deform_kokkos.h   | 127 ++++++++++
 src/KOKKOS/compute_temp_kokkos.h          |  23 +-
 src/compute_temp_deform.cpp               |   9 +-
 4 files changed, 430 insertions(+), 14 deletions(-)
 create mode 100644 src/KOKKOS/compute_temp_deform_kokkos.cpp
 create mode 100644 src/KOKKOS/compute_temp_deform_kokkos.h

diff --git a/src/KOKKOS/compute_temp_deform_kokkos.cpp b/src/KOKKOS/compute_temp_deform_kokkos.cpp
new file mode 100644
index 0000000000..6040504aeb
--- /dev/null
+++ b/src/KOKKOS/compute_temp_deform_kokkos.cpp
@@ -0,0 +1,285 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Emily Kahl (UQ)
+------------------------------------------------------------------------- */
+
+#include "compute_temp_deform_kokkos.h"
+
+#include "atom_kokkos.h"
+#include "atom_masks.h"
+#include "comm.h"
+#include "error.h"
+#include "force.h"
+#include "update.h"
+#include "memory_kokkos.h"
+
+#include <cstring>
+
+using namespace LAMMPS_NS;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+ComputeTempDeformKokkos<DeviceType>::ComputeTempDeformKokkos(LAMMPS *lmp, int narg, char **arg) :
+  ComputeTempDeform(lmp, narg, arg)
+{
+  kokkosable = 1;
+  atomKK = (AtomKokkos *) atom;
+  domainKK = (DomainKokkos *) domain;
+  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;
+
+  datamask_read = V_MASK | MASK_MASK | RMASS_MASK | TYPE_MASK;
+  datamask_modify = EMPTY_MASK;
+
+  maxbias = 0;
+}
+
+template<class DeviceType>
+ComputeTempDeformKokkos<DeviceType>::~ComputeTempDeformKokkos() 
+{
+
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+double ComputeTempDeformKokkos<DeviceType>::compute_scalar()
+{
+  atomKK->sync(execution_space,datamask_read);
+  atomKK->k_mass.sync<DeviceType>();
+
+  invoked_scalar = update->ntimestep;
+
+  v = atomKK->k_v.view<DeviceType>();
+  x = atomKK->k_x.view<DeviceType>();
+  if (atomKK->rmass)
+    rmass = atomKK->k_rmass.view<DeviceType>();
+  else
+    mass = atomKK->k_mass.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  int nlocal = atom->nlocal;
+
+  double t = 0.0;
+  CTEMP t_kk;
+
+  /**************** EVK ****************/
+  // Convert from box coords to lamda coords
+  domainKK->x2lamda(nlocal);
+  
+  copymode = 1;
+  if (atomKK->rmass)
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformScalar<1> >(0,nlocal),*this,t_kk);
+  else
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformScalar<0> >(0,nlocal),*this,t_kk);
+  copymode = 0;
+
+  /**************** EVK ****************/
+  // Convert back to box coords
+  domainKK->lamda2x(nlocal);
+
+  t = t_kk.t0; // could make this more efficient
+
+  MPI_Allreduce(&t,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  if (dynamic) dof_compute();
+  if (dof < 0.0 && natoms_temp > 0.0)
+    error->all(FLERR,"Temperature compute degrees of freedom < 0");
+  scalar *= tfactor;
+
+  return scalar;
+}
+
+template<class DeviceType>
+template<int RMASS>
+KOKKOS_INLINE_FUNCTION
+void ComputeTempDeformKokkos<DeviceType>::operator()(TagComputeTempDeformScalar<RMASS>, const int &i, CTEMP& t_kk) const {
+
+  double *h_rate = domainKK->h_rate;
+  double *h_ratelo = domainKK->h_ratelo;
+  double vstream[3],vthermal[3];
+
+  vstream[0] = h_rate[0]*x(i,0) + h_rate[5]*x(i,1) + h_rate[4]*x(i,2) + h_ratelo[0];
+  vstream[1] = h_rate[1]*x(i,1) + h_rate[3]*x(i,2) + h_ratelo[1];
+  vstream[2] = h_rate[2]*x(i,2) + h_ratelo[2];
+  vthermal[0] = v(i,0) - vstream[0];
+  vthermal[1] = v(i,1) - vstream[1];
+  vthermal[2] = v(i,2) - vstream[2];
+  if (RMASS) {
+    if (mask[i] & groupbit)
+      t_kk.t0 += (vthermal[0]*vthermal[0] + vthermal[1]*vthermal[1] + vthermal[2]*vthermal[2]) * rmass[i];
+  } else {
+    if (mask[i] & groupbit)
+      t_kk.t0 += (vthermal[0]*vthermal[0] + vthermal[1]*vthermal[1] + vthermal[2]*vthermal[2]) * mass[type[i]];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void ComputeTempDeformKokkos<DeviceType>::compute_vector()
+{
+  atomKK->sync(execution_space,datamask_read);
+
+  int i;
+
+  invoked_vector = update->ntimestep;
+
+  v = atomKK->k_v.view<DeviceType>();
+  x = atomKK->k_x.view<DeviceType>();
+  if (atomKK->rmass)
+    rmass = atomKK->k_rmass.view<DeviceType>();
+  else
+    mass = atomKK->k_mass.view<DeviceType>();
+  type = atomKK->k_type.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  int nlocal = atom->nlocal;
+
+  double t[6];
+  for (i = 0; i < 6; i++) t[i] = 0.0;
+  CTEMP t_kk;
+
+  /**************** EVK ****************/
+  // Convert from box coords to lamda coords
+  domainKK->x2lamda(nlocal);
+
+  copymode = 1;
+  if (atomKK->rmass)
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformVector<1> >(0,nlocal),*this,t_kk);
+  else
+    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformVector<0> >(0,nlocal),*this,t_kk);
+  copymode = 0;
+
+  /**************** EVK ****************/
+  // Convert back to box coords
+  domainKK->lamda2x(nlocal);
+
+  t[0] = t_kk.t0;
+  t[1] = t_kk.t1;
+  t[2] = t_kk.t2;
+  t[3] = t_kk.t3;
+  t[4] = t_kk.t4;
+  t[5] = t_kk.t5;
+
+  MPI_Allreduce(t,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  for (i = 0; i < 6; i++) vector[i] *= force->mvv2e;
+}
+
+template<class DeviceType>
+template<int RMASS>
+KOKKOS_INLINE_FUNCTION
+void ComputeTempDeformKokkos<DeviceType>::operator()(TagComputeTempDeformVector<RMASS>, const int &i, CTEMP& t_kk) const {
+
+  double *h_rate = domainKK->h_rate;
+  double *h_ratelo = domainKK->h_ratelo;
+  double vstream[3],vthermal[3];
+
+  vstream[0] = h_rate[0]*x(i,0) + h_rate[5]*x(i,1) + h_rate[4]*x(i,2) + h_ratelo[0];
+  vstream[1] = h_rate[1]*x(i,1) + h_rate[3]*x(i,2) + h_ratelo[1];
+  vstream[2] = h_rate[2]*x(i,2) + h_ratelo[2];
+  vthermal[0] = v(i,0) - vstream[0];
+  vthermal[1] = v(i,1) - vstream[1];
+  vthermal[2] = v(i,2) - vstream[2];
+
+  if (mask[i] & groupbit) {
+    F_FLOAT massone = 0.0;
+    if (RMASS) massone = rmass[i];
+    else massone = mass[type[i]];
+    t_kk.t0 += massone * vthermal[0]*vthermal[0];
+    t_kk.t1 += massone * vthermal[1]*vthermal[1];
+    t_kk.t2 += massone * vthermal[2]*vthermal[2];
+    t_kk.t3 += massone * vthermal[0]*vthermal[1];
+    t_kk.t4 += massone * vthermal[0]*vthermal[2];
+    t_kk.t5 += massone * vthermal[1]*vthermal[2];
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void ComputeTempDeformKokkos<DeviceType>::remove_bias_all()
+{
+  atomKK->sync(execution_space,datamask_read);
+  v = atomKK->k_v.view<DeviceType>();
+  x = atomKK->k_x.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  int nlocal = atom->nlocal;
+
+  if (atom->nmax > maxbias) {
+    //memoryKK->destroy_kokkos(vbiasall);
+    maxbias = atom->nmax;
+    //memoryKK->create_kokkos(vbiasall,maxbias,"temp/deform/kk:vbiasall");
+    vbiasall = typename ArrayTypes<DeviceType>::t_v_array("temp/deform/kk:vbiasall", maxbias);
+  }
+
+  /**************** EVK ****************/
+  // Convert from box coords to lamda coords
+  domainKK->x2lamda(nlocal);
+
+  h_rate = domain->h_rate;
+  h_ratelo = domain->h_ratelo;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformRemoveBias >(0,nlocal),*this);
+  copymode = 0;
+
+  /**************** EVK ****************/
+  // Convert back to box coords
+  domainKK->lamda2x(nlocal);
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeTempDeformKokkos<DeviceType>::operator()(TagComputeTempDeformRemoveBias, const int &i) const {
+  if (mask[i] & groupbit) {
+    vbiasall(i,0) = h_rate[0]*x(i,0) + h_rate[5]*x(i,1) + h_rate[4]*x(i,2) + h_ratelo[0];
+    vbiasall(i,1) = h_rate[1]*x(i,1) + h_rate[3]*x(i,2) + h_ratelo[1];
+    vbiasall(i,2) = h_rate[2]*x(i,2) + h_ratelo[2];
+    v(i,0) -= vbiasall(i,0);
+    v(i,1) -= vbiasall(i,1);
+    v(i,2) -= vbiasall(i,2);
+  }
+}
+
+/* ---------------------------------------------------------------------- */
+template<class DeviceType>
+void ComputeTempDeformKokkos<DeviceType>::restore_bias_all()
+{
+  atomKK->sync(execution_space,datamask_read);
+  v = atomKK->k_v.view<DeviceType>();
+  x = atomKK->k_x.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  int nlocal = atom->nlocal;
+
+  copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformRestoreBias >(0,nlocal),*this);
+  copymode = 0;
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void ComputeTempDeformKokkos<DeviceType>::operator()(TagComputeTempDeformRestoreBias, const int &i) const {
+  if (mask[i] & groupbit) {
+    v(i,0) += vbiasall(i,0);
+    v(i,1) += vbiasall(i,1);
+    v(i,2) += vbiasall(i,2);
+  }
+}
+
+namespace LAMMPS_NS {
+template class ComputeTempDeformKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class ComputeTempDeformKokkos<LMPHostType>;
+#endif
+}
diff --git a/src/KOKKOS/compute_temp_deform_kokkos.h b/src/KOKKOS/compute_temp_deform_kokkos.h
new file mode 100644
index 0000000000..4ff6d59674
--- /dev/null
+++ b/src/KOKKOS/compute_temp_deform_kokkos.h
@@ -0,0 +1,127 @@
+// clang-format off
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef COMPUTE_CLASS
+// clang-format off
+ComputeStyle(temp/deform/kk,ComputeTempDeformKokkos<LMPDeviceType>);
+ComputeStyle(temp/deform/kk/device,ComputeTempDeformKokkos<LMPDeviceType>);
+ComputeStyle(temp/deform/kk/host,ComputeTempDeformKokkos<LMPHostType>);
+// clang-format on
+#else
+
+#ifndef LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H
+#define LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H
+
+#include "compute_temp_deform.h"
+#include "kokkos_type.h"
+#include "domain_kokkos.h"
+#include "kokkos_few.h"
+
+namespace LAMMPS_NS {
+
+template<int RMASS>
+struct TagComputeTempDeformScalar{};
+
+template<int RMASS>
+struct TagComputeTempDeformVector{};
+
+struct TagComputeTempDeformRemoveBias{};
+
+struct TagComputeTempDeformRestoreBias{};
+
+template<class DeviceType>
+class ComputeTempDeformKokkos: public ComputeTempDeform {
+ public:
+  struct s_CTEMP {
+    double t0, t1, t2, t3, t4, t5;
+    KOKKOS_INLINE_FUNCTION
+    s_CTEMP() {
+      t0 = t1 = t2 = t3 = t4 = t5 = 0.0;
+    }
+    KOKKOS_INLINE_FUNCTION
+    s_CTEMP& operator+=(const s_CTEMP &rhs) {
+      t0 += rhs.t0;
+      t1 += rhs.t1;
+      t2 += rhs.t2;
+      t3 += rhs.t3;
+      t4 += rhs.t4;
+      t5 += rhs.t5;
+      return *this;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator+=(const volatile s_CTEMP &rhs) volatile {
+      t0 += rhs.t0;
+      t1 += rhs.t1;
+      t2 += rhs.t2;
+      t3 += rhs.t3;
+      t4 += rhs.t4;
+      t5 += rhs.t5;
+    }
+  };
+
+  typedef s_CTEMP CTEMP;
+  typedef CTEMP value_type;
+  typedef DeviceType device_type;
+  typedef ArrayTypes<DeviceType> AT;
+
+  ComputeTempDeformKokkos(class LAMMPS *, int, char **);
+  ~ComputeTempDeformKokkos();
+  double compute_scalar();
+  void compute_vector();
+  void remove_bias_all();
+  void restore_bias_all();
+
+  template<int RMASS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeTempDeformScalar<RMASS>, const int&, CTEMP&) const;
+
+  template<int RMASS>
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeTempDeformVector<RMASS>, const int&, CTEMP&) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeTempDeformRemoveBias, const int &i) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagComputeTempDeformRestoreBias, const int &i) const;
+
+ protected:
+  typename ArrayTypes<DeviceType>::t_x_array_randomread x;
+  typename ArrayTypes<DeviceType>::t_v_array v;
+  typename ArrayTypes<DeviceType>::t_v_array vbiasall;
+  typename ArrayTypes<DeviceType>::t_float_1d_randomread rmass;
+  typename ArrayTypes<DeviceType>::t_float_1d_randomread mass;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread type;
+  typename ArrayTypes<DeviceType>::t_int_1d_randomread mask;
+
+  class DomainKokkos *domainKK;
+
+  Few<double, 6> h_rate, h_ratelo;
+
+  };
+
+}
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Temperature compute degrees of freedom < 0
+
+This should not happen if you are calculating the temperature
+on a valid set of atoms.
+
+*/
diff --git a/src/KOKKOS/compute_temp_kokkos.h b/src/KOKKOS/compute_temp_kokkos.h
index 792e2e17db..4cbace02d7 100644
--- a/src/KOKKOS/compute_temp_kokkos.h
+++ b/src/KOKKOS/compute_temp_kokkos.h
@@ -28,6 +28,16 @@ ComputeStyle(temp/kk/host,ComputeTempKokkos<LMPHostType>);
 
 namespace LAMMPS_NS {
 
+template<int RMASS>
+struct TagComputeTempScalar{};
+
+template<int RMASS>
+struct TagComputeTempVector{};
+
+template<class DeviceType>
+class ComputeTempKokkos : public ComputeTemp {
+ public:
+
   struct s_CTEMP {
     double t0, t1, t2, t3, t4, t5;
     KOKKOS_INLINE_FUNCTION
@@ -55,23 +65,14 @@ namespace LAMMPS_NS {
       t5 += rhs.t5;
     }
   };
+
   typedef s_CTEMP CTEMP;
-
-template<int RMASS>
-struct TagComputeTempScalar{};
-
-template<int RMASS>
-struct TagComputeTempVector{};
-
-template<class DeviceType>
-class ComputeTempKokkos : public ComputeTemp {
- public:
   typedef DeviceType device_type;
   typedef CTEMP value_type;
   typedef ArrayTypes<DeviceType> AT;
 
   ComputeTempKokkos(class LAMMPS *, int, char **);
-  virtual ~ComputeTempKokkos() {}
+  virtual ~ComputeTempKokkos() {};
   double compute_scalar();
   void compute_vector();
 
diff --git a/src/compute_temp_deform.cpp b/src/compute_temp_deform.cpp
index 0c158e96e6..7b17b12fd8 100644
--- a/src/compute_temp_deform.cpp
+++ b/src/compute_temp_deform.cpp
@@ -56,8 +56,11 @@ ComputeTempDeform::ComputeTempDeform(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeTempDeform::~ComputeTempDeform()
 {
-  memory->destroy(vbiasall);
-  delete [] vector;
+  if (!copymode)
+  {
+    memory->destroy(vbiasall);
+    delete [] vector;
+  }
 }
 
 /* ---------------------------------------------------------------------- */
@@ -69,7 +72,7 @@ void ComputeTempDeform::init()
   // check fix deform remap settings
 
   for (i = 0; i < modify->nfix; i++)
-    if (strcmp(modify->fix[i]->style,"deform") == 0) {
+    if (strncmp(modify->fix[i]->style,"deform", 6) == 0) {
       if (((FixDeform *) modify->fix[i])->remapflag == Domain::X_REMAP &&
           comm->me == 0)
         error->warning(FLERR,"Using compute temp/deform with inconsistent "

From 8945d81be3e444b412860987b86e8853f6a010c3 Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Thu, 29 Jul 2021 09:54:15 +1000
Subject: [PATCH 06/15] Added Kokkos accelerated SLLOD thermostat
 (nvt/sllod/kk).

---
 src/KOKKOS/fix_nvt_sllod_kokkos.cpp | 171 ++++++++++++++++++++++++++++
 src/KOKKOS/fix_nvt_sllod_kokkos.h   |  97 ++++++++++++++++
 2 files changed, 268 insertions(+)
 create mode 100644 src/KOKKOS/fix_nvt_sllod_kokkos.cpp
 create mode 100644 src/KOKKOS/fix_nvt_sllod_kokkos.h

diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
new file mode 100644
index 0000000000..7d9ad3a61a
--- /dev/null
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
@@ -0,0 +1,171 @@
+// clang-format off
+/* ----------------------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   https://www.lammps.org/
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+/* ----------------------------------------------------------------------
+   Contributing authors: Emily Kahl (UQ, e.kahl@uq.edu.au)
+------------------------------------------------------------------------- */
+
+#include "fix_nvt_sllod_kokkos.h"
+
+#include "atom.h"
+#include "compute.h"
+#include "domain.h"
+#include "error.h"
+#include "fix.h"
+#include "fix_deform_kokkos.h"
+#include "group.h"
+#include "math_extra.h"
+#include "modify.h"
+#include "atom_kokkos.h"
+#include "atom.h"
+#include "atom_masks.h"
+#include "kokkos_few.h"
+#include "memory_kokkos.h"
+
+#include <cstring>
+
+using namespace LAMMPS_NS;
+using namespace FixConst;
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **arg) :
+  FixNHKokkos<DeviceType>(lmp, narg, arg)
+{
+  /************************ EVK *********************/
+  atomKK = (AtomKokkos *) this->atom;
+  this->kokkosable = 1;
+  this->domainKK = (DomainKokkos *) this->domain;
+
+  if (!this->tstat_flag)
+    this->error->all(FLERR,"Temperature control must be used with fix nvt/kk");
+  if (this->pstat_flag)
+    this->error->all(FLERR,"Pressure control can not be used with fix nvt/kk");
+
+  if (this->mtchain_default_flag) this->mtchain = 1;
+    
+  this->id_temp = utils::strdup(std::string(this->id)+"_temp");
+  this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp));
+  this->tcomputeflag = 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+FixNVTSllodKokkos<DeviceType>::~FixNVTSllodKokkos()
+{
+
+
+}
+
+/* ---------------------------------------------------------------------- */
+
+template<class DeviceType>
+void FixNVTSllodKokkos<DeviceType>::init()
+{
+  FixNHKokkos<DeviceType>::init();
+
+  vdelu = typename ArrayTypes<DeviceType>::t_v_array("nvt/sllod/kk:vdelu", atomKK->nlocal);
+  
+  if (!this->temperature->tempbias)
+    this->error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias");
+
+  nondeformbias = 0;
+  if (strncmp(this->temperature->style,"temp/deform", 11) != 0) nondeformbias = 1;
+
+  // check fix deform remap settings
+
+  int i;
+  for (i = 0; i < this->modify->nfix; i++)
+    if (strncmp(this->modify->fix[i]->style,"deform",6) == 0) {
+      if (((FixDeform *) this->modify->fix[i])->remapflag != Domain::V_REMAP)
+        this->error->all(FLERR,"Using fix nvt/sllod with inconsistent fix deform "
+                   "remap option");
+      break;
+    }
+  if (i == this->modify->nfix)
+    this->error->all(FLERR,"Using fix nvt/sllod with no fix deform defined");
+}
+
+/* ----------------------------------------------------------------------
+   perform half-step scaling of velocities
+-----------------------------------------------------------------------*/
+
+template<class DeviceType>
+void FixNVTSllodKokkos<DeviceType>::nh_v_temp()
+{
+  // remove and restore bias = streaming velocity = Hrate*lamda + Hratelo
+  // thermostat thermal velocity only
+  // vdelu = SLLOD correction = Hrate*Hinv*vthermal
+  // for non temp/deform BIAS:
+  //   calculate temperature since some computes require temp
+  //   computed on current nlocal atoms to remove bias
+  
+  if (nondeformbias){ 
+    atomKK->sync(this->temperature->execution_space,this->temperature->datamask_read);
+    this->temperature->compute_scalar();
+    atomKK->modified(this->temperature->execution_space,this->temperature->datamask_modify);
+  }
+  v = atomKK->k_v.view<DeviceType>();
+  mask = atomKK->k_mask.view<DeviceType>();
+  int nlocal = atomKK->nlocal;
+  if (this->igroup == atomKK->firstgroup) nlocal = atomKK->nfirst;
+
+  double h_two[6];
+  MathExtra::multiply_shape_shape(this->domain->h_rate,this->domain->h_inv,h_two);
+
+  d_h_two = Few<double, 6>(h_two);
+
+  this->copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixNVTSllod_temp1>(0,nlocal),*this);
+  this->copymode = 0;
+
+  this->temperature->remove_bias_all();
+  
+  this->copymode = 1;
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixNVTSllod_temp2>(0,nlocal),*this);
+  this->copymode = 0;
+
+  this->temperature->restore_bias_all();
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixNVTSllodKokkos<DeviceType>::operator()(TagFixNVTSllod_temp1, const int &i) const {
+  if (mask[i] & this->groupbit) {
+    vdelu(i,0) = d_h_two[0]*v(i,0) + d_h_two[5]*v(i,1) + d_h_two[4]*v(i,2);
+    vdelu(i,1) = d_h_two[1]*v(i,1) + d_h_two[3]*v(i,2);
+    vdelu(i,2) = d_h_two[2]*v(i,2);
+  }
+}
+
+template<class DeviceType>
+KOKKOS_INLINE_FUNCTION
+void FixNVTSllodKokkos<DeviceType>::operator()(TagFixNVTSllod_temp2, const int &i) const {
+  if (mask[i] & this->groupbit) {
+    v(i,0) = v(i,0)*this->factor_eta - this->dthalf*vdelu(i,0);
+    v(i,1) = v(i,1)*this->factor_eta - this->dthalf*vdelu(i,1);
+    v(i,2) = v(i,2)*this->factor_eta - this->dthalf*vdelu(i,2);
+  }
+}
+
+namespace LAMMPS_NS {
+template class FixNVTSllodKokkos<LMPDeviceType>;
+#ifdef LMP_KOKKOS_GPU
+template class FixNVTSllodKokkos<LMPHostType>;
+#endif
+}
+
+
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h
new file mode 100644
index 0000000000..85b4e54bcd
--- /dev/null
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h
@@ -0,0 +1,97 @@
+/* -*- c++ -*- ----------------------------------------------------------
+   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
+   www.cs.sandia.gov/~sjplimp/lammps.html
+   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+
+   Copyright (2003) Sandia Corporation.  Under the terms of Contract
+   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
+   certain rights in this software.  This software is distributed under
+   the GNU General Public License.
+
+   See the README file in the top-level LAMMPS directory.
+------------------------------------------------------------------------- */
+
+#ifdef FIX_CLASS
+// clang-format off
+FixStyle(nvt/sllod/kk,FixNVTSllodKokkos<LMPDeviceType>);
+FixStyle(nvt/sllod/kk/device,FixNVTSllodKokkos<LMPDeviceType>);
+FixStyle(nvt/sllod/kk/host,FixNVTSllodKokkos<LMPHostType>);
+// clang-format on
+#else
+
+#ifndef LMP_FIX_NVT_SLLOD_KOKKOS_H
+#define LMP_FIX_NVT_SLLOD_KOKKOS_H
+
+#include "fix_nh_kokkos.h"
+//#include "fix_nvt_sllod.h"
+#include "kokkos_type.h"
+#include "kokkos_few.h"
+
+namespace LAMMPS_NS {
+
+struct TagFixNVTSllod_temp1{};
+struct TagFixNVTSllod_temp2{};
+
+template<class DeviceType>
+class FixNVTSllodKokkos : public FixNHKokkos<DeviceType> {
+ public:
+  FixNVTSllodKokkos(class LAMMPS *, int, char **);
+  ~FixNVTSllodKokkos();
+  void init();
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixNVTSllod_temp1, const int& i) const;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(TagFixNVTSllod_temp2, const int& i) const;
+
+ private:
+  int nondeformbias;
+
+  void nh_v_temp();
+
+ protected:
+  typename ArrayTypes<DeviceType>::t_x_array x;
+  typename ArrayTypes<DeviceType>::t_v_array v;
+  typename ArrayTypes<DeviceType>::t_v_array vdelu;
+  typename ArrayTypes<DeviceType>::t_f_array_const f;
+  typename ArrayTypes<DeviceType>::t_float_1d rmass;
+  typename ArrayTypes<DeviceType>::t_float_1d mass;
+  typename ArrayTypes<DeviceType>::t_int_1d type;
+  typename ArrayTypes<DeviceType>::t_int_1d mask;
+
+  Few<double, 6> d_h_two;
+  
+  class DomainKokkos *domainKK;
+  class AtomKokkos *atomKK;
+};
+
+}    // namespace LAMMPS_NS
+
+#endif
+#endif
+
+/* ERROR/WARNING messages:
+
+E: Temperature control must be used with fix nvt/sllod
+
+Self-explanatory.
+
+E: Pressure control can not be used with fix nvt/sllod
+
+Self-explanatory.
+
+E: Temperature for fix nvt/sllod does not have a bias
+
+The specified compute must compute temperature with a bias.
+
+E: Using fix nvt/sllod with inconsistent fix deform remap option
+
+Fix nvt/sllod requires that deforming atoms have a velocity profile
+provided by "remap v" as a fix deform option.
+
+E: Using fix nvt/sllod with no fix deform defined
+
+Self-explanatory.
+
+*/

From 8ae9d51466b49767399f2f71d1c22b4125fcb82c Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Mon, 2 Aug 2021 14:46:03 +1000
Subject: [PATCH 07/15] Fixed memory issues in ComputeTempDeformKokkos.

---
 src/KOKKOS/compute_temp_deform_kokkos.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/KOKKOS/compute_temp_deform_kokkos.cpp b/src/KOKKOS/compute_temp_deform_kokkos.cpp
index 6040504aeb..2716c4688e 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.cpp
+++ b/src/KOKKOS/compute_temp_deform_kokkos.cpp
@@ -80,6 +80,8 @@ double ComputeTempDeformKokkos<DeviceType>::compute_scalar()
   /**************** EVK ****************/
   // Convert from box coords to lamda coords
   domainKK->x2lamda(nlocal);
+  h_rate = domainKK->h_rate;
+  h_ratelo = domainKK->h_ratelo;
   
   copymode = 1;
   if (atomKK->rmass)
@@ -108,8 +110,6 @@ template<int RMASS>
 KOKKOS_INLINE_FUNCTION
 void ComputeTempDeformKokkos<DeviceType>::operator()(TagComputeTempDeformScalar<RMASS>, const int &i, CTEMP& t_kk) const {
 
-  double *h_rate = domainKK->h_rate;
-  double *h_ratelo = domainKK->h_ratelo;
   double vstream[3],vthermal[3];
 
   vstream[0] = h_rate[0]*x(i,0) + h_rate[5]*x(i,1) + h_rate[4]*x(i,2) + h_ratelo[0];
@@ -154,6 +154,8 @@ void ComputeTempDeformKokkos<DeviceType>::compute_vector()
   /**************** EVK ****************/
   // Convert from box coords to lamda coords
   domainKK->x2lamda(nlocal);
+  h_rate = domainKK->h_rate;
+  h_ratelo = domainKK->h_ratelo;
 
   copymode = 1;
   if (atomKK->rmass)
@@ -182,8 +184,6 @@ template<int RMASS>
 KOKKOS_INLINE_FUNCTION
 void ComputeTempDeformKokkos<DeviceType>::operator()(TagComputeTempDeformVector<RMASS>, const int &i, CTEMP& t_kk) const {
 
-  double *h_rate = domainKK->h_rate;
-  double *h_ratelo = domainKK->h_ratelo;
   double vstream[3],vthermal[3];
 
   vstream[0] = h_rate[0]*x(i,0) + h_rate[5]*x(i,1) + h_rate[4]*x(i,2) + h_ratelo[0];

From d7f9f9fead0744fb1ff07e50609fc16550d7e933 Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Mon, 9 Aug 2021 10:19:32 +1000
Subject: [PATCH 08/15] Updated documentation to include Kokkos accelerated
 NEMD styles.

Also tidied up header files and attribution to fit LAMMPS coding style.
---
 doc/src/Commands_compute.rst              | 2 +-
 doc/src/Commands_fix.rst                  | 2 +-
 doc/src/compute_temp_deform.rst           | 3 +++
 doc/src/fix_nvt_sllod.rst                 | 3 ++-
 src/KOKKOS/compute_temp_deform_kokkos.cpp | 3 ++-
 src/KOKKOS/compute_temp_deform_kokkos.h   | 1 -
 src/KOKKOS/fix_nvt_sllod_kokkos.cpp       | 2 +-
 7 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/doc/src/Commands_compute.rst b/doc/src/Commands_compute.rst
index 9dfb28fa8b..2c67d44f24 100644
--- a/doc/src/Commands_compute.rst
+++ b/doc/src/Commands_compute.rst
@@ -152,7 +152,7 @@ KOKKOS, o = OPENMP, t = OPT.
    * :doc:`temp/chunk <compute_temp_chunk>`
    * :doc:`temp/com <compute_temp_com>`
    * :doc:`temp/cs <compute_temp_cs>`
-   * :doc:`temp/deform <compute_temp_deform>`
+   * :doc:`temp/deform (k) <compute_temp_deform>`
    * :doc:`temp/deform/eff <compute_temp_deform_eff>`
    * :doc:`temp/drude <compute_temp_drude>`
    * :doc:`temp/eff <compute_temp_eff>`
diff --git a/doc/src/Commands_fix.rst b/doc/src/Commands_fix.rst
index 45a75ff394..08857b96ec 100644
--- a/doc/src/Commands_fix.rst
+++ b/doc/src/Commands_fix.rst
@@ -148,7 +148,7 @@ OPT.
    * :doc:`nvt/body <fix_nvt_body>`
    * :doc:`nvt/eff <fix_nh_eff>`
    * :doc:`nvt/manifold/rattle <fix_nvt_manifold_rattle>`
-   * :doc:`nvt/sllod (io) <fix_nvt_sllod>`
+   * :doc:`nvt/sllod (iko) <fix_nvt_sllod>`
    * :doc:`nvt/sllod/eff <fix_nvt_sllod_eff>`
    * :doc:`nvt/sphere (o) <fix_nvt_sphere>`
    * :doc:`nvt/uef <fix_nh_uef>`
diff --git a/doc/src/compute_temp_deform.rst b/doc/src/compute_temp_deform.rst
index bc4d1ae3f3..6c61f7bc93 100644
--- a/doc/src/compute_temp_deform.rst
+++ b/doc/src/compute_temp_deform.rst
@@ -1,8 +1,11 @@
 .. index:: compute temp/deform
+.. index:: compute temp/deform/kk
 
 compute temp/deform command
 ===========================
 
+Accelerator Variants: *temp/deform/kk*
+
 Syntax
 """"""
 
diff --git a/doc/src/fix_nvt_sllod.rst b/doc/src/fix_nvt_sllod.rst
index 9ff22bca09..3d041ca767 100644
--- a/doc/src/fix_nvt_sllod.rst
+++ b/doc/src/fix_nvt_sllod.rst
@@ -1,11 +1,12 @@
 .. index:: fix nvt/sllod
 .. index:: fix nvt/sllod/intel
 .. index:: fix nvt/sllod/omp
+.. index:: fix nvt/sllod/kk
 
 fix nvt/sllod command
 =====================
 
-Accelerator Variants: *nvt/sllod/intel*, *nvt/sllod/omp*
+Accelerator Variants: *nvt/sllod/intel*, *nvt/sllod/omp*, *nvt/sllod/kk*
 
 Syntax
 """"""
diff --git a/src/KOKKOS/compute_temp_deform_kokkos.cpp b/src/KOKKOS/compute_temp_deform_kokkos.cpp
index 2716c4688e..7846e18113 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.cpp
+++ b/src/KOKKOS/compute_temp_deform_kokkos.cpp
@@ -13,7 +13,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Emily Kahl (UQ)
+   Contributing authors: Emily Kahl (Uni. of QLD, e.kahl@uq.edu.au)
 ------------------------------------------------------------------------- */
 
 #include "compute_temp_deform_kokkos.h"
@@ -25,6 +25,7 @@
 #include "force.h"
 #include "update.h"
 #include "memory_kokkos.h"
+#include "domain_kokkos.h"
 
 #include <cstring>
 
diff --git a/src/KOKKOS/compute_temp_deform_kokkos.h b/src/KOKKOS/compute_temp_deform_kokkos.h
index 4ff6d59674..9f32069604 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.h
+++ b/src/KOKKOS/compute_temp_deform_kokkos.h
@@ -25,7 +25,6 @@ ComputeStyle(temp/deform/kk/host,ComputeTempDeformKokkos<LMPHostType>);
 
 #include "compute_temp_deform.h"
 #include "kokkos_type.h"
-#include "domain_kokkos.h"
 #include "kokkos_few.h"
 
 namespace LAMMPS_NS {
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
index 7d9ad3a61a..68574adff0 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
@@ -13,7 +13,7 @@
 ------------------------------------------------------------------------- */
 
 /* ----------------------------------------------------------------------
-   Contributing authors: Emily Kahl (UQ, e.kahl@uq.edu.au)
+   Contributing authors: Emily Kahl (Uni. of QLD, e.kahl@uq.edu.au)
 ------------------------------------------------------------------------- */
 
 #include "fix_nvt_sllod_kokkos.h"

From b385c85440c5ab10af686282ee4bd9794f329231 Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Wed, 11 Aug 2021 14:05:45 +1000
Subject: [PATCH 09/15] Refactored PPPMKokkos::setup_triclinic kernel indexing
 to be more consistent the rest of the codebase.

This commit "fixes" the temporary solution using Kokkos::MDRange in commit a98b8bee88.
---
 src/KOKKOS/pppm_kokkos.cpp | 20 ++++++++++++++------
 src/KOKKOS/pppm_kokkos.h   |  2 +-
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index 3fe93f6825..e2956036c3 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -479,9 +479,13 @@ void PPPMKokkos<DeviceType>::setup_triclinic()
   delzinv = nz_pppm;
   delvolinv = delxinv*delyinv*delzinv/volume;
 
+  numz_fft = nzhi_fft-nzlo_fft + 1;
+  numy_fft = nyhi_fft-nylo_fft + 1;
+  numx_fft = nxhi_fft-nxlo_fft + 1;
+  const int inum_fft = numz_fft*numy_fft*numx_fft;
+
   copymode = 1;
-  Kokkos::parallel_for(Kokkos::MDRangePolicy<Kokkos::Rank<3>, DeviceType, TagPPPM_setup_triclinic1>\
-      ({nzlo_fft, nylo_fft, nxlo_fft}, {nzhi_fft+1, nyhi_fft+1, nxhi_fft+1}),*this);
+  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagPPPM_setup_triclinic1>(0,inum_fft),*this);
   copymode = 0;
 
   // virial coefficients
@@ -495,14 +499,18 @@ void PPPMKokkos<DeviceType>::setup_triclinic()
 
 template<class DeviceType>
 KOKKOS_INLINE_FUNCTION
-void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic1, const int &k, const int &j, const int& i) const
+void PPPMKokkos<DeviceType>::operator()(TagPPPM_setup_triclinic1, const int &n) const
 {
+  int k = n/(numy_fft*numx_fft);
+  int j = (n - k*numy_fft*numx_fft) / numx_fft;
+  int i = n - k*numy_fft*numx_fft - j*numx_fft;
+  k += nzlo_fft;
+  j += nylo_fft;
+  i += nxlo_fft;
+
   double per_k = k - nz_pppm*(2*k/nz_pppm);
   double per_j = j - ny_pppm*(2*j/ny_pppm);
   double per_i = i - nx_pppm*(2*i/nx_pppm);
-  // n corresponds to the "number" of this iteration if we were to execute the loop monotonically and in serial
-  int n = (nxhi_fft - nxlo_fft + 1)*(nyhi_fft - nylo_fft + 1)*(k - nzlo_fft)+
-          (nxhi_fft - nxlo_fft + 1)*(j - nylo_fft) + (i - nxlo_fft);
 
   double unitk_lamda[3];
   unitk_lamda[0] = 2.0*MY_PI*per_i;
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
index e7c55b1726..ad8612e637 100644
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -141,7 +141,7 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
   void operator()(TagPPPM_setup4, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
-  void operator()(TagPPPM_setup_triclinic1, const int&, const int&, const int&) const;
+  void operator()(TagPPPM_setup_triclinic1, const int&) const;
 
   KOKKOS_INLINE_FUNCTION
   void operator()(TagPPPM_setup_triclinic2, const int&) const;

From 2e59b5c4de8eb195411abb8d32e7887c147e7ff7 Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Wed, 18 Aug 2021 15:18:55 +1000
Subject: [PATCH 10/15] Fixed whitespace errors and removed some extraneous
 comments.

---
 src/KOKKOS/compute_temp_deform_kokkos.cpp | 20 +++------------
 src/KOKKOS/fix_nvt_sllod_kokkos.cpp       | 11 ++++-----
 src/KOKKOS/fix_nvt_sllod_kokkos.h         |  3 +--
 src/KOKKOS/pppm_kokkos.cpp                | 30 +++++++++++------------
 src/KOKKOS/pppm_kokkos.h                  | 20 +++++++--------
 5 files changed, 33 insertions(+), 51 deletions(-)

diff --git a/src/KOKKOS/compute_temp_deform_kokkos.cpp b/src/KOKKOS/compute_temp_deform_kokkos.cpp
index 7846e18113..91d7d3f942 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.cpp
+++ b/src/KOKKOS/compute_temp_deform_kokkos.cpp
@@ -49,7 +49,7 @@ ComputeTempDeformKokkos<DeviceType>::ComputeTempDeformKokkos(LAMMPS *lmp, int na
 }
 
 template<class DeviceType>
-ComputeTempDeformKokkos<DeviceType>::~ComputeTempDeformKokkos() 
+ComputeTempDeformKokkos<DeviceType>::~ComputeTempDeformKokkos()
 {
 
 
@@ -78,12 +78,10 @@ double ComputeTempDeformKokkos<DeviceType>::compute_scalar()
   double t = 0.0;
   CTEMP t_kk;
 
-  /**************** EVK ****************/
-  // Convert from box coords to lamda coords
   domainKK->x2lamda(nlocal);
   h_rate = domainKK->h_rate;
   h_ratelo = domainKK->h_ratelo;
-  
+
   copymode = 1;
   if (atomKK->rmass)
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformScalar<1> >(0,nlocal),*this,t_kk);
@@ -91,11 +89,9 @@ double ComputeTempDeformKokkos<DeviceType>::compute_scalar()
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformScalar<0> >(0,nlocal),*this,t_kk);
   copymode = 0;
 
-  /**************** EVK ****************/
-  // Convert back to box coords
   domainKK->lamda2x(nlocal);
 
-  t = t_kk.t0; // could make this more efficient
+  t = t_kk.t0;
 
   MPI_Allreduce(&t,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
   if (dynamic) dof_compute();
@@ -152,8 +148,6 @@ void ComputeTempDeformKokkos<DeviceType>::compute_vector()
   for (i = 0; i < 6; i++) t[i] = 0.0;
   CTEMP t_kk;
 
-  /**************** EVK ****************/
-  // Convert from box coords to lamda coords
   domainKK->x2lamda(nlocal);
   h_rate = domainKK->h_rate;
   h_ratelo = domainKK->h_ratelo;
@@ -165,8 +159,6 @@ void ComputeTempDeformKokkos<DeviceType>::compute_vector()
     Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformVector<0> >(0,nlocal),*this,t_kk);
   copymode = 0;
 
-  /**************** EVK ****************/
-  // Convert back to box coords
   domainKK->lamda2x(nlocal);
 
   t[0] = t_kk.t0;
@@ -218,14 +210,10 @@ void ComputeTempDeformKokkos<DeviceType>::remove_bias_all()
   int nlocal = atom->nlocal;
 
   if (atom->nmax > maxbias) {
-    //memoryKK->destroy_kokkos(vbiasall);
     maxbias = atom->nmax;
-    //memoryKK->create_kokkos(vbiasall,maxbias,"temp/deform/kk:vbiasall");
     vbiasall = typename ArrayTypes<DeviceType>::t_v_array("temp/deform/kk:vbiasall", maxbias);
   }
 
-  /**************** EVK ****************/
-  // Convert from box coords to lamda coords
   domainKK->x2lamda(nlocal);
 
   h_rate = domain->h_rate;
@@ -235,8 +223,6 @@ void ComputeTempDeformKokkos<DeviceType>::remove_bias_all()
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagComputeTempDeformRemoveBias >(0,nlocal),*this);
   copymode = 0;
 
-  /**************** EVK ****************/
-  // Convert back to box coords
   domainKK->lamda2x(nlocal);
 }
 
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
index 68574adff0..1ed99cc77c 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
@@ -44,7 +44,6 @@ template<class DeviceType>
 FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **arg) :
   FixNHKokkos<DeviceType>(lmp, narg, arg)
 {
-  /************************ EVK *********************/
   atomKK = (AtomKokkos *) this->atom;
   this->kokkosable = 1;
   this->domainKK = (DomainKokkos *) this->domain;
@@ -55,7 +54,7 @@ FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a
     this->error->all(FLERR,"Pressure control can not be used with fix nvt/kk");
 
   if (this->mtchain_default_flag) this->mtchain = 1;
-    
+
   this->id_temp = utils::strdup(std::string(this->id)+"_temp");
   this->modify->add_compute(fmt::format("{} all temp/deform/kk",this->id_temp));
   this->tcomputeflag = 1;
@@ -78,7 +77,7 @@ void FixNVTSllodKokkos<DeviceType>::init()
   FixNHKokkos<DeviceType>::init();
 
   vdelu = typename ArrayTypes<DeviceType>::t_v_array("nvt/sllod/kk:vdelu", atomKK->nlocal);
-  
+
   if (!this->temperature->tempbias)
     this->error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias");
 
@@ -112,8 +111,8 @@ void FixNVTSllodKokkos<DeviceType>::nh_v_temp()
   // for non temp/deform BIAS:
   //   calculate temperature since some computes require temp
   //   computed on current nlocal atoms to remove bias
-  
-  if (nondeformbias){ 
+
+  if (nondeformbias){
     atomKK->sync(this->temperature->execution_space,this->temperature->datamask_read);
     this->temperature->compute_scalar();
     atomKK->modified(this->temperature->execution_space,this->temperature->datamask_modify);
@@ -133,7 +132,7 @@ void FixNVTSllodKokkos<DeviceType>::nh_v_temp()
   this->copymode = 0;
 
   this->temperature->remove_bias_all();
-  
+
   this->copymode = 1;
   Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixNVTSllod_temp2>(0,nlocal),*this);
   this->copymode = 0;
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h
index 85b4e54bcd..15af600430 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.h
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h
@@ -23,7 +23,6 @@ FixStyle(nvt/sllod/kk/host,FixNVTSllodKokkos<LMPHostType>);
 #define LMP_FIX_NVT_SLLOD_KOKKOS_H
 
 #include "fix_nh_kokkos.h"
-//#include "fix_nvt_sllod.h"
 #include "kokkos_type.h"
 #include "kokkos_few.h"
 
@@ -61,7 +60,7 @@ class FixNVTSllodKokkos : public FixNHKokkos<DeviceType> {
   typename ArrayTypes<DeviceType>::t_int_1d mask;
 
   Few<double, 6> d_h_two;
-  
+
   class DomainKokkos *domainKK;
   class AtomKokkos *atomKK;
 };
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index e2956036c3..b76e36c6e2 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -1326,7 +1326,7 @@ void PPPMKokkos<DeviceType>::compute_gf_ik_triclinic()
   nby = static_cast<int> (tmp[1]);
   nbz = static_cast<int> (tmp[2]);
 
-  // Update the local copy of the domain box tilt 
+  // Update the local copy of the domain box tilt
   h = Few<double, 6>(domain->h);
   h_inv = Few<double, 6>(domain->h_inv);
 
@@ -1342,58 +1342,58 @@ KOKKOS_INLINE_FUNCTION
 void PPPMKokkos<DeviceType>::operator()(TagPPPM_compute_gf_ik_triclinic, const int &m) const
 {
   int n = (m - nzlo_fft)*(nyhi_fft+1 - nylo_fft)*(nxhi_fft+1 - nxlo_fft);
-  
+
   const int mper = m - nz_pppm*(2*m/nz_pppm);
   const double snz = square(sin(MY_PI*mper/nz_pppm));
-  
+
   for (int l = nylo_fft; l <= nyhi_fft; l++) {
     const int lper = l - ny_pppm*(2*l/ny_pppm);
     const double sny = square(sin(MY_PI*lper/ny_pppm));
-  
+
     for (int k = nxlo_fft; k <= nxhi_fft; k++) {
       const int kper = k - nx_pppm*(2*k/nx_pppm);
       const double snx = square(sin(MY_PI*kper/nx_pppm));
-  
+
       double unitk_lamda[3];
       unitk_lamda[0] = 2.0*MY_PI*kper;
       unitk_lamda[1] = 2.0*MY_PI*lper;
       unitk_lamda[2] = 2.0*MY_PI*mper;
       x2lamdaT(&unitk_lamda[0],&unitk_lamda[0]);
-  
+
       const double sqk = square(unitk_lamda[0]) + square(unitk_lamda[1]) + square(unitk_lamda[2]);
-  
+
       if (sqk != 0.0) {
         const double numerator = 12.5663706/sqk;
         const double denominator = gf_denom(snx,sny,snz);
         double sum1 = 0.0;
-  
+
         for (int nx = -nbx; nx <= nbx; nx++) {
           const double argx = MY_PI*kper/nx_pppm + MY_PI*nx;
           const double wx = powsinxx(argx,twoorder);
-  
+
           for (int ny = -nby; ny <= nby; ny++) {
             const double argy = MY_PI*lper/ny_pppm + MY_PI*ny;
             const double wy = powsinxx(argy,twoorder);
-  
+
             for (int nz = -nbz; nz <= nbz; nz++) {
               const double argz = MY_PI*mper/nz_pppm + MY_PI*nz;
               const double wz = powsinxx(argz,twoorder);
-  
+
               double b[3];
               b[0] = 2.0*MY_PI*nx_pppm*nx;
               b[1] = 2.0*MY_PI*ny_pppm*ny;
               b[2] = 2.0*MY_PI*nz_pppm*nz;
               x2lamdaT(&b[0],&b[0]);
-  
+
               const double qx = unitk_lamda[0]+b[0];
               const double sx = exp(-0.25*square(qx/g_ewald));
-  
+
               const double qy = unitk_lamda[1]+b[1];
               const double sy = exp(-0.25*square(qy/g_ewald));
-  
+
               const double qz = unitk_lamda[2]+b[2];
               const double sz = exp(-0.25*square(qz/g_ewald));
-  
+
               const double dot1 = unitk_lamda[0]*qx + unitk_lamda[1]*qy + unitk_lamda[2]*qz;
               const double dot2 = qx*qx+qy*qy+qz*qz;
               sum1 += (dot1/dot2) * sx*sy*sz * wx*wy*wz;
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
index ad8612e637..f9e2b1a62a 100644
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -317,22 +317,20 @@ class PPPMKokkos : public PPPM, public KokkosBaseFFT {
   int numx_out,numy_out,numz_out;
   int ix,iy,nlocal;
 
-  // Local copies of the domain box tilt etc. 
-  // TODO: Will need to put in a less
-  // hacky solution later, since this needs to be manually updated
+  // Local copies of the domain box tilt etc.
   Few<double,6> h, h_inv;
 
   KOKKOS_INLINE_FUNCTION
   void x2lamdaT(double* v, double* lamda) const
   {
-    double lamda_tmp[3];                                                         
-                                                                                
-    lamda_tmp[0] = h_inv[0]*v[0];                                                
-    lamda_tmp[1] = h_inv[5]*v[0] + h_inv[1]*v[1];                                
-    lamda_tmp[2] = h_inv[4]*v[0] + h_inv[3]*v[1] + h_inv[2]*v[2];                
-                                                                                
-    lamda[0] = lamda_tmp[0];                                                     
-    lamda[1] = lamda_tmp[1];                                                     
+    double lamda_tmp[3];
+
+    lamda_tmp[0] = h_inv[0]*v[0];
+    lamda_tmp[1] = h_inv[5]*v[0] + h_inv[1]*v[1];
+    lamda_tmp[2] = h_inv[4]*v[0] + h_inv[3]*v[1] + h_inv[2]*v[2];
+
+    lamda[0] = lamda_tmp[0];
+    lamda[1] = lamda_tmp[1];
     lamda[2] = lamda_tmp[2];
   }
 

From 4876e0cbb697128ce4df8c8e4753c992efcda4d4 Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Wed, 18 Aug 2021 17:37:00 +1000
Subject: [PATCH 11/15] Changed URLs in the headers to point to the new LAMMPS
 site.

---
 src/KOKKOS/fix_nvt_sllod_kokkos.h | 4 ++--
 src/KOKKOS/pppm_kokkos.cpp        | 2 +-
 src/KOKKOS/pppm_kokkos.h          | 7 ++++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h
index 15af600430..4e731c4b70 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.h
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h
@@ -1,7 +1,7 @@
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   www.cs.sandia.gov/~sjplimp/lammps.html
-   Steve Plimpton, sjplimp@sandia.gov, Sandia National Laboratories
+   https://www.lammps.org/, Sandia National Laboratories
+   Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
    DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index b76e36c6e2..16cdce16de 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://lammps.sandia.gov/, Sandia National Laboratories
+   https://lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
index f9e2b1a62a..9a69a82762 100644
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -1,6 +1,7 @@
+// clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://lammps.sandia.gov/, Sandia National Laboratories
+   https://lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -12,11 +13,11 @@
 ------------------------------------------------------------------------- */
 
 #ifdef KSPACE_CLASS
-
+// clang-format off
 KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>)
 KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>)
 KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
-
+// clang-format on
 #else
 
 #ifndef LMP_PPPM_KOKKOS_H

From 511ac4994986cf269f135e3830d9a3ea8d6c4294 Mon Sep 17 00:00:00 2001
From: Axel Kohlmeyer <akohlmey@gmail.com>
Date: Wed, 18 Aug 2021 05:53:23 -0400
Subject: [PATCH 12/15] reformat, minor cosmetic changes

---
 src/KOKKOS/compute_temp_deform_kokkos.cpp |   6 +-
 src/KOKKOS/compute_temp_deform_kokkos.h   |   2 +-
 src/KOKKOS/compute_temp_kokkos.h          |   5 +-
 src/KOKKOS/fix_nvt_sllod_kokkos.cpp       |  28 ++---
 src/KOKKOS/fix_nvt_sllod_kokkos.h         |   5 +-
 src/KOKKOS/pppm_kokkos.cpp                |   2 +-
 src/KOKKOS/pppm_kokkos.h                  |  11 +-
 src/compute_temp_deform.cpp               | 130 +++++++++++-----------
 8 files changed, 85 insertions(+), 104 deletions(-)

diff --git a/src/KOKKOS/compute_temp_deform_kokkos.cpp b/src/KOKKOS/compute_temp_deform_kokkos.cpp
index 91d7d3f942..1d7d087200 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.cpp
+++ b/src/KOKKOS/compute_temp_deform_kokkos.cpp
@@ -21,13 +21,11 @@
 #include "atom_kokkos.h"
 #include "atom_masks.h"
 #include "comm.h"
+#include "domain_kokkos.h"
 #include "error.h"
 #include "force.h"
-#include "update.h"
 #include "memory_kokkos.h"
-#include "domain_kokkos.h"
-
-#include <cstring>
+#include "update.h"
 
 using namespace LAMMPS_NS;
 
diff --git a/src/KOKKOS/compute_temp_deform_kokkos.h b/src/KOKKOS/compute_temp_deform_kokkos.h
index 9f32069604..8b53c1f633 100644
--- a/src/KOKKOS/compute_temp_deform_kokkos.h
+++ b/src/KOKKOS/compute_temp_deform_kokkos.h
@@ -24,8 +24,8 @@ ComputeStyle(temp/deform/kk/host,ComputeTempDeformKokkos<LMPHostType>);
 #define LMP_COMPUTE_TEMP_DEFORM_KOKKOS_H
 
 #include "compute_temp_deform.h"
-#include "kokkos_type.h"
 #include "kokkos_few.h"
+#include "kokkos_type.h"
 
 namespace LAMMPS_NS {
 
diff --git a/src/KOKKOS/compute_temp_kokkos.h b/src/KOKKOS/compute_temp_kokkos.h
index 4cbace02d7..ae6ab485d2 100644
--- a/src/KOKKOS/compute_temp_kokkos.h
+++ b/src/KOKKOS/compute_temp_kokkos.h
@@ -1,4 +1,3 @@
-// clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -19,13 +18,13 @@ ComputeStyle(temp/kk/device,ComputeTempKokkos<LMPDeviceType>);
 ComputeStyle(temp/kk/host,ComputeTempKokkos<LMPHostType>);
 // clang-format on
 #else
-
 #ifndef LMP_COMPUTE_TEMP_KOKKOS_H
 #define LMP_COMPUTE_TEMP_KOKKOS_H
 
 #include "compute_temp.h"
 #include "kokkos_type.h"
 
+// clang-format off
 namespace LAMMPS_NS {
 
 template<int RMASS>
@@ -72,7 +71,7 @@ class ComputeTempKokkos : public ComputeTemp {
   typedef ArrayTypes<DeviceType> AT;
 
   ComputeTempKokkos(class LAMMPS *, int, char **);
-  virtual ~ComputeTempKokkos() {};
+  virtual ~ComputeTempKokkos() {}
   double compute_scalar();
   void compute_vector();
 
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
index 1ed99cc77c..d0af72f17f 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.cpp
@@ -19,21 +19,19 @@
 #include "fix_nvt_sllod_kokkos.h"
 
 #include "atom.h"
+#include "atom.h"
+#include "atom_kokkos.h"
+#include "atom_masks.h"
 #include "compute.h"
 #include "domain.h"
 #include "error.h"
 #include "fix.h"
 #include "fix_deform_kokkos.h"
 #include "group.h"
-#include "math_extra.h"
-#include "modify.h"
-#include "atom_kokkos.h"
-#include "atom.h"
-#include "atom_masks.h"
 #include "kokkos_few.h"
+#include "math_extra.h"
 #include "memory_kokkos.h"
-
-#include <cstring>
+#include "modify.h"
 
 using namespace LAMMPS_NS;
 using namespace FixConst;
@@ -62,15 +60,6 @@ FixNVTSllodKokkos<DeviceType>::FixNVTSllodKokkos(LAMMPS *lmp, int narg, char **a
 
 /* ---------------------------------------------------------------------- */
 
-template<class DeviceType>
-FixNVTSllodKokkos<DeviceType>::~FixNVTSllodKokkos()
-{
-
-
-}
-
-/* ---------------------------------------------------------------------- */
-
 template<class DeviceType>
 void FixNVTSllodKokkos<DeviceType>::init()
 {
@@ -82,16 +71,15 @@ void FixNVTSllodKokkos<DeviceType>::init()
     this->error->all(FLERR,"Temperature for fix nvt/sllod does not have a bias");
 
   nondeformbias = 0;
-  if (strncmp(this->temperature->style,"temp/deform", 11) != 0) nondeformbias = 1;
+  if (utils::strmatch(this->temperature->style,"^temp/deform")) nondeformbias = 1;
 
   // check fix deform remap settings
 
   int i;
   for (i = 0; i < this->modify->nfix; i++)
-    if (strncmp(this->modify->fix[i]->style,"deform",6) == 0) {
+    if (utils::strmatch(this->modify->fix[i]->style,"^deform")) {
       if (((FixDeform *) this->modify->fix[i])->remapflag != Domain::V_REMAP)
-        this->error->all(FLERR,"Using fix nvt/sllod with inconsistent fix deform "
-                   "remap option");
+        this->error->all(FLERR,"Using fix nvt/sllod with inconsistent fix deform remap option");
       break;
     }
   if (i == this->modify->nfix)
diff --git a/src/KOKKOS/fix_nvt_sllod_kokkos.h b/src/KOKKOS/fix_nvt_sllod_kokkos.h
index 4e731c4b70..6057ce44d0 100644
--- a/src/KOKKOS/fix_nvt_sllod_kokkos.h
+++ b/src/KOKKOS/fix_nvt_sllod_kokkos.h
@@ -23,9 +23,10 @@ FixStyle(nvt/sllod/kk/host,FixNVTSllodKokkos<LMPHostType>);
 #define LMP_FIX_NVT_SLLOD_KOKKOS_H
 
 #include "fix_nh_kokkos.h"
-#include "kokkos_type.h"
 #include "kokkos_few.h"
+#include "kokkos_type.h"
 
+// clang-format off
 namespace LAMMPS_NS {
 
 struct TagFixNVTSllod_temp1{};
@@ -35,7 +36,7 @@ template<class DeviceType>
 class FixNVTSllodKokkos : public FixNHKokkos<DeviceType> {
  public:
   FixNVTSllodKokkos(class LAMMPS *, int, char **);
-  ~FixNVTSllodKokkos();
+  ~FixNVTSllodKokkos() {}
   void init();
 
   KOKKOS_INLINE_FUNCTION
diff --git a/src/KOKKOS/pppm_kokkos.cpp b/src/KOKKOS/pppm_kokkos.cpp
index 16cdce16de..a7f58f2525 100644
--- a/src/KOKKOS/pppm_kokkos.cpp
+++ b/src/KOKKOS/pppm_kokkos.cpp
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://lammps.org/, Sandia National Laboratories
+   https://www.lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
diff --git a/src/KOKKOS/pppm_kokkos.h b/src/KOKKOS/pppm_kokkos.h
index 9a69a82762..9a333135b7 100644
--- a/src/KOKKOS/pppm_kokkos.h
+++ b/src/KOKKOS/pppm_kokkos.h
@@ -1,7 +1,6 @@
-// clang-format off
 /* -*- c++ -*- ----------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
-   https://lammps.org/, Sandia National Laboratories
+   https://www.lammps.org/, Sandia National Laboratories
    Steve Plimpton, sjplimp@sandia.gov
 
    Copyright (2003) Sandia Corporation.  Under the terms of Contract
@@ -14,9 +13,9 @@
 
 #ifdef KSPACE_CLASS
 // clang-format off
-KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>)
-KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>)
-KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
+KSpaceStyle(pppm/kk,PPPMKokkos<LMPDeviceType>);
+KSpaceStyle(pppm/kk/device,PPPMKokkos<LMPDeviceType>);
+KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>);
 // clang-format on
 #else
 
@@ -31,6 +30,8 @@ KSpaceStyle(pppm/kk/host,PPPMKokkos<LMPHostType>)
 #include "kokkos_type.h"
 #include "kokkos_few.h"
 
+// clang-format off
+
 // fix up FFT defines for KOKKOS with CUDA
 
 #ifdef KOKKOS_ENABLE_CUDA
diff --git a/src/compute_temp_deform.cpp b/src/compute_temp_deform.cpp
index 7b17b12fd8..915e682219 100644
--- a/src/compute_temp_deform.cpp
+++ b/src/compute_temp_deform.cpp
@@ -1,4 +1,3 @@
-// clang-format off
 /* ----------------------------------------------------------------------
    LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
    https://www.lammps.org/, Sandia National Laboratories
@@ -18,27 +17,25 @@
 
 #include "compute_temp_deform.h"
 
-#include <cstring>
-#include "domain.h"
 #include "atom.h"
-#include "update.h"
-#include "force.h"
-#include "modify.h"
+#include "comm.h"
+#include "domain.h"
+#include "error.h"
 #include "fix.h"
 #include "fix_deform.h"
+#include "force.h"
 #include "group.h"
-#include "comm.h"
 #include "memory.h"
-#include "error.h"
+#include "modify.h"
+#include "update.h"
 
 using namespace LAMMPS_NS;
 
 /* ---------------------------------------------------------------------- */
 
-ComputeTempDeform::ComputeTempDeform(LAMMPS *lmp, int narg, char **arg) :
-  Compute(lmp, narg, arg)
+ComputeTempDeform::ComputeTempDeform(LAMMPS *lmp, int narg, char **arg) : Compute(lmp, narg, arg)
 {
-  if (narg != 3) error->all(FLERR,"Illegal compute temp/deform command");
+  if (narg != 3) error->all(FLERR, "Illegal compute temp/deform command");
 
   scalar_flag = vector_flag = 1;
   size_vector = 6;
@@ -56,10 +53,9 @@ ComputeTempDeform::ComputeTempDeform(LAMMPS *lmp, int narg, char **arg) :
 
 ComputeTempDeform::~ComputeTempDeform()
 {
-  if (!copymode)
-  {
+  if (!copymode) {
     memory->destroy(vbiasall);
-    delete [] vector;
+    delete[] vector;
   }
 }
 
@@ -72,16 +68,14 @@ void ComputeTempDeform::init()
   // check fix deform remap settings
 
   for (i = 0; i < modify->nfix; i++)
-    if (strncmp(modify->fix[i]->style,"deform", 6) == 0) {
-      if (((FixDeform *) modify->fix[i])->remapflag == Domain::X_REMAP &&
-          comm->me == 0)
-        error->warning(FLERR,"Using compute temp/deform with inconsistent "
-                       "fix deform remap option");
+    if (utils::strmatch(modify->fix[i]->style, "^deform")) {
+      if (((FixDeform *) modify->fix[i])->remapflag == Domain::X_REMAP && comm->me == 0)
+        error->warning(FLERR,
+                       "Using compute temp/deform with inconsistent fix deform remap option");
       break;
     }
   if (i == modify->nfix && comm->me == 0)
-    error->warning(FLERR,
-                   "Using compute temp/deform with no fix deform defined");
+    error->warning(FLERR, "Using compute temp/deform with no fix deform defined");
 }
 
 /* ---------------------------------------------------------------------- */
@@ -101,15 +95,17 @@ void ComputeTempDeform::dof_compute()
   natoms_temp = group->count(igroup);
   dof = domain->dimension * natoms_temp;
   dof -= extra_dof + fix_dof;
-  if (dof > 0) tfactor = force->mvv2e / (dof * force->boltz);
-  else tfactor = 0.0;
+  if (dof > 0)
+    tfactor = force->mvv2e / (dof * force->boltz);
+  else
+    tfactor = 0.0;
 }
 
 /* ---------------------------------------------------------------------- */
 
 double ComputeTempDeform::compute_scalar()
 {
-  double lamda[3],vstream[3],vthermal[3];
+  double lamda[3], vstream[3], vthermal[3];
 
   invoked_scalar = update->ntimestep;
 
@@ -132,26 +128,25 @@ double ComputeTempDeform::compute_scalar()
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
-      domain->x2lamda(x[i],lamda);
-      vstream[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
-        h_rate[4]*lamda[2] + h_ratelo[0];
-      vstream[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
-      vstream[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      domain->x2lamda(x[i], lamda);
+      vstream[0] = h_rate[0] * lamda[0] + h_rate[5] * lamda[1] + h_rate[4] * lamda[2] + h_ratelo[0];
+      vstream[1] = h_rate[1] * lamda[1] + h_rate[3] * lamda[2] + h_ratelo[1];
+      vstream[2] = h_rate[2] * lamda[2] + h_ratelo[2];
       vthermal[0] = v[i][0] - vstream[0];
       vthermal[1] = v[i][1] - vstream[1];
       vthermal[2] = v[i][2] - vstream[2];
       if (rmass)
-        t += (vthermal[0]*vthermal[0] + vthermal[1]*vthermal[1] +
-              vthermal[2]*vthermal[2]) * rmass[i];
+        t += (vthermal[0] * vthermal[0] + vthermal[1] * vthermal[1] + vthermal[2] * vthermal[2]) *
+            rmass[i];
       else
-        t += (vthermal[0]*vthermal[0] + vthermal[1]*vthermal[1] +
-              vthermal[2]*vthermal[2]) * mass[type[i]];
+        t += (vthermal[0] * vthermal[0] + vthermal[1] * vthermal[1] + vthermal[2] * vthermal[2]) *
+            mass[type[i]];
     }
 
-  MPI_Allreduce(&t,&scalar,1,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(&t, &scalar, 1, MPI_DOUBLE, MPI_SUM, world);
   if (dynamic) dof_compute();
   if (dof < 0.0 && natoms_temp > 0.0)
-    error->all(FLERR,"Temperature compute degrees of freedom < 0");
+    error->all(FLERR, "Temperature compute degrees of freedom < 0");
   scalar *= tfactor;
   return scalar;
 }
@@ -160,7 +155,7 @@ double ComputeTempDeform::compute_scalar()
 
 void ComputeTempDeform::compute_vector()
 {
-  double lamda[3],vstream[3],vthermal[3];
+  double lamda[3], vstream[3], vthermal[3];
 
   invoked_vector = update->ntimestep;
 
@@ -175,31 +170,32 @@ void ComputeTempDeform::compute_vector()
   double *h_rate = domain->h_rate;
   double *h_ratelo = domain->h_ratelo;
 
-  double massone,t[6];
+  double massone, t[6];
   for (int i = 0; i < 6; i++) t[i] = 0.0;
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
-      domain->x2lamda(x[i],lamda);
-      vstream[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
-        h_rate[4]*lamda[2] + h_ratelo[0];
-      vstream[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
-      vstream[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      domain->x2lamda(x[i], lamda);
+      vstream[0] = h_rate[0] * lamda[0] + h_rate[5] * lamda[1] + h_rate[4] * lamda[2] + h_ratelo[0];
+      vstream[1] = h_rate[1] * lamda[1] + h_rate[3] * lamda[2] + h_ratelo[1];
+      vstream[2] = h_rate[2] * lamda[2] + h_ratelo[2];
       vthermal[0] = v[i][0] - vstream[0];
       vthermal[1] = v[i][1] - vstream[1];
       vthermal[2] = v[i][2] - vstream[2];
 
-      if (rmass) massone = rmass[i];
-      else massone = mass[type[i]];
-      t[0] += massone * vthermal[0]*vthermal[0];
-      t[1] += massone * vthermal[1]*vthermal[1];
-      t[2] += massone * vthermal[2]*vthermal[2];
-      t[3] += massone * vthermal[0]*vthermal[1];
-      t[4] += massone * vthermal[0]*vthermal[2];
-      t[5] += massone * vthermal[1]*vthermal[2];
+      if (rmass)
+        massone = rmass[i];
+      else
+        massone = mass[type[i]];
+      t[0] += massone * vthermal[0] * vthermal[0];
+      t[1] += massone * vthermal[1] * vthermal[1];
+      t[2] += massone * vthermal[2] * vthermal[2];
+      t[3] += massone * vthermal[0] * vthermal[1];
+      t[4] += massone * vthermal[0] * vthermal[2];
+      t[5] += massone * vthermal[1] * vthermal[2];
     }
 
-  MPI_Allreduce(t,vector,6,MPI_DOUBLE,MPI_SUM,world);
+  MPI_Allreduce(t, vector, 6, MPI_DOUBLE, MPI_SUM, world);
   for (int i = 0; i < 6; i++) vector[i] *= force->mvv2e;
 }
 
@@ -213,11 +209,10 @@ void ComputeTempDeform::remove_bias(int i, double *v)
   double *h_rate = domain->h_rate;
   double *h_ratelo = domain->h_ratelo;
 
-  domain->x2lamda(atom->x[i],lamda);
-  vbias[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
-    h_rate[4]*lamda[2] + h_ratelo[0];
-  vbias[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
-  vbias[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+  domain->x2lamda(atom->x[i], lamda);
+  vbias[0] = h_rate[0] * lamda[0] + h_rate[5] * lamda[1] + h_rate[4] * lamda[2] + h_ratelo[0];
+  vbias[1] = h_rate[1] * lamda[1] + h_rate[3] * lamda[2] + h_ratelo[1];
+  vbias[2] = h_rate[2] * lamda[2] + h_ratelo[2];
   v[0] -= vbias[0];
   v[1] -= vbias[1];
   v[2] -= vbias[2];
@@ -233,11 +228,10 @@ void ComputeTempDeform::remove_bias_thr(int i, double *v, double *b)
   double *h_rate = domain->h_rate;
   double *h_ratelo = domain->h_ratelo;
 
-  domain->x2lamda(atom->x[i],lamda);
-  b[0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
-    h_rate[4]*lamda[2] + h_ratelo[0];
-  b[1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
-  b[2] = h_rate[2]*lamda[2] + h_ratelo[2];
+  domain->x2lamda(atom->x[i], lamda);
+  b[0] = h_rate[0] * lamda[0] + h_rate[5] * lamda[1] + h_rate[4] * lamda[2] + h_ratelo[0];
+  b[1] = h_rate[1] * lamda[1] + h_rate[3] * lamda[2] + h_ratelo[1];
+  b[2] = h_rate[2] * lamda[2] + h_ratelo[2];
   v[0] -= b[0];
   v[1] -= b[1];
   v[2] -= b[2];
@@ -256,7 +250,7 @@ void ComputeTempDeform::remove_bias_all()
   if (atom->nmax > maxbias) {
     memory->destroy(vbiasall);
     maxbias = atom->nmax;
-    memory->create(vbiasall,maxbias,3,"temp/deform:vbiasall");
+    memory->create(vbiasall, maxbias, 3, "temp/deform:vbiasall");
   }
 
   double lamda[3];
@@ -265,11 +259,11 @@ void ComputeTempDeform::remove_bias_all()
 
   for (int i = 0; i < nlocal; i++)
     if (mask[i] & groupbit) {
-      domain->x2lamda(atom->x[i],lamda);
-      vbiasall[i][0] = h_rate[0]*lamda[0] + h_rate[5]*lamda[1] +
-        h_rate[4]*lamda[2] + h_ratelo[0];
-      vbiasall[i][1] = h_rate[1]*lamda[1] + h_rate[3]*lamda[2] + h_ratelo[1];
-      vbiasall[i][2] = h_rate[2]*lamda[2] + h_ratelo[2];
+      domain->x2lamda(atom->x[i], lamda);
+      vbiasall[i][0] =
+          h_rate[0] * lamda[0] + h_rate[5] * lamda[1] + h_rate[4] * lamda[2] + h_ratelo[0];
+      vbiasall[i][1] = h_rate[1] * lamda[1] + h_rate[3] * lamda[2] + h_ratelo[1];
+      vbiasall[i][2] = h_rate[2] * lamda[2] + h_ratelo[2];
       v[i][0] -= vbiasall[i][0];
       v[i][1] -= vbiasall[i][1];
       v[i][2] -= vbiasall[i][2];
@@ -323,6 +317,6 @@ void ComputeTempDeform::restore_bias_all()
 
 double ComputeTempDeform::memory_usage()
 {
-  double bytes = 3*maxbias * sizeof(double);
+  double bytes = 3 * maxbias * sizeof(double);
   return bytes;
 }

From 3dc142c0b00d9860c3ceb908a23cf0a28ff80cba Mon Sep 17 00:00:00 2001
From: Emily Kahl <emily.v.kahl@gmail.com>
Date: Tue, 24 Aug 2021 16:23:05 +1000
Subject: [PATCH 13/15] Added fix_nvt_sllod_kokkos and
 compute_temp_deform_kokkos to Install.sh

---
 src/KOKKOS/Install.sh | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh
index 5e7285a75f..e23dd11500 100755
--- a/src/KOKKOS/Install.sh
+++ b/src/KOKKOS/Install.sh
@@ -43,7 +43,7 @@ elif (test $mode = 0) then
   fi
 fi
 
-# list of files with optional dependcies
+# list of files with optional dependencies
 
 action angle_charmm_kokkos.cpp angle_charmm.cpp
 action angle_charmm_kokkos.h angle_charmm.h
@@ -94,6 +94,8 @@ action compute_orientorder_atom_kokkos.cpp
 action compute_orientorder_atom_kokkos.h
 action compute_temp_kokkos.cpp
 action compute_temp_kokkos.h
+action compute_temp_deform_kokkos.cpp
+action compute_temp_deform_kokkos.h
 action dihedral_charmm_kokkos.cpp dihedral_charmm.cpp
 action dihedral_charmm_kokkos.h dihedral_charmm.h
 action dihedral_class2_kokkos.cpp dihedral_class2.cpp
@@ -135,6 +137,8 @@ action fix_nve_sphere_kokkos.cpp
 action fix_nve_sphere_kokkos.h
 action fix_nvt_kokkos.cpp
 action fix_nvt_kokkos.h
+action fix_nvt_sllod_kokkos.cpp
+action fix_nvt_sllod_kokkos.h
 action fix_property_atom_kokkos.cpp
 action fix_property_atom_kokkos.h
 action fix_qeq_reaxff_kokkos.cpp fix_qeq_reaxff.cpp

From 862cb43fa9a4c3d8dbb1219245ff6f605df93912 Mon Sep 17 00:00:00 2001
From: Vsevak <thevsevak@gmail.com>
Date: Tue, 24 Aug 2021 17:07:51 +0300
Subject: [PATCH 14/15] Enable unittest for GPU lj/cut/tip4p/long

---
 unittest/force-styles/tests/mol-pair-lj_cut_tip4p_long.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/unittest/force-styles/tests/mol-pair-lj_cut_tip4p_long.yaml b/unittest/force-styles/tests/mol-pair-lj_cut_tip4p_long.yaml
index 2e76bdcb57..d1bed9430b 100644
--- a/unittest/force-styles/tests/mol-pair-lj_cut_tip4p_long.yaml
+++ b/unittest/force-styles/tests/mol-pair-lj_cut_tip4p_long.yaml
@@ -2,7 +2,6 @@
 lammps_version: 10 Feb 2021
 date_generated: Fri Feb 26 23:08:49 2021
 epsilon: 1e-13
-skip_tests: gpu
 prerequisites: ! |
   atom full
   pair lj/cut/tip4p/long

From a26da031aa830bee204432807cbc34afe50bd5f5 Mon Sep 17 00:00:00 2001
From: Richard Berger <richard.berger@temple.edu>
Date: Tue, 24 Aug 2021 11:27:30 -0400
Subject: [PATCH 15/15] Use .coveragerc to configure Python coverage reporting

---
 cmake/.coveragerc.in             | 10 ++++++++++
 cmake/Modules/CodeCoverage.cmake | 10 ++++++----
 unittest/python/CMakeLists.txt   |  2 +-
 3 files changed, 17 insertions(+), 5 deletions(-)
 create mode 100644 cmake/.coveragerc.in

diff --git a/cmake/.coveragerc.in b/cmake/.coveragerc.in
new file mode 100644
index 0000000000..3dc467d4d0
--- /dev/null
+++ b/cmake/.coveragerc.in
@@ -0,0 +1,10 @@
+[run]
+source = @LAMMPS_PYTHON_DIR@
+parallel=True
+branch=True
+omit=*/install.py
+     */setup.py
+
+[paths]
+sources = python
+          @LAMMPS_PYTHON_DIR@
diff --git a/cmake/Modules/CodeCoverage.cmake b/cmake/Modules/CodeCoverage.cmake
index 054e08fc1a..21a651e519 100644
--- a/cmake/Modules/CodeCoverage.cmake
+++ b/cmake/Modules/CodeCoverage.cmake
@@ -54,6 +54,8 @@ if(ENABLE_COVERAGE)
 
     if(COVERAGE_FOUND)
         set(PYTHON_COVERAGE_HTML_DIR ${CMAKE_BINARY_DIR}/python_coverage_html)
+        configure_file(.coveragerc.in ${CMAKE_BINARY_DIR}/.coveragerc @ONLY)
+
         add_custom_command(
             OUTPUT ${CMAKE_BINARY_DIR}/unittest/python/.coverage
             COMMAND ${COVERAGE_BINARY} combine
@@ -63,16 +65,16 @@ if(ENABLE_COVERAGE)
 
         add_custom_target(
             gen_python_coverage_html
-            COMMAND ${COVERAGE_BINARY} html -d ${PYTHON_COVERAGE_HTML_DIR}
-            DEPENDS ${CMAKE_BINARY_DIR}/unittest/python/.coverage
+            COMMAND ${COVERAGE_BINARY} html --rcfile=${CMAKE_BINARY_DIR}/.coveragerc -d ${PYTHON_COVERAGE_HTML_DIR}
+            DEPENDS ${CMAKE_BINARY_DIR}/unittest/python/.coverage ${CMAKE_BINARY_DIR}/.coveragerc
             WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/unittest/python
             COMMENT "Generating HTML Python coverage report..."
         )
 
         add_custom_target(
             gen_python_coverage_xml
-            COMMAND ${COVERAGE_BINARY} xml -o ${CMAKE_BINARY_DIR}/python_coverage.xml
-            DEPENDS ${CMAKE_BINARY_DIR}/unittest/python/.coverage
+            COMMAND ${COVERAGE_BINARY} xml --rcfile=${CMAKE_BINARY_DIR}/.coveragerc -o ${CMAKE_BINARY_DIR}/python_coverage.xml
+            DEPENDS ${CMAKE_BINARY_DIR}/unittest/python/.coverage ${CMAKE_BINARY_DIR}/.coveragerc
             WORKING_DIRECTORY ${CMAKE_BINARY_DIR}/unittest/python
             COMMENT "Generating XML Python coverage report..."
         )
diff --git a/unittest/python/CMakeLists.txt b/unittest/python/CMakeLists.txt
index 6832f7e028..d76f73aceb 100644
--- a/unittest/python/CMakeLists.txt
+++ b/unittest/python/CMakeLists.txt
@@ -46,7 +46,7 @@ if(Python_EXECUTABLE)
     find_package_handle_standard_args(COVERAGE DEFAULT_MSG COVERAGE_BINARY)
 
     if(COVERAGE_FOUND)
-        set(PYTHON_TEST_RUNNER ${Python_EXECUTABLE} -u ${COVERAGE_BINARY} run --parallel-mode --include=${LAMMPS_PYTHON_DIR}/lammps/*.py --omit=${LAMMPS_PYTHON_DIR}/install.py)
+        set(PYTHON_TEST_RUNNER ${Python_EXECUTABLE} -u ${COVERAGE_BINARY} run --rcfile=${CMAKE_BINARY_DIR}/.coveragerc)
     else()
         set(PYTHON_TEST_RUNNER ${Python_EXECUTABLE} -u)
     endif()