lammps/src/KOKKOS/fix_shake_kokkos.cpp

// clang-format off
/* ----------------------------------------------------------------------
   LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator
   https://www.lammps.org/, Sandia National Laboratories
   Steve Plimpton, sjplimp@sandia.gov

   Copyright (2003) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the GNU General Public License.

   See the README file in the top-level LAMMPS directory.
------------------------------------------------------------------------- */

#include <mpi.h>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <cstdio>
#include "fix_shake_kokkos.h"
#include "fix_rattle.h"
#include "atom_kokkos.h"
#include "atom_vec.h"
#include "molecule.h"
#include "update.h"
#include "respa.h"
#include "modify.h"
#include "domain.h"
#include "force.h"
#include "bond.h"
#include "angle.h"
#include "comm.h"
#include "group.h"
#include "fix_respa.h"
#include "math_const.h"
#include "memory_kokkos.h"
#include "error.h"
#include "kokkos.h"
#include "atom_masks.h"

using namespace LAMMPS_NS;
using namespace FixConst;
using namespace MathConst;

#define RVOUS 1   // 0 for irregular, 1 for all2all

#define BIG 1.0e20
#define MASSDELTA 0.1

/* ---------------------------------------------------------------------- */

template<class DeviceType>
FixShakeKokkos<DeviceType>::FixShakeKokkos(LAMMPS *lmp, int narg, char **arg) :
  FixShake(lmp, narg, arg)
{
  kokkosable = 1;
  forward_comm_device = 1;
  atomKK = (AtomKokkos *)atom;
  execution_space = ExecutionSpaceFromDevice<DeviceType>::space;

  datamask_read = EMPTY_MASK;
  datamask_modify = EMPTY_MASK;

  shake_flag_tmp = shake_flag;
  shake_atom_tmp = shake_atom;
  shake_type_tmp = shake_type;

  shake_flag = nullptr;
  shake_atom = nullptr;
  shake_type = nullptr;

  int nmax = atom->nmax;

  grow_arrays(nmax);

  for (int i = 0; i < nmax; i++) {
    k_shake_flag.h_view[i] = shake_flag_tmp[i];
    k_shake_atom.h_view(i,0) = shake_atom_tmp[i][0];
    k_shake_atom.h_view(i,1) = shake_atom_tmp[i][1];
    k_shake_atom.h_view(i,2) = shake_atom_tmp[i][2];
    k_shake_atom.h_view(i,3) = shake_atom_tmp[i][3];
    k_shake_type.h_view(i,0) = shake_type_tmp[i][0];
    k_shake_type.h_view(i,1) = shake_type_tmp[i][1];
    k_shake_type.h_view(i,2) = shake_type_tmp[i][2];
  }

  k_shake_flag.modify_host();
  k_shake_atom.modify_host();
  k_shake_type.modify_host();

  k_bond_distance = DAT::tdual_float_1d("fix_shake:bond_distance",atom->nbondtypes+1);
  k_angle_distance = DAT::tdual_float_1d("fix_shake:angle_distance",atom->nangletypes+1);

  d_bond_distance = k_bond_distance.view<DeviceType>();
  d_angle_distance = k_angle_distance.view<DeviceType>();

  // use 1D view for scalars to reduce GPU memory operations

  d_scalars = typename AT::t_int_1d("neighbor:scalars",2);
  h_scalars = HAT::t_int_1d("neighbor:scalars_mirror",2);

  d_error_flag = Kokkos::subview(d_scalars,0);
  d_nlist = Kokkos::subview(d_scalars,1);

  h_error_flag = Kokkos::subview(h_scalars,0);
  h_nlist = Kokkos::subview(h_scalars,1);

  memory->destroy(shake_flag_tmp);
  memory->destroy(shake_atom_tmp);
  memory->destroy(shake_type_tmp);
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
FixShakeKokkos<DeviceType>::~FixShakeKokkos()
{
  if (copymode) return;

  k_shake_flag.sync_host();
  k_shake_atom.sync_host();

  for (int i = 0; i < nlocal; i++) {
    if (shake_flag[i] == 0) continue;
    else if (shake_flag[i] == 1) {
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][1],1);
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][2],1);
      angletype_findset(i,shake_atom[i][1],shake_atom[i][2],1);
    } else if (shake_flag[i] == 2) {
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][1],1);
    } else if (shake_flag[i] == 3) {
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][1],1);
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][2],1);
    } else if (shake_flag[i] == 4) {
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][1],1);
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][2],1);
      bondtype_findset(i,shake_atom[i][0],shake_atom[i][3],1);
    }
  }

  memoryKK->destroy_kokkos(k_shake_flag,shake_flag);
  memoryKK->destroy_kokkos(k_shake_atom,shake_atom);
  memoryKK->destroy_kokkos(k_shake_type,shake_type);
  memoryKK->destroy_kokkos(k_xshake,xshake);
  memoryKK->destroy_kokkos(k_list,list);

  memoryKK->destroy_kokkos(k_vatom,vatom);
}

/* ----------------------------------------------------------------------
   set bond and angle distances
   this init must happen after force->bond and force->angle inits
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::init()
{
  FixShake::init();

  if (utils::strmatch(update->integrate_style,"^respa"))
    error->all(FLERR,"Cannot yet use respa with Kokkos");

  if (rattle)
    error->all(FLERR,"Cannot yet use KOKKOS package with fix rattle");

  // set equilibrium bond distances

  for (int i = 1; i <= atom->nbondtypes; i++)
    k_bond_distance.h_view[i] = bond_distance[i];

  // set equilibrium angle distances

  for (int i = 1; i <= atom->nangletypes; i++)
    k_angle_distance.h_view[i] = angle_distance[i];

  k_bond_distance.modify_host();
  k_angle_distance.modify_host();

  k_bond_distance.sync<DeviceType>();
  k_angle_distance.sync<DeviceType>();
}


/* ----------------------------------------------------------------------
   build list of SHAKE clusters to constrain
   if one or more atoms in cluster are on this proc,
     this proc lists the cluster exactly once
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::pre_neighbor()
{
  // local copies of atom quantities
  // used by SHAKE until next re-neighboring

  x = atom->x;
  v = atom->v;
  f = atom->f;
  mass = atom->mass;
  rmass = atom->rmass;
  type = atom->type;
  nlocal = atom->nlocal;

  map_style = atom->map_style;
  if (map_style == Atom::MAP_ARRAY) {
    k_map_array = atomKK->k_map_array;
    k_map_array.template sync<DeviceType>();
  } else if (map_style == Atom::MAP_HASH) {
    k_map_hash = atomKK->k_map_hash;
  }

  k_shake_flag.sync<DeviceType>();
  k_shake_atom.sync<DeviceType>();

  // extend size of SHAKE list if necessary

  if (nlocal > maxlist) {
    maxlist = nlocal;
    memoryKK->destroy_kokkos(k_list,list);
    memoryKK->create_kokkos(k_list,list,maxlist,"shake:list");
    d_list = k_list.view<DeviceType>();
  }

  // Atom Map

  map_style = atom->map_style;

  if (map_style == Atom::MAP_ARRAY) {
    k_map_array = atomKK->k_map_array;
    k_map_array.template sync<DeviceType>();
  } else if (map_style == Atom::MAP_HASH) {
    k_map_hash = atomKK->k_map_hash;
  }

  // build list of SHAKE clusters I compute

  Kokkos::deep_copy(d_scalars,0);

  {
    // local variables for lambda capture

    auto d_shake_flag = this->d_shake_flag;
    auto d_shake_atom = this->d_shake_atom;
    auto d_list = this->d_list;
    auto d_error_flag = this->d_error_flag;
    auto d_nlist = this->d_nlist;
    auto map_style = atom->map_style;
    auto k_map_array = this->k_map_array;
    auto k_map_hash = this->k_map_hash;

    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal),
     LAMMPS_LAMBDA(const int& i) {
      if (d_shake_flag[i]) {
        if (d_shake_flag[i] == 2) {
          const int atom1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,0),map_style,k_map_array,k_map_hash);
          const int atom2 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,1),map_style,k_map_array,k_map_hash);
          if (atom1 == -1 || atom2 == -1) {
            d_error_flag() = 1;
          }
          if (i <= atom1 && i <= atom2) {
            const int nlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
            d_list[nlist] = i;
          }
        } else if (d_shake_flag[i] % 2 == 1) {
          const int atom1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,0),map_style,k_map_array,k_map_hash);
          const int atom2 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,1),map_style,k_map_array,k_map_hash);
          const int atom3 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,2),map_style,k_map_array,k_map_hash);
          if (atom1 == -1 || atom2 == -1 || atom3 == -1)
            d_error_flag() = 1;
          if (i <= atom1 && i <= atom2 && i <= atom3) {
            const int nlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
            d_list[nlist] = i;
          }
        } else {
          const int atom1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,0),map_style,k_map_array,k_map_hash);
          const int atom2 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,1),map_style,k_map_array,k_map_hash);
          const int atom3 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,2),map_style,k_map_array,k_map_hash);
          const int atom4 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(i,3),map_style,k_map_array,k_map_hash);
          if (atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1)
            d_error_flag() = 1;
          if (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4) {
            const int nlist = Kokkos::atomic_fetch_add(&d_nlist(),1);
            d_list[nlist] = i;
          }
        }
      }
    });
  }

  Kokkos::deep_copy(h_scalars,d_scalars);
  nlist = h_nlist();

  if (h_error_flag() == 1) {
    error->one(FLERR,"Shake atoms missing on proc "
                                 "{} at step {}",me,update->ntimestep);
  }
}

/* ----------------------------------------------------------------------
   compute the force adjustment for SHAKE constraint
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::post_force(int vflag)
{
  copymode = 1;

  d_x = atomKK->k_x.view<DeviceType>();
  d_f = atomKK->k_f.view<DeviceType>();
  d_type = atomKK->k_type.view<DeviceType>();
  d_rmass = atomKK->k_rmass.view<DeviceType>();
  d_mass = atomKK->k_mass.view<DeviceType>();
  nlocal = atomKK->nlocal;

  map_style = atom->map_style;
  if (map_style == Atom::MAP_ARRAY) {
    k_map_array = atomKK->k_map_array;
    k_map_array.template sync<DeviceType>();
  } else if (map_style == Atom::MAP_HASH) {
    k_map_hash = atomKK->k_map_hash;
  }

  if (d_rmass.data())
    atomKK->sync(execution_space,X_MASK|F_MASK|RMASS_MASK);
  else
    atomKK->sync(execution_space,X_MASK|F_MASK|TYPE_MASK);

  k_shake_flag.sync<DeviceType>();
  k_shake_atom.sync<DeviceType>();
  k_shake_type.sync<DeviceType>();

  if (update->ntimestep == next_output) {
    atomKK->sync(Host,X_MASK);
    k_shake_flag.sync_host();
    k_shake_atom.sync_host();
    k_shake_type.sync_host();
    stats();
  }

  // xshake = unconstrained move with current v,f
  // communicate results if necessary

  unconstrained_update();
  if (nprocs > 1) comm->forward_comm_fix(this);
  k_xshake.sync<DeviceType>();

  // virial setup

  v_init(vflag);

  // reallocate per-atom arrays if necessary

  if (vflag_atom) {
    memoryKK->destroy_kokkos(k_vatom,vatom);
    memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"improper:vatom");
    d_vatom = k_vatom.template view<KKDeviceType>();
  }


  neighflag = lmp->kokkos->neighflag;

  // FULL neighlist still needs atomics in fix shake

  if (neighflag == FULL) {
    if (lmp->kokkos->nthreads > 1 || lmp->kokkos->ngpus > 0)
      neighflag = HALFTHREAD;
    else
      neighflag = HALF;
  }

  need_dup = 0;
  if (neighflag != HALF)
    need_dup = std::is_same<typename NeedDup<HALFTHREAD,DeviceType>::value,Kokkos::Experimental::ScatterDuplicated>::value;

  // allocate duplicated memory

  if (need_dup) {
    dup_f            = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_f);
    dup_vatom        = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated>(d_vatom);
  } else {
    ndup_f            = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_f);
    ndup_vatom        = Kokkos::Experimental::create_scatter_view<Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated>(d_vatom);
  }

  Kokkos::deep_copy(d_error_flag,0);

  update_domain_variables();

  EV_FLOAT ev;

  // loop over clusters to add constraint forces

  if (neighflag == HALF) {
   if (evflag)
      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagFixShakePostForce<HALF,1> >(0,nlist),*this,ev);
    else
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixShakePostForce<HALF,0> >(0,nlist),*this);
  } else {
    if (evflag)
      Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType, TagFixShakePostForce<HALFTHREAD,1> >(0,nlist),*this,ev);
    else
      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixShakePostForce<HALFTHREAD,0> >(0,nlist),*this);
  }

  copymode = 0;

  Kokkos::deep_copy(h_error_flag,d_error_flag);

  if (h_error_flag() == 2)
    error->warning(FLERR,"Shake determinant < 0.0");
  else if (h_error_flag() == 3)
    error->one(FLERR,"Shake determinant = 0.0");

  // store vflag for coordinate_constraints_end_of_step()

  vflag_post_force = vflag;

  // reduction over duplicated memory

  if (need_dup)
    Kokkos::Experimental::contribute(d_f,dup_f);

  atomKK->modified(execution_space,F_MASK);

  if (vflag_global) {
    virial[0] += ev.v[0];
    virial[1] += ev.v[1];
    virial[2] += ev.v[2];
    virial[3] += ev.v[3];
    virial[4] += ev.v[4];
    virial[5] += ev.v[5];
  }

  if (vflag_atom) {
    k_vatom.template modify<DeviceType>();
    k_vatom.template sync<LMPHostType>();
  }

  // free duplicated memory

  if (need_dup) {
    dup_f = decltype(dup_f)();
    dup_vatom = decltype(dup_vatom)();
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::operator()(TagFixShakePostForce<NEIGHFLAG,EVFLAG>, const int &i, EV_FLOAT& ev) const {
  const int m = d_list[i];
  if (d_shake_flag[m] == 2) shake<NEIGHFLAG,EVFLAG>(m,ev);
  else if (d_shake_flag[m] == 3) shake3<NEIGHFLAG,EVFLAG>(m,ev);
  else if (d_shake_flag[m] == 4) shake4<NEIGHFLAG,EVFLAG>(m,ev);
  else shake3angle<NEIGHFLAG,EVFLAG>(m,ev);
}

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::operator()(TagFixShakePostForce<NEIGHFLAG,EVFLAG>, const int &i) const {
  EV_FLOAT ev;
  this->template operator()<NEIGHFLAG,EVFLAG>(TagFixShakePostForce<NEIGHFLAG,EVFLAG>(), i, ev);
}

/* ----------------------------------------------------------------------
   count # of degrees-of-freedom removed by SHAKE for atoms in igroup
------------------------------------------------------------------------- */

template<class DeviceType>
int FixShakeKokkos<DeviceType>::dof(int igroup)
{

  d_mask = atomKK->k_mask.view<DeviceType>();
  d_tag = atomKK->k_tag.view<DeviceType>();
  nlocal = atom->nlocal;

  atomKK->sync(execution_space,MASK_MASK|TAG_MASK);
  k_shake_flag.sync<DeviceType>();
  k_shake_atom.sync<DeviceType>();

  // count dof in a cluster if and only if
  // the central atom is in group and atom i is the central atom

  int n = 0;
  {
    // local variables for lambda capture

    auto d_shake_flag = this->d_shake_flag;
    auto d_shake_atom = this->d_shake_atom;
    auto tag = this->d_tag;
    auto mask = this->d_mask;
    auto groupbit = group->bitmask[igroup];

    Kokkos::parallel_reduce(Kokkos::RangePolicy<DeviceType>(0,nlocal),
     LAMMPS_LAMBDA(const int& i, int& n) {
      if (!(mask[i] & groupbit)) return;
      if (d_shake_flag[i] == 0) return;
      if (d_shake_atom(i,0) != tag[i]) return;
      if (d_shake_flag[i] == 1) n += 3;
      else if (d_shake_flag[i] == 2) n += 1;
      else if (d_shake_flag[i] == 3) n += 2;
      else if (d_shake_flag[i] == 4) n += 3;
    },n);
  }

  int nall;
  MPI_Allreduce(&n,&nall,1,MPI_INT,MPI_SUM,world);
  return nall;
}


/* ----------------------------------------------------------------------
   assumes NVE update, seems to be accurate enough for NVT,NPT,NPH as well
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::unconstrained_update()
{
  d_x = atomKK->k_x.view<DeviceType>();
  d_v = atomKK->k_v.view<DeviceType>();
  d_f = atomKK->k_f.view<DeviceType>();
  d_type = atomKK->k_type.view<DeviceType>();
  d_rmass = atomKK->k_rmass.view<DeviceType>();
  d_mass = atomKK->k_mass.view<DeviceType>();
  nlocal = atom->nlocal;

  if (d_rmass.data())
    atomKK->sync(execution_space,X_MASK|V_MASK|F_MASK|RMASS_MASK);
  else
    atomKK->sync(execution_space,X_MASK|V_MASK|F_MASK|TYPE_MASK);


  k_shake_flag.sync<DeviceType>();
  k_xshake.sync<DeviceType>();

  {
    // local variables for lambda capture

    auto d_shake_flag = this->d_shake_flag;
    auto d_xshake = this->d_xshake;
    auto x = this->d_x;
    auto v = this->d_v;
    auto f = this->d_f;
    auto dtfsq = this->dtfsq;
    auto dtv = this->dtv;

    if (d_rmass.data()) {

      auto rmass = this->d_rmass;

      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal),
       LAMMPS_LAMBDA(const int& i) {
        if (d_shake_flag[i]) {
          const double dtfmsq = dtfsq / rmass[i];
          d_xshake(i,0) = x(i,0) + dtv*v(i,0) + dtfmsq*f(i,0);
          d_xshake(i,1) = x(i,1) + dtv*v(i,1) + dtfmsq*f(i,1);
          d_xshake(i,2) = x(i,2) + dtv*v(i,2) + dtfmsq*f(i,2);
        } else d_xshake(i,2) = d_xshake(i,1) = d_xshake(i,0) = 0.0;
      });
    } else {

      auto mass = this->d_mass;
      auto type = this->d_type;

      Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType>(0,nlocal),
       LAMMPS_LAMBDA(const int& i) {
        if (d_shake_flag[i]) {
          const double dtfmsq = dtfsq / mass[type[i]];
          d_xshake(i,0) = x(i,0) + dtv*v(i,0) + dtfmsq*f(i,0);
          d_xshake(i,1) = x(i,1) + dtv*v(i,1) + dtfmsq*f(i,1);
          d_xshake(i,2) = x(i,2) + dtv*v(i,2) + dtfmsq*f(i,2);
        } else d_xshake(i,2) = d_xshake(i,1) = d_xshake(i,0) = 0.0;
      });
    }
  }

  k_xshake.modify<DeviceType>();
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::shake(int m, EV_FLOAT& ev) const
{

  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial

  auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
  auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();

  int nlist,list[2];
  double v[6];
  double invmass0,invmass1;

  // local atom IDs and constraint distances

  int i0 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,0),map_style,k_map_array,k_map_hash);
  int i1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,1),map_style,k_map_array,k_map_hash);
  double bond1 = d_bond_distance[d_shake_type(m,0)];

  // r01 = distance vec between atoms, with PBC

  double r01[3];
  r01[0] = d_x(i0,0) - d_x(i1,0);
  r01[1] = d_x(i0,1) - d_x(i1,1);
  r01[2] = d_x(i0,2) - d_x(i1,2);
  minimum_image(r01);

  // s01 = distance vec after unconstrained update, with PBC
  // use Domain::minimum_image_once(), not minimum_image()
  // b/c xshake values might be huge, due to e.g. fix gcmc

  double s01[3];
  s01[0] = d_xshake(i0,0) - d_xshake(i1,0);
  s01[1] = d_xshake(i0,1) - d_xshake(i1,1);
  s01[2] = d_xshake(i0,2) - d_xshake(i1,2);
  minimum_image_once(s01);

  // scalar distances between atoms

  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];

  // a,b,c = coeffs in quadratic equation for lamda

  if (d_rmass.data()) {
    invmass0 = 1.0/d_rmass[i0];
    invmass1 = 1.0/d_rmass[i1];
  } else {
    invmass0 = 1.0/d_mass[d_type[i0]];
    invmass1 = 1.0/d_mass[d_type[i1]];
  }

  double a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
  double b = 2.0 * (invmass0+invmass1) *
    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
  double c = s01sq - bond1*bond1;

  // error check

  double determ = b*b - 4.0*a*c;
  if (determ < 0.0) {
    //error->warning(FLERR,"Shake determinant < 0.0",0);
    d_error_flag() = 2;
    determ = 0.0;
  }

  // exact quadratic solution for lamda

  double lamda,lamda1,lamda2;
  lamda1 = (-b+sqrt(determ)) / (2.0*a);
  lamda2 = (-b-sqrt(determ)) / (2.0*a);

  if (fabs(lamda1) <= fabs(lamda2)) lamda = lamda1;
  else lamda = lamda2;

  // update forces if atom is owned by this processor

  lamda /= dtfsq;

  if (i0 < nlocal) {
    a_f(i0,0) += lamda*r01[0];
    a_f(i0,1) += lamda*r01[1];
    a_f(i0,2) += lamda*r01[2];
  }

  if (i1 < nlocal) {
    a_f(i1,0) -= lamda*r01[0];
    a_f(i1,1) -= lamda*r01[1];
    a_f(i1,2) -= lamda*r01[2];
  }

  if (EVFLAG) {
    nlist = 0;
    if (i0 < nlocal) list[nlist++] = i0;
    if (i1 < nlocal) list[nlist++] = i1;

    v[0] = lamda*r01[0]*r01[0];
    v[1] = lamda*r01[1]*r01[1];
    v[2] = lamda*r01[2]*r01[2];
    v[3] = lamda*r01[0]*r01[1];
    v[4] = lamda*r01[0]*r01[2];
    v[5] = lamda*r01[1]*r01[2];

    v_tally<NEIGHFLAG>(ev,nlist,list,2.0,v);
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::shake3(int m, EV_FLOAT& ev) const
{

  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial

  auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
  auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();

  int nlist,list[3];
  double v[6];
  double invmass0,invmass1,invmass2;

  // local atom IDs and constraint distances

  int i0 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,0),map_style,k_map_array,k_map_hash);
  int i1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,1),map_style,k_map_array,k_map_hash);
  int i2 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,2),map_style,k_map_array,k_map_hash);
  double bond1 = d_bond_distance[d_shake_type(m,0)];
  double bond2 = d_bond_distance[d_shake_type(m,1)];

  // r01,r02 = distance vec between atoms, with PBC

  double r01[3];
  r01[0] = d_x(i0,0) - d_x(i1,0);
  r01[1] = d_x(i0,1) - d_x(i1,1);
  r01[2] = d_x(i0,2) - d_x(i1,2);
  minimum_image(r01);

  double r02[3];
  r02[0] = d_x(i0,0) - d_x(i2,0);
  r02[1] = d_x(i0,1) - d_x(i2,1);
  r02[2] = d_x(i0,2) - d_x(i2,2);
  minimum_image(r02);

  // s01,s02 = distance vec after unconstrained update, with PBC
  // use Domain::minimum_image_once(), not minimum_image()
  // b/c xshake values might be huge, due to e.g. fix gcmc

  double s01[3];
  s01[0] = d_xshake(i0,0) - d_xshake(i1,0);
  s01[1] = d_xshake(i0,1) - d_xshake(i1,1);
  s01[2] = d_xshake(i0,2) - d_xshake(i1,2);
  minimum_image_once(s01);

  double s02[3];
  s02[0] = d_xshake(i0,0) - d_xshake(i2,0);
  s02[1] = d_xshake(i0,1) - d_xshake(i2,1);
  s02[2] = d_xshake(i0,2) - d_xshake(i2,2);
  minimum_image_once(s02);

  // scalar distances between atoms

  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
  double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2];
  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
  double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2];

  // matrix coeffs and rhs for lamda equations

  if (d_rmass.data()) {
    invmass0 = 1.0/d_rmass[i0];
    invmass1 = 1.0/d_rmass[i1];
    invmass2 = 1.0/d_rmass[i2];
  } else {
    invmass0 = 1.0/d_mass[d_type[i0]];
    invmass1 = 1.0/d_mass[d_type[i1]];
    invmass2 = 1.0/d_mass[d_type[i2]];
  }

  double a11 = 2.0 * (invmass0+invmass1) *
    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
  double a12 = 2.0 * invmass0 *
    (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]);
  double a21 = 2.0 * invmass0 *
    (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]);
  double a22 = 2.0 * (invmass0+invmass2) *
    (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]);

  // inverse of matrix

  double determ = a11*a22 - a12*a21;
  if (determ == 0.0) d_error_flag() = 3;
  //error->one(FLERR,"Shake determinant = 0.0");
  double determinv = 1.0/determ;

  double a11inv = a22*determinv;
  double a12inv = -a12*determinv;
  double a21inv = -a21*determinv;
  double a22inv = a11*determinv;

  // quadratic correction coeffs

  double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]);

  double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
  double quad1_0202 = invmass0*invmass0 * r02sq;
  double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102;

  double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
  double quad2_0101 = invmass0*invmass0 * r01sq;
  double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102;

  // iterate until converged

  double lamda01 = 0.0;
  double lamda02 = 0.0;
  int niter = 0;
  int done = 0;

  double quad1,quad2,b1,b2,lamda01_new,lamda02_new;

  while (!done && niter < max_iter) {
    quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 +
      quad1_0102 * lamda01*lamda02;
    quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 +
      quad2_0102 * lamda01*lamda02;

    b1 = bond1*bond1 - s01sq - quad1;
    b2 = bond2*bond2 - s02sq - quad2;

    lamda01_new = a11inv*b1 + a12inv*b2;
    lamda02_new = a21inv*b1 + a22inv*b2;

    done = 1;
    if (fabs(lamda01_new-lamda01) > tolerance) done = 0;
    if (fabs(lamda02_new-lamda02) > tolerance) done = 0;

    lamda01 = lamda01_new;
    lamda02 = lamda02_new;

    // stop iterations before we have a floating point overflow
    // max double is < 1.0e308, so 1e150 is a reasonable cutoff

    if (fabs(lamda01) > 1e150 || fabs(lamda02) > 1e150) done = 1;

    niter++;
  }

  // update forces if atom is owned by this processor

  lamda01 = lamda01/dtfsq;
  lamda02 = lamda02/dtfsq;

  if (i0 < nlocal) {
    a_f(i0,0) += lamda01*r01[0] + lamda02*r02[0];
    a_f(i0,1) += lamda01*r01[1] + lamda02*r02[1];
    a_f(i0,2) += lamda01*r01[2] + lamda02*r02[2];
  }

  if (i1 < nlocal) {
    a_f(i1,0) -= lamda01*r01[0];
    a_f(i1,1) -= lamda01*r01[1];
    a_f(i1,2) -= lamda01*r01[2];
  }

  if (i2 < nlocal) {
    a_f(i2,0) -= lamda02*r02[0];
    a_f(i2,1) -= lamda02*r02[1];
    a_f(i2,2) -= lamda02*r02[2];
  }

  if (EVFLAG) {
    nlist = 0;
    if (i0 < nlocal) list[nlist++] = i0;
    if (i1 < nlocal) list[nlist++] = i1;
    if (i2 < nlocal) list[nlist++] = i2;

    v[0] = lamda01*r01[0]*r01[0] + lamda02*r02[0]*r02[0];
    v[1] = lamda01*r01[1]*r01[1] + lamda02*r02[1]*r02[1];
    v[2] = lamda01*r01[2]*r01[2] + lamda02*r02[2]*r02[2];
    v[3] = lamda01*r01[0]*r01[1] + lamda02*r02[0]*r02[1];
    v[4] = lamda01*r01[0]*r01[2] + lamda02*r02[0]*r02[2];
    v[5] = lamda01*r01[1]*r01[2] + lamda02*r02[1]*r02[2];

    v_tally<NEIGHFLAG>(ev,nlist,list,3.0,v);
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::shake4(int m, EV_FLOAT& ev) const
{

  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial

  auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
  auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();

 int nlist,list[4];
  double v[6];
  double invmass0,invmass1,invmass2,invmass3;

  // local atom IDs and constraint distances

  int i0 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,0),map_style,k_map_array,k_map_hash);
  int i1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,1),map_style,k_map_array,k_map_hash);
  int i2 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,2),map_style,k_map_array,k_map_hash);
  int i3 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,3),map_style,k_map_array,k_map_hash);
  double bond1 = d_bond_distance[d_shake_type(m,0)];
  double bond2 = d_bond_distance[d_shake_type(m,1)];
  double bond3 = d_bond_distance[d_shake_type(m,2)];

  // r01,r02,r03 = distance vec between atoms, with PBC

  double r01[3];
  r01[0] = d_x(i0,0) - d_x(i1,0);
  r01[1] = d_x(i0,1) - d_x(i1,1);
  r01[2] = d_x(i0,2) - d_x(i1,2);
  minimum_image(r01);

  double r02[3];
  r02[0] = d_x(i0,0) - d_x(i2,0);
  r02[1] = d_x(i0,1) - d_x(i2,1);
  r02[2] = d_x(i0,2) - d_x(i2,2);
  minimum_image(r02);

  double r03[3];
  r03[0] = d_x(i0,0) - d_x(i3,0);
  r03[1] = d_x(i0,1) - d_x(i3,1);
  r03[2] = d_x(i0,2) - d_x(i3,2);
  minimum_image(r03);

  // s01,s02,s03 = distance vec after unconstrained update, with PBC
  // use Domain::minimum_image_once(), not minimum_image()
  // b/c xshake values might be huge, due to e.g. fix gcmc

  double s01[3];
  s01[0] = d_xshake(i0,0) - d_xshake(i1,0);
  s01[1] = d_xshake(i0,1) - d_xshake(i1,1);
  s01[2] = d_xshake(i0,2) - d_xshake(i1,2);
  minimum_image_once(s01);

  double s02[3];
  s02[0] = d_xshake(i0,0) - d_xshake(i2,0);
  s02[1] = d_xshake(i0,1) - d_xshake(i2,1);
  s02[2] = d_xshake(i0,2) - d_xshake(i2,2);
  minimum_image_once(s02);

  double s03[3];
  s03[0] = d_xshake(i0,0) - d_xshake(i3,0);
  s03[1] = d_xshake(i0,1) - d_xshake(i3,1);
  s03[2] = d_xshake(i0,2) - d_xshake(i3,2);
  minimum_image_once(s03);

  // scalar distances between atoms

  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
  double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2];
  double r03sq = r03[0]*r03[0] + r03[1]*r03[1] + r03[2]*r03[2];
  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
  double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2];
  double s03sq = s03[0]*s03[0] + s03[1]*s03[1] + s03[2]*s03[2];

  // matrix coeffs and rhs for lamda equations

  if (d_rmass.data()) {
    invmass0 = 1.0/d_rmass[i0];
    invmass1 = 1.0/d_rmass[i1];
    invmass2 = 1.0/d_rmass[i2];
    invmass3 = 1.0/d_rmass[i3];
  } else {
    invmass0 = 1.0/d_mass[d_type[i0]];
    invmass1 = 1.0/d_mass[d_type[i1]];
    invmass2 = 1.0/d_mass[d_type[i2]];
    invmass3 = 1.0/d_mass[d_type[i3]];
  }

  double a11 = 2.0 * (invmass0+invmass1) *
    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
  double a12 = 2.0 * invmass0 *
    (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]);
  double a13 = 2.0 * invmass0 *
    (s01[0]*r03[0] + s01[1]*r03[1] + s01[2]*r03[2]);
  double a21 = 2.0 * invmass0 *
    (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]);
  double a22 = 2.0 * (invmass0+invmass2) *
    (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]);
  double a23 = 2.0 * invmass0 *
    (s02[0]*r03[0] + s02[1]*r03[1] + s02[2]*r03[2]);
  double a31 = 2.0 * invmass0 *
    (s03[0]*r01[0] + s03[1]*r01[1] + s03[2]*r01[2]);
  double a32 = 2.0 * invmass0 *
    (s03[0]*r02[0] + s03[1]*r02[1] + s03[2]*r02[2]);
  double a33 = 2.0 * (invmass0+invmass3) *
    (s03[0]*r03[0] + s03[1]*r03[1] + s03[2]*r03[2]);

  // inverse of matrix;

  double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
    a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
  if (determ == 0.0) d_error_flag() = 3;
  //error->one(FLERR,"Shake determinant = 0.0");
  double determinv = 1.0/determ;

  double a11inv = determinv * (a22*a33 - a23*a32);
  double a12inv = -determinv * (a12*a33 - a13*a32);
  double a13inv = determinv * (a12*a23 - a13*a22);
  double a21inv = -determinv * (a21*a33 - a23*a31);
  double a22inv = determinv * (a11*a33 - a13*a31);
  double a23inv = -determinv * (a11*a23 - a13*a21);
  double a31inv = determinv * (a21*a32 - a22*a31);
  double a32inv = -determinv * (a11*a32 - a12*a31);
  double a33inv = determinv * (a11*a22 - a12*a21);

  // quadratic correction coeffs

  double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]);
  double r0103 = (r01[0]*r03[0] + r01[1]*r03[1] + r01[2]*r03[2]);
  double r0203 = (r02[0]*r03[0] + r02[1]*r03[1] + r02[2]*r03[2]);

  double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
  double quad1_0202 = invmass0*invmass0 * r02sq;
  double quad1_0303 = invmass0*invmass0 * r03sq;
  double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102;
  double quad1_0103 = 2.0 * (invmass0+invmass1)*invmass0 * r0103;
  double quad1_0203 = 2.0 * invmass0*invmass0 * r0203;

  double quad2_0101 = invmass0*invmass0 * r01sq;
  double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
  double quad2_0303 = invmass0*invmass0 * r03sq;
  double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102;
  double quad2_0103 = 2.0 * invmass0*invmass0 * r0103;
  double quad2_0203 = 2.0 * (invmass0+invmass2)*invmass0 * r0203;

  double quad3_0101 = invmass0*invmass0 * r01sq;
  double quad3_0202 = invmass0*invmass0 * r02sq;
  double quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq;
  double quad3_0102 = 2.0 * invmass0*invmass0 * r0102;
  double quad3_0103 = 2.0 * (invmass0+invmass3)*invmass0 * r0103;
  double quad3_0203 = 2.0 * (invmass0+invmass3)*invmass0 * r0203;

  // iterate until converged

  double lamda01 = 0.0;
  double lamda02 = 0.0;
  double lamda03 = 0.0;
  int niter = 0;
  int done = 0;

  double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new;

  while (!done && niter < max_iter) {
    quad1 = quad1_0101 * lamda01*lamda01 +
      quad1_0202 * lamda02*lamda02 +
      quad1_0303 * lamda03*lamda03 +
      quad1_0102 * lamda01*lamda02 +
      quad1_0103 * lamda01*lamda03 +
      quad1_0203 * lamda02*lamda03;

    quad2 = quad2_0101 * lamda01*lamda01 +
      quad2_0202 * lamda02*lamda02 +
      quad2_0303 * lamda03*lamda03 +
      quad2_0102 * lamda01*lamda02 +
      quad2_0103 * lamda01*lamda03 +
      quad2_0203 * lamda02*lamda03;

    quad3 = quad3_0101 * lamda01*lamda01 +
      quad3_0202 * lamda02*lamda02 +
      quad3_0303 * lamda03*lamda03 +
      quad3_0102 * lamda01*lamda02 +
      quad3_0103 * lamda01*lamda03 +
      quad3_0203 * lamda02*lamda03;

    b1 = bond1*bond1 - s01sq - quad1;
    b2 = bond2*bond2 - s02sq - quad2;
    b3 = bond3*bond3 - s03sq - quad3;

    lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
    lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
    lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3;

    done = 1;
    if (fabs(lamda01_new-lamda01) > tolerance) done = 0;
    if (fabs(lamda02_new-lamda02) > tolerance) done = 0;
    if (fabs(lamda03_new-lamda03) > tolerance) done = 0;

    lamda01 = lamda01_new;
    lamda02 = lamda02_new;
    lamda03 = lamda03_new;

    // stop iterations before we have a floating point overflow
    // max double is < 1.0e308, so 1e150 is a reasonable cutoff

    if (fabs(lamda01) > 1e150 || fabs(lamda02) > 1e150
        || fabs(lamda03) > 1e150) done = 1;

    niter++;
  }

  // update forces if atom is owned by this processor

  lamda01 = lamda01/dtfsq;
  lamda02 = lamda02/dtfsq;
  lamda03 = lamda03/dtfsq;

  if (i0 < nlocal) {
    a_f(i0,0) += lamda01*r01[0] + lamda02*r02[0] + lamda03*r03[0];
    a_f(i0,1) += lamda01*r01[1] + lamda02*r02[1] + lamda03*r03[1];
    a_f(i0,2) += lamda01*r01[2] + lamda02*r02[2] + lamda03*r03[2];
  }

  if (i1 < nlocal) {
    a_f(i1,0) -= lamda01*r01[0];
    a_f(i1,1) -= lamda01*r01[1];
    a_f(i1,2) -= lamda01*r01[2];
  }

  if (i2 < nlocal) {
    a_f(i2,0) -= lamda02*r02[0];
    a_f(i2,1) -= lamda02*r02[1];
    a_f(i2,2) -= lamda02*r02[2];
  }

  if (i3 < nlocal) {
    a_f(i3,0) -= lamda03*r03[0];
    a_f(i3,1) -= lamda03*r03[1];
    a_f(i3,2) -= lamda03*r03[2];
  }

  if (EVFLAG) {
    nlist = 0;
    if (i0 < nlocal) list[nlist++] = i0;
    if (i1 < nlocal) list[nlist++] = i1;
    if (i2 < nlocal) list[nlist++] = i2;
    if (i3 < nlocal) list[nlist++] = i3;

    v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda03*r03[0]*r03[0];
    v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda03*r03[1]*r03[1];
    v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda03*r03[2]*r03[2];
    v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda03*r03[0]*r03[1];
    v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda03*r03[0]*r03[2];
    v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda03*r03[1]*r03[2];

    v_tally<NEIGHFLAG>(ev,nlist,list,4.0,v);
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
template<int NEIGHFLAG, int EVFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::shake3angle(int m, EV_FLOAT& ev) const
{

  // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial

  auto v_f = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f);
  auto a_f = v_f.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();

  int nlist,list[3];
  double v[6];
  double invmass0,invmass1,invmass2;

  // local atom IDs and constraint distances

  int i0 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,0),map_style,k_map_array,k_map_hash);
  int i1 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,1),map_style,k_map_array,k_map_hash);
  int i2 = AtomKokkos::map_kokkos<DeviceType>(d_shake_atom(m,2),map_style,k_map_array,k_map_hash);
  double bond1 = d_bond_distance[d_shake_type(m,0)];
  double bond2 = d_bond_distance[d_shake_type(m,1)];
  double bond12 = d_angle_distance[d_shake_type(m,2)];

  // r01,r02,r12 = distance vec between atoms, with PBC

  double r01[3];
  r01[0] = d_x(i0,0) - d_x(i1,0);
  r01[1] = d_x(i0,1) - d_x(i1,1);
  r01[2] = d_x(i0,2) - d_x(i1,2);
  minimum_image(r01);

  double r02[3];
  r02[0] = d_x(i0,0) - d_x(i2,0);
  r02[1] = d_x(i0,1) - d_x(i2,1);
  r02[2] = d_x(i0,2) - d_x(i2,2);
  minimum_image(r02);

  double r12[3];
  r12[0] = d_x(i1,0) - d_x(i2,0);
  r12[1] = d_x(i1,1) - d_x(i2,1);
  r12[2] = d_x(i1,2) - d_x(i2,2);
  minimum_image(r12);

  // s01,s02,s12 = distance vec after unconstrained update, with PBC
  // use Domain::minimum_image_once(), not minimum_image()
  // b/c xshake values might be huge, due to e.g. fix gcmc

  double s01[3];
  s01[0] = d_xshake(i0,0) - d_xshake(i1,0);
  s01[1] = d_xshake(i0,1) - d_xshake(i1,1);
  s01[2] = d_xshake(i0,2) - d_xshake(i1,2);
  minimum_image_once(s01);

  double s02[3];
  s02[0] = d_xshake(i0,0) - d_xshake(i2,0);
  s02[1] = d_xshake(i0,1) - d_xshake(i2,1);
  s02[2] = d_xshake(i0,2) - d_xshake(i2,2);
  minimum_image_once(s02);

  double s12[3];
  s12[0] = d_xshake(i1,0) - d_xshake(i2,0);
  s12[1] = d_xshake(i1,1) - d_xshake(i2,1);
  s12[2] = d_xshake(i1,2) - d_xshake(i2,2);
  minimum_image_once(s12);

  // scalar distances between atoms

  double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2];
  double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2];
  double r12sq = r12[0]*r12[0] + r12[1]*r12[1] + r12[2]*r12[2];
  double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2];
  double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2];
  double s12sq = s12[0]*s12[0] + s12[1]*s12[1] + s12[2]*s12[2];

  // matrix coeffs and rhs for lamda equations

  if (d_rmass.data()) {
    invmass0 = 1.0/d_rmass[i0];
    invmass1 = 1.0/d_rmass[i1];
    invmass2 = 1.0/d_rmass[i2];
  } else {
    invmass0 = 1.0/d_mass[d_type[i0]];
    invmass1 = 1.0/d_mass[d_type[i1]];
    invmass2 = 1.0/d_mass[d_type[i2]];
  }

  double a11 = 2.0 * (invmass0+invmass1) *
    (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]);
  double a12 = 2.0 * invmass0 *
    (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]);
  double a13 = - 2.0 * invmass1 *
    (s01[0]*r12[0] + s01[1]*r12[1] + s01[2]*r12[2]);
  double a21 = 2.0 * invmass0 *
    (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]);
  double a22 = 2.0 * (invmass0+invmass2) *
    (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]);
  double a23 = 2.0 * invmass2 *
    (s02[0]*r12[0] + s02[1]*r12[1] + s02[2]*r12[2]);
  double a31 = - 2.0 * invmass1 *
    (s12[0]*r01[0] + s12[1]*r01[1] + s12[2]*r01[2]);
  double a32 = 2.0 * invmass2 *
    (s12[0]*r02[0] + s12[1]*r02[1] + s12[2]*r02[2]);
  double a33 = 2.0 * (invmass1+invmass2) *
    (s12[0]*r12[0] + s12[1]*r12[1] + s12[2]*r12[2]);

  // inverse of matrix

  double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 -
    a11*a23*a32 - a12*a21*a33 - a13*a22*a31;
  if (determ == 0.0) d_error_flag() = 3;
  //error->one(FLERR,"Shake determinant = 0.0");
  double determinv = 1.0/determ;

  double a11inv = determinv * (a22*a33 - a23*a32);
  double a12inv = -determinv * (a12*a33 - a13*a32);
  double a13inv = determinv * (a12*a23 - a13*a22);
  double a21inv = -determinv * (a21*a33 - a23*a31);
  double a22inv = determinv * (a11*a33 - a13*a31);
  double a23inv = -determinv * (a11*a23 - a13*a21);
  double a31inv = determinv * (a21*a32 - a22*a31);
  double a32inv = -determinv * (a11*a32 - a12*a31);
  double a33inv = determinv * (a11*a22 - a12*a21);

  // quadratic correction coeffs

  double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]);
  double r0112 = (r01[0]*r12[0] + r01[1]*r12[1] + r01[2]*r12[2]);
  double r0212 = (r02[0]*r12[0] + r02[1]*r12[1] + r02[2]*r12[2]);

  double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq;
  double quad1_0202 = invmass0*invmass0 * r02sq;
  double quad1_1212 = invmass1*invmass1 * r12sq;
  double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102;
  double quad1_0112 = - 2.0 * (invmass0+invmass1)*invmass1 * r0112;
  double quad1_0212 = - 2.0 * invmass0*invmass1 * r0212;

  double quad2_0101 = invmass0*invmass0 * r01sq;
  double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq;
  double quad2_1212 = invmass2*invmass2 * r12sq;
  double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102;
  double quad2_0112 = 2.0 * invmass0*invmass2 * r0112;
  double quad2_0212 = 2.0 * (invmass0+invmass2)*invmass2 * r0212;

  double quad3_0101 = invmass1*invmass1 * r01sq;
  double quad3_0202 = invmass2*invmass2 * r02sq;
  double quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq;
  double quad3_0102 = - 2.0 * invmass1*invmass2 * r0102;
  double quad3_0112 = - 2.0 * (invmass1+invmass2)*invmass1 * r0112;
  double quad3_0212 = 2.0 * (invmass1+invmass2)*invmass2 * r0212;

  // iterate until converged

  double lamda01 = 0.0;
  double lamda02 = 0.0;
  double lamda12 = 0.0;
  int niter = 0;
  int done = 0;

  double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new;

  while (!done && niter < max_iter) {

    quad1 = quad1_0101 * lamda01*lamda01 +
      quad1_0202 * lamda02*lamda02 +
      quad1_1212 * lamda12*lamda12 +
      quad1_0102 * lamda01*lamda02 +
      quad1_0112 * lamda01*lamda12 +
      quad1_0212 * lamda02*lamda12;

    quad2 = quad2_0101 * lamda01*lamda01 +
      quad2_0202 * lamda02*lamda02 +
      quad2_1212 * lamda12*lamda12 +
      quad2_0102 * lamda01*lamda02 +
      quad2_0112 * lamda01*lamda12 +
      quad2_0212 * lamda02*lamda12;

    quad3 = quad3_0101 * lamda01*lamda01 +
      quad3_0202 * lamda02*lamda02 +
      quad3_1212 * lamda12*lamda12 +
      quad3_0102 * lamda01*lamda02 +
      quad3_0112 * lamda01*lamda12 +
      quad3_0212 * lamda02*lamda12;

    b1 = bond1*bond1 - s01sq - quad1;
    b2 = bond2*bond2 - s02sq - quad2;
    b3 = bond12*bond12 - s12sq - quad3;

    lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3;
    lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3;
    lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3;

    done = 1;
    if (fabs(lamda01_new-lamda01) > tolerance) done = 0;
    if (fabs(lamda02_new-lamda02) > tolerance) done = 0;
    if (fabs(lamda12_new-lamda12) > tolerance) done = 0;

    lamda01 = lamda01_new;
    lamda02 = lamda02_new;
    lamda12 = lamda12_new;

    // stop iterations before we have a floating point overflow
    // max double is < 1.0e308, so 1e150 is a reasonable cutoff

    if (fabs(lamda01) > 1e150 || fabs(lamda02) > 1e150
        || fabs(lamda12) > 1e150) done = 1;

    niter++;
  }

  // update forces if atom is owned by this processor

  lamda01 = lamda01/dtfsq;
  lamda02 = lamda02/dtfsq;
  lamda12 = lamda12/dtfsq;

  if (i0 < nlocal) {
    a_f(i0,0) += lamda01*r01[0] + lamda02*r02[0];
    a_f(i0,1) += lamda01*r01[1] + lamda02*r02[1];
    a_f(i0,2) += lamda01*r01[2] + lamda02*r02[2];
  }

  if (i1 < nlocal) {
    a_f(i1,0) -= lamda01*r01[0] - lamda12*r12[0];
    a_f(i1,1) -= lamda01*r01[1] - lamda12*r12[1];
    a_f(i1,2) -= lamda01*r01[2] - lamda12*r12[2];
  }

  if (i2 < nlocal) {
    a_f(i2,0) -= lamda02*r02[0] + lamda12*r12[0];
    a_f(i2,1) -= lamda02*r02[1] + lamda12*r12[1];
    a_f(i2,2) -= lamda02*r02[2] + lamda12*r12[2];
  }

  if (EVFLAG) {
    nlist = 0;
    if (i0 < nlocal) list[nlist++] = i0;
    if (i1 < nlocal) list[nlist++] = i1;
    if (i2 < nlocal) list[nlist++] = i2;

    v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda12*r12[0]*r12[0];
    v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda12*r12[1]*r12[1];
    v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda12*r12[2]*r12[2];
    v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda12*r12[0]*r12[1];
    v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda12*r12[0]*r12[2];
    v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda12*r12[1]*r12[2];

    v_tally<NEIGHFLAG>(ev,nlist,list,3.0,v);
  }
}

/* ----------------------------------------------------------------------
   allocate local atom-based arrays
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::grow_arrays(int nmax)
{
  memoryKK->grow_kokkos(k_shake_flag,shake_flag,nmax,"shake:shake_flag");
  memoryKK->grow_kokkos(k_shake_atom,shake_atom,nmax,4,"shake:shake_atom");
  memoryKK->grow_kokkos(k_shake_type,shake_type,nmax,3,"shake:shake_type");
  memoryKK->destroy_kokkos(k_xshake,xshake);
  memoryKK->create_kokkos(k_xshake,xshake,nmax,"shake:xshake");

  d_shake_flag = k_shake_flag.view<DeviceType>();
  d_shake_atom = k_shake_atom.view<DeviceType>();
  d_shake_type = k_shake_type.view<DeviceType>();
  d_xshake = k_xshake.view<DeviceType>();

  memory->destroy(ftmp);
  memory->create(ftmp,nmax,3,"shake:ftmp");
  memory->destroy(vtmp);
  memory->create(vtmp,nmax,3,"shake:vtmp");
}

/* ----------------------------------------------------------------------
   copy values within local atom-based arrays
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::copy_arrays(int i, int j, int delflag)
{
  k_shake_flag.sync_host();
  k_shake_atom.sync_host();
  k_shake_type.sync_host();

  FixShake::copy_arrays(i,j,delflag);

  k_shake_flag.modify_host();
  k_shake_atom.modify_host();
  k_shake_type.modify_host();
}

/* ----------------------------------------------------------------------
   initialize one atom's array values, called when atom is created
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::set_arrays(int i)
{
  k_shake_flag.sync_host();

  shake_flag[i] = 0;

  k_shake_flag.modify_host();
}

/* ----------------------------------------------------------------------
   update one atom's array values
   called when molecule is created from fix gcmc
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::update_arrays(int i, int atom_offset)
{
  k_shake_flag.sync_host();
  k_shake_atom.sync_host();

  FixShake::update_arrays(i,atom_offset);

  k_shake_flag.modify_host();
  k_shake_atom.modify_host();
}

/* ----------------------------------------------------------------------
   initialize a molecule inserted by another fix, e.g. deposit or pour
   called when molecule is created
   nlocalprev = # of atoms on this proc before molecule inserted
   tagprev = atom ID previous to new atoms in the molecule
   xgeom,vcm,quat ignored
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::set_molecule(int nlocalprev, tagint tagprev, int imol,
                            double * xgeom, double * vcm, double * quat)
{
  atomKK->sync(Host,TAG_MASK);
  k_shake_flag.sync_host();
  k_shake_atom.sync_host();
  k_shake_type.sync_host();

  FixShake::set_molecule(nlocalprev,tagprev,imol,xgeom,vcm,quat);

  k_shake_atom.modify_host();
  k_shake_type.modify_host();
}

/* ----------------------------------------------------------------------
   pack values in local atom-based arrays for exchange with another proc
------------------------------------------------------------------------- */

template<class DeviceType>
int FixShakeKokkos<DeviceType>::pack_exchange(int i, double *buf)
{
  k_shake_flag.sync_host();
  k_shake_atom.sync_host();
  k_shake_type.sync_host();

  int m = FixShake::pack_exchange(i,buf);

  k_shake_flag.modify_host();
  k_shake_atom.modify_host();
  k_shake_type.modify_host();

  return m;
}

/* ----------------------------------------------------------------------
   unpack values in local atom-based arrays from exchange with another proc
------------------------------------------------------------------------- */

template<class DeviceType>
int FixShakeKokkos<DeviceType>::unpack_exchange(int nlocal, double *buf)
{
  k_shake_flag.sync_host();
  k_shake_atom.sync_host();
  k_shake_type.sync_host();

  int m = FixShake::unpack_exchange(nlocal,buf);

  k_shake_flag.modify_host();
  k_shake_atom.modify_host();
  k_shake_type.modify_host();

  return m;
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
int FixShakeKokkos<DeviceType>::pack_forward_comm_fix_kokkos(int n, DAT::tdual_int_2d k_sendlist,
                                                        int iswap_in, DAT::tdual_xfloat_1d &k_buf,
                                                        int pbc_flag, int* pbc)
{
  d_sendlist = k_sendlist.view<DeviceType>();
  iswap = iswap_in;
  d_buf = k_buf.view<DeviceType>();

  if (domain->triclinic == 0) {
    dx = pbc[0]*domain->xprd;
    dy = pbc[1]*domain->yprd;
    dz = pbc[2]*domain->zprd;
  } else {
    dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz;
    dy = pbc[1]*domain->yprd + pbc[3]*domain->yz;
    dz = pbc[2]*domain->zprd;
  }

  if (pbc_flag)
    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixShakePackForwardComm<1> >(0,n),*this);
  else
    Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixShakePackForwardComm<0> >(0,n),*this);
  return n*3;
}

template<class DeviceType>
template<int PBC_FLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::operator()(TagFixShakePackForwardComm<PBC_FLAG>, const int &i) const {
  const int j = d_sendlist(iswap, i);

  if (PBC_FLAG == 0) {
    d_buf[3*i] = d_xshake(j,0);
    d_buf[3*i+1] = d_xshake(j,1);
    d_buf[3*i+2] = d_xshake(j,2);
  } else {
    d_buf[3*i] = d_xshake(j,0) + dx;
    d_buf[3*i+1] = d_xshake(j,1) + dy;
    d_buf[3*i+2] = d_xshake(j,2) + dz;
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
int FixShakeKokkos<DeviceType>::pack_forward_comm(int n, int *list, double *buf,
                                int pbc_flag, int *pbc)
{
  k_xshake.sync_host();

  int m = FixShake::pack_forward_comm(n,list,buf,pbc_flag,pbc);

  k_xshake.modify_host();

  return m;
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::unpack_forward_comm_fix_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf)
{
  first = first_in;
  d_buf = buf.view<DeviceType>();
  Kokkos::parallel_for(Kokkos::RangePolicy<DeviceType, TagFixShakeUnpackForwardComm>(0,n),*this);
}

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::operator()(TagFixShakeUnpackForwardComm, const int &i) const {
  d_xshake(i + first,0) = d_buf[3*i];
  d_xshake(i + first,1) = d_buf[3*i+1];
  d_xshake(i + first,2) = d_buf[3*i+2];
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::unpack_forward_comm(int n, int first, double *buf)
{
  k_xshake.sync_host();

  FixShake::unpack_forward_comm(n,first,buf);

  k_xshake.modify_host();
}

/* ----------------------------------------------------------------------
   add coordinate constraining forces
   this method is called at the end of a timestep
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::shake_end_of_step(int vflag) {
  dtv     = update->dt;
  dtfsq   = 0.5 * update->dt * update->dt * force->ftm2v;
  FixShakeKokkos<DeviceType>::post_force(vflag);
  if (!rattle) dtfsq = update->dt * update->dt * force->ftm2v;
}

/* ----------------------------------------------------------------------
   calculate constraining forces based on the current configuration
   change coordinates
------------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::correct_coordinates(int vflag) {
  atomKK->sync(Host,X_MASK|V_MASK|F_MASK);

  // save current forces and velocities so that you
  // initialize them to zero such that FixShake::unconstrained_coordinate_update has no effect

  for (int j=0; j<nlocal; j++) {
    for (int k=0; k<3; k++) {

      // store current value of forces and velocities
      ftmp[j][k] = f[j][k];
      vtmp[j][k] = v[j][k];

      // set f and v to zero for SHAKE

      v[j][k] = 0;
      f[j][k] = 0;
    }
  }

  atomKK->modified(Host,V_MASK|F_MASK);

  // call SHAKE to correct the coordinates which were updated without constraints
  // IMPORTANT: use 1 as argument and thereby enforce velocity Verlet

  dtfsq   = 0.5 * update->dt * update->dt * force->ftm2v;
  FixShakeKokkos<DeviceType>::post_force(vflag);

  atomKK->sync(Host,X_MASK|F_MASK);

  // integrate coordinates: x' = xnp1 + dt^2/2m_i * f, where f is the constraining force
  // NOTE: After this command, the coordinates geometry of the molecules will be correct!

  double dtfmsq;
  if (rmass) {
    for (int i = 0; i < nlocal; i++) {
      dtfmsq = dtfsq/ rmass[i];
      x[i][0] = x[i][0] + dtfmsq*f[i][0];
      x[i][1] = x[i][1] + dtfmsq*f[i][1];
      x[i][2] = x[i][2] + dtfmsq*f[i][2];
    }
  }
  else {
    for (int i = 0; i < nlocal; i++) {
      dtfmsq = dtfsq / mass[type[i]];
      x[i][0] = x[i][0] + dtfmsq*f[i][0];
      x[i][1] = x[i][1] + dtfmsq*f[i][1];
      x[i][2] = x[i][2] + dtfmsq*f[i][2];
    }
  }

  // copy forces and velocities back

  for (int j=0; j<nlocal; j++) {
    for (int k=0; k<3; k++) {
      f[j][k] = ftmp[j][k];
      v[j][k] = vtmp[j][k];
    }
  }

  if (!rattle) dtfsq = update->dt * update->dt * force->ftm2v;

  // communicate changes
  // NOTE: for compatibility xshake is temporarily set to x, such that pack/unpack_forward
  //       can be used for communicating the coordinates.

  double **xtmp = xshake;
  xshake = x;
  if (nprocs > 1) {
    forward_comm_device = 0;
    comm->forward_comm_fix(this);
    forward_comm_device = 1;
  }
  xshake = xtmp;

  atomKK->modified(Host,X_MASK|V_MASK|F_MASK);
}

/* ----------------------------------------------------------------------
   tally virial into global and per-atom accumulators
   n = # of local owned atoms involved, with local indices in list
   v = total virial for the interaction involving total atoms
   increment global virial by n/total fraction
   increment per-atom virial of each atom in list by 1/total fraction
   this method can be used when fix computes forces in post_force()
     e.g. fix shake, fix rigid: compute virial only on owned atoms
       whether newton_bond is on or off
     other procs will tally left-over fractions for atoms they own
------------------------------------------------------------------------- */
template<class DeviceType>
template<int NEIGHFLAG>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::v_tally(EV_FLOAT &ev, int n, int *list, double total,
     double *v) const
{
  int m;

  if (vflag_global) {
    double fraction = n/total;
    ev.v[0] += fraction*v[0];
    ev.v[1] += fraction*v[1];
    ev.v[2] += fraction*v[2];
    ev.v[3] += fraction*v[3];
    ev.v[4] += fraction*v[4];
    ev.v[5] += fraction*v[5];
  }

  if (vflag_atom) {
    double fraction = 1.0/total;
    for (int i = 0; i < n; i++) {
      auto v_vatom = ScatterViewHelper<typename NeedDup<NEIGHFLAG,DeviceType>::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom);
      auto a_vatom = v_vatom.template access<typename AtomicDup<NEIGHFLAG,DeviceType>::value>();
      m = list[i];
      a_vatom(m,0) += fraction*v[0];
      a_vatom(m,1) += fraction*v[1];
      a_vatom(m,2) += fraction*v[2];
      a_vatom(m,3) += fraction*v[3];
      a_vatom(m,4) += fraction*v[4];
      a_vatom(m,5) += fraction*v[5];
    }
  }
}

/* ---------------------------------------------------------------------- */

template<class DeviceType>
void FixShakeKokkos<DeviceType>::update_domain_variables()
{
  triclinic = domain->triclinic;
  xperiodic = domain->xperiodic;
  xprd_half = domain->xprd_half;
  xprd = domain->xprd;
  yperiodic = domain->yperiodic;
  yprd_half = domain->yprd_half;
  yprd = domain->yprd;
  zperiodic = domain->zperiodic;
  zprd_half = domain->zprd_half;
  zprd = domain->zprd;
  xy = domain->xy;
  xz = domain->xz;
  yz = domain->yz;
}

/* ----------------------------------------------------------------------
   minimum image convention in periodic dimensions
   use 1/2 of box size as test
   for triclinic, also add/subtract tilt factors in other dims as needed
   changed "if" to "while" to enable distance to
     far-away ghost atom returned by atom->map() to be wrapped back into box
     could be problem for looking up atom IDs when cutoff > boxsize
   this should not be used if atom has moved infinitely far outside box
     b/c while could iterate forever
     e.g. fix shake prediction of new position with highly overlapped atoms
     use minimum_image_once() instead
   copied from domain.cpp
------------------------------------------------------------------------- */

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::minimum_image(double *delta) const
{
  if (triclinic == 0) {
    if (xperiodic) {
      while (fabs(delta[0]) > xprd_half) {
        if (delta[0] < 0.0) delta[0] += xprd;
        else delta[0] -= xprd;
      }
    }
    if (yperiodic) {
      while (fabs(delta[1]) > yprd_half) {
        if (delta[1] < 0.0) delta[1] += yprd;
        else delta[1] -= yprd;
      }
    }
    if (zperiodic) {
      while (fabs(delta[2]) > zprd_half) {
        if (delta[2] < 0.0) delta[2] += zprd;
        else delta[2] -= zprd;
      }
    }

  } else {
    if (zperiodic) {
      while (fabs(delta[2]) > zprd_half) {
        if (delta[2] < 0.0) {
          delta[2] += zprd;
          delta[1] += yz;
          delta[0] += xz;
        } else {
          delta[2] -= zprd;
          delta[1] -= yz;
          delta[0] -= xz;
        }
      }
    }
    if (yperiodic) {
      while (fabs(delta[1]) > yprd_half) {
        if (delta[1] < 0.0) {
          delta[1] += yprd;
          delta[0] += xy;
        } else {
          delta[1] -= yprd;
          delta[0] -= xy;
        }
      }
    }
    if (xperiodic) {
      while (fabs(delta[0]) > xprd_half) {
        if (delta[0] < 0.0) delta[0] += xprd;
        else delta[0] -= xprd;
      }
    }
  }
}

/* ----------------------------------------------------------------------
   minimum image convention in periodic dimensions
   use 1/2 of box size as test
   for triclinic, also add/subtract tilt factors in other dims as needed
   only shift by one box length in each direction
   this should not be used if multiple box shifts are required
   copied from domain.cpp
------------------------------------------------------------------------- */

template<class DeviceType>
KOKKOS_INLINE_FUNCTION
void FixShakeKokkos<DeviceType>::minimum_image_once(double *delta) const
{
  if (triclinic == 0) {
    if (xperiodic) {
      if (fabs(delta[0]) > xprd_half) {
        if (delta[0] < 0.0) delta[0] += xprd;
        else delta[0] -= xprd;
      }
    }
    if (yperiodic) {
      if (fabs(delta[1]) > yprd_half) {
        if (delta[1] < 0.0) delta[1] += yprd;
        else delta[1] -= yprd;
      }
    }
    if (zperiodic) {
      if (fabs(delta[2]) > zprd_half) {
        if (delta[2] < 0.0) delta[2] += zprd;
        else delta[2] -= zprd;
      }
    }

  } else {
    if (zperiodic) {
      if (fabs(delta[2]) > zprd_half) {
        if (delta[2] < 0.0) {
          delta[2] += zprd;
          delta[1] += yz;
          delta[0] += xz;
        } else {
          delta[2] -= zprd;
          delta[1] -= yz;
          delta[0] -= xz;
        }
      }
    }
    if (yperiodic) {
      if (fabs(delta[1]) > yprd_half) {
        if (delta[1] < 0.0) {
          delta[1] += yprd;
          delta[0] += xy;
        } else {
          delta[1] -= yprd;
          delta[0] -= xy;
        }
      }
    }
    if (xperiodic) {
      if (fabs(delta[0]) > xprd_half) {
        if (delta[0] < 0.0) delta[0] += xprd;
        else delta[0] -= xprd;
      }
    }
  }
}

/* ---------------------------------------------------------------------- */

namespace LAMMPS_NS {
template class FixShakeKokkos<LMPDeviceType>;
#ifdef LMP_KOKKOS_GPU
template class FixShakeKokkos<LMPHostType>;
#endif
}