diff --git a/src/Depend.sh b/src/Depend.sh index a6351e7b13..e629ca9197 100755 --- a/src/Depend.sh +++ b/src/Depend.sh @@ -107,6 +107,7 @@ if (test $1 = "PERI") then fi if (test $1 = "RIGID") then + depend KOKKOS depend USER-OMP depend USER-SDPD fi diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 03508578ae..7c35f81891 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -142,6 +142,8 @@ action fix_reaxc_species_kokkos.cpp fix_reaxc_species.cpp action fix_reaxc_species_kokkos.h fix_reaxc_species.h action fix_setforce_kokkos.cpp action fix_setforce_kokkos.h +action fix_shake_kokkos.cpp fix_shake.cpp +action fix_shake_kokkos.h fix_shake.h action fix_shardlow_kokkos.cpp fix_shardlow.cpp action fix_shardlow_kokkos.h fix_shardlow.h action fix_momentum_kokkos.cpp diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index f8d833d48c..97bb1a5b0b 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -75,6 +75,11 @@ CommKokkos::CommKokkos(LAMMPS *lmp) : CommBrick(lmp) max_buf_pair = 0; k_buf_send_pair = DAT::tdual_xfloat_1d("comm:k_buf_send_pair",1); k_buf_recv_pair = DAT::tdual_xfloat_1d("comm:k_recv_send_pair",1); + + max_buf_fix = 0; + k_buf_send_fix = DAT::tdual_xfloat_1d("comm:k_buf_send_fix",1); + k_buf_recv_fix = DAT::tdual_xfloat_1d("comm:k_recv_send_fix",1); + } /* ---------------------------------------------------------------------- */ @@ -356,10 +361,79 @@ void CommKokkos::reverse_comm_device() void CommKokkos::forward_comm_fix(Fix *fix, int size) { - k_sendlist.sync(); - CommBrick::forward_comm_fix(fix,size); + if (fix->execution_space == Device && fix->forward_comm_device) { + k_sendlist.sync(); + forward_comm_fix_device(fix,size); + } else { + k_sendlist.sync(); + CommBrick::forward_comm_fix(fix,size); + } } +template +void CommKokkos::forward_comm_fix_device(Fix *fix, int size) +{ + int iswap,n,nsize; + MPI_Request request; + + if (size) nsize = size; + else nsize = fix->comm_forward; + KokkosBase* fixKKBase = dynamic_cast(fix); + + for (iswap = 0; iswap < nswap; iswap++) { + int n = MAX(max_buf_fix,nsize*sendnum[iswap]); + n = MAX(n,nsize*recvnum[iswap]); + if (n > max_buf_fix) + grow_buf_fix(n); + } + + for (iswap = 0; iswap < nswap; iswap++) { + + // pack buffer + + n = fixKKBase->pack_forward_comm_fix_kokkos(sendnum[iswap],k_sendlist, + iswap,k_buf_send_fix,pbc_flag[iswap],pbc[iswap]); + DeviceType().fence(); + + // exchange with another proc + // if self, set recv buffer to send buffer + + if (sendproc[iswap] != me) { + double* buf_send_fix; + double* buf_recv_fix; + if (lmp->kokkos->gpu_aware_flag) { + buf_send_fix = k_buf_send_fix.view().data(); + buf_recv_fix = k_buf_recv_fix.view().data(); + } else { + k_buf_send_fix.modify(); + k_buf_send_fix.sync(); + buf_send_fix = k_buf_send_fix.h_view.data(); + buf_recv_fix = k_buf_recv_fix.h_view.data(); + } + + if (recvnum[iswap]) { + MPI_Irecv(buf_recv_fix,nsize*recvnum[iswap],MPI_DOUBLE, + recvproc[iswap],0,world,&request); + } + if (sendnum[iswap]) + MPI_Send(buf_send_fix,n,MPI_DOUBLE,sendproc[iswap],0,world); + if (recvnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + + if (!lmp->kokkos->gpu_aware_flag) { + k_buf_recv_fix.modify(); + k_buf_recv_fix.sync(); + } + } else k_buf_recv_fix = k_buf_send_fix; + + // unpack buffer + + fixKKBase->unpack_forward_comm_fix_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv_fix); + DeviceType().fence(); + } +} + +/* ---------------------------------------------------------------------- */ + void CommKokkos::reverse_comm_fix(Fix *fix, int size) { k_sendlist.sync(); @@ -456,6 +530,12 @@ void CommKokkos::grow_buf_pair(int n) { k_buf_recv_pair.resize(max_buf_pair); } +void CommKokkos::grow_buf_fix(int n) { + max_buf_fix = n * BUFFACTOR; + k_buf_send_fix.resize(max_buf_fix); + k_buf_recv_fix.resize(max_buf_fix); +} + void CommKokkos::reverse_comm_pair(Pair *pair) { k_sendlist.sync(); @@ -491,8 +571,8 @@ void CommKokkos::exchange() if(!exchange_comm_classic) { static int print = 1; if(print && comm->me==0) { - error->warning(FLERR,"Fixes cannot yet send data in Kokkos communication, " - "switching to classic communication"); + error->warning(FLERR,"Fixes cannot yet send exchange data in Kokkos communication, " + "switching to classic exchange/border communication"); } print = 0; exchange_comm_classic = true; @@ -742,7 +822,7 @@ void CommKokkos::borders() (ghost_velocity && ((AtomVecKokkos*)atom->avec)->no_border_vel_flag)) { if (print && comm->me==0) { error->warning(FLERR,"Required border comm not yet implemented in Kokkos communication, " - "switching to classic communication"); + "switching to classic exchange/border communication"); } print = 0; exchange_comm_classic = true; diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index 9d8766e309..6b4a201ab7 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -51,6 +51,7 @@ class CommKokkos : public CommBrick { template void forward_comm_device(int dummy); template void reverse_comm_device(); template void forward_comm_pair_device(Pair *pair); + template void forward_comm_fix_device(Fix *fix, int size=0); template void exchange_device(); template void borders_device(); @@ -73,10 +74,11 @@ class CommKokkos : public CommBrick { DAT::tdual_int_1d k_sendnum_scan; int totalsend; - int max_buf_pair; - DAT::tdual_xfloat_1d k_buf_send_pair; - DAT::tdual_xfloat_1d k_buf_recv_pair; + int max_buf_pair,max_buf_fix; + DAT::tdual_xfloat_1d k_buf_send_pair, k_buf_send_fix; + DAT::tdual_xfloat_1d k_buf_recv_pair, k_buf_recv_fix; void grow_buf_pair(int); + void grow_buf_fix(int); void grow_send(int, int); void grow_recv(int); diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index bf047e7769..c96f5ded68 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -47,6 +47,7 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) : FixQEqReax(lmp, narg, arg) { kokkosable = 1; + forward_comm_device = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; @@ -262,19 +263,15 @@ void FixQEqReaxKokkos::pre_force(int /*vflag*/) // comm->forward_comm_fix(this); //Dist_vector( s ); pack_flag = 2; - k_s.template modify(); - k_s.template sync(); - comm->forward_comm_fix(this); - k_s.template modify(); k_s.template sync(); + comm->forward_comm_fix(this); + k_s.template modify(); // comm->forward_comm_fix(this); //Dist_vector( t ); pack_flag = 3; - k_t.template modify(); - k_t.template sync(); - comm->forward_comm_fix(this); - k_t.template modify(); k_t.template sync(); + comm->forward_comm_fix(this); + k_t.template modify(); need_dup = lmp->kokkos->need_dup(); @@ -784,11 +781,9 @@ void FixQEqReaxKokkos::cg_solve1() // comm->forward_comm_fix(this); //Dist_vector( d ); pack_flag = 1; - k_d.template modify(); - k_d.template sync(); - comm->forward_comm_fix(this); - k_d.template modify(); k_d.template sync(); + comm->forward_comm_fix(this); + k_d.template modify(); // sparse_matvec( &H, d, q ); FixQEqReaxKokkosSparse22Functor sparse22_functor(this); @@ -922,11 +917,9 @@ void FixQEqReaxKokkos::cg_solve2() // comm->forward_comm_fix(this); //Dist_vector( d ); pack_flag = 1; - k_d.template modify(); - k_d.template sync(); - comm->forward_comm_fix(this); - k_d.template modify(); k_d.template sync(); + comm->forward_comm_fix(this); + k_d.template modify(); // sparse_matvec( &H, d, q ); FixQEqReaxKokkosSparse22Functor sparse22_functor(this); @@ -1027,11 +1020,9 @@ void FixQEqReaxKokkos::calculate_q() pack_flag = 4; //comm->forward_comm_fix( this ); //Dist_vector( atom->q ); - atomKK->k_q.modify(); - atomKK->k_q.sync(); - comm->forward_comm_fix(this); - atomKK->k_q.modify(); atomKK->k_q.sync(); + comm->forward_comm_fix(this); + atomKK->k_q.modify(); } @@ -1361,6 +1352,59 @@ void FixQEqReaxKokkos::calculate_q_item(int ii) const /* ---------------------------------------------------------------------- */ +template +int FixQEqReaxKokkos::pack_forward_comm_fix_kokkos(int n, DAT::tdual_int_2d k_sendlist, + int iswap_in, DAT::tdual_xfloat_1d &k_buf, + int /*pbc_flag*/, int * /*pbc*/) +{ + d_sendlist = k_sendlist.view(); + iswap = iswap_in; + d_buf = k_buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); + return n; +} + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxKokkos::operator()(TagFixQEqReaxPackForwardComm, const int &i) const { + int j = d_sendlist(iswap, i); + + if (pack_flag == 1) + d_buf[i] = d_d[j]; + else if( pack_flag == 2 ) + d_buf[i] = d_s[j]; + else if( pack_flag == 3 ) + d_buf[i] = d_t[j]; + else if( pack_flag == 4 ) + d_buf[i] = q[j]; +} + +/* ---------------------------------------------------------------------- */ + +template +void FixQEqReaxKokkos::unpack_forward_comm_fix_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf) +{ + first = first_in; + d_buf = buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); +} + +template +KOKKOS_INLINE_FUNCTION +void FixQEqReaxKokkos::operator()(TagFixQEqReaxUnpackForwardComm, const int &i) const { + if (pack_flag == 1) + d_d[i + first] = d_buf[i]; + else if( pack_flag == 2) + d_s[i + first] = d_buf[i]; + else if( pack_flag == 3) + d_t[i + first] = d_buf[i]; + else if( pack_flag == 4) + q[i + first] = d_buf[i]; + +} + +/* ---------------------------------------------------------------------- */ + template int FixQEqReaxKokkos::pack_forward_comm(int n, int *list, double *buf, int /*pbc_flag*/, int * /*pbc*/) diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.h b/src/KOKKOS/fix_qeq_reax_kokkos.h index bed532905e..cc9bff3652 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.h +++ b/src/KOKKOS/fix_qeq_reax_kokkos.h @@ -26,6 +26,7 @@ FixStyle(qeq/reax/kk/host,FixQEqReaxKokkos) #include "kokkos_type.h" #include "neigh_list.h" #include "neigh_list_kokkos.h" +#include "kokkos_base.h" namespace LAMMPS_NS { @@ -33,9 +34,11 @@ struct TagSparseMatvec1 {}; struct TagSparseMatvec2 {}; struct TagSparseMatvec3 {}; struct TagZeroQGhosts{}; +struct TagFixQEqReaxPackForwardComm {}; +struct TagFixQEqReaxUnpackForwardComm {}; template -class FixQEqReaxKokkos : public FixQEqReax { +class FixQEqReaxKokkos : public FixQEqReax, public KokkosBase { public: typedef DeviceType device_type; typedef ArrayTypes AT; @@ -136,6 +139,12 @@ class FixQEqReaxKokkos : public FixQEqReax { KOKKOS_INLINE_FUNCTION double calculate_H_k(const F_FLOAT &r, const F_FLOAT &shld) const; + KOKKOS_INLINE_FUNCTION + void operator()(TagFixQEqReaxPackForwardComm, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagFixQEqReaxUnpackForwardComm, const int&) const; + struct params_qeq{ KOKKOS_INLINE_FUNCTION params_qeq(){chi=0;eta=0;gamma=0;}; @@ -144,6 +153,9 @@ class FixQEqReaxKokkos : public FixQEqReax { F_FLOAT chi, eta, gamma; }; + int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&, + int, int *); + void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_xfloat_1d&); virtual int pack_forward_comm(int, int *, double *, int, int *); virtual void unpack_forward_comm(int, int, double *); int pack_reverse_comm(int, int, double *); @@ -203,6 +215,11 @@ class FixQEqReaxKokkos : public FixQEqReax { Kokkos::Experimental::ScatterView::value, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterDuplicated> dup_o; Kokkos::Experimental::ScatterView::value, Kokkos::Experimental::ScatterSum, Kokkos::Experimental::ScatterNonDuplicated> ndup_o; + int iswap; + int first; + typename AT::t_int_2d d_sendlist; + typename AT::t_xfloat_1d_um d_buf; + void init_shielding_k(); void init_hist(); void allocate_matrix(); @@ -216,11 +233,6 @@ class FixQEqReaxKokkos : public FixQEqReax { int count, isuccess; double alpha, beta, delta, cutsq; - int iswap; - int first; - typename AT::t_int_2d d_sendlist; - typename AT::t_xfloat_1d_um v_buf; - void grow_arrays(int); void copy_arrays(int, int, int); int pack_exchange(int, double *); diff --git a/src/KOKKOS/fix_shake_kokkos.cpp b/src/KOKKOS/fix_shake_kokkos.cpp new file mode 100644 index 0000000000..854dfefeed --- /dev/null +++ b/src/KOKKOS/fix_shake_kokkos.cpp @@ -0,0 +1,1852 @@ +/* ---------------------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#include +#include +#include +#include +#include +#include "fix_shake_kokkos.h" +#include "fix_rattle.h" +#include "atom_kokkos.h" +#include "atom_vec.h" +#include "molecule.h" +#include "update.h" +#include "respa.h" +#include "modify.h" +#include "domain.h" +#include "force.h" +#include "bond.h" +#include "angle.h" +#include "comm.h" +#include "group.h" +#include "fix_respa.h" +#include "math_const.h" +#include "memory_kokkos.h" +#include "error.h" +#include "kokkos.h" +#include "atom_masks.h" + +using namespace LAMMPS_NS; +using namespace FixConst; +using namespace MathConst; + +#define RVOUS 1 // 0 for irregular, 1 for all2all + +#define BIG 1.0e20 +#define MASSDELTA 0.1 + +/* ---------------------------------------------------------------------- */ + +template +FixShakeKokkos::FixShakeKokkos(LAMMPS *lmp, int narg, char **arg) : + FixShake(lmp, narg, arg) +{ + kokkosable = 1; + forward_comm_device = 1; + atomKK = (AtomKokkos *)atom; + execution_space = ExecutionSpaceFromDevice::space; + + datamask_read = EMPTY_MASK; + datamask_modify = EMPTY_MASK; + + shake_flag_tmp = shake_flag; + shake_atom_tmp = shake_atom; + shake_type_tmp = shake_type; + + shake_flag = nullptr; + shake_atom = nullptr; + shake_type = nullptr; + + int nmax = atom->nmax; + + grow_arrays(nmax); + + for (int i = 0; i < nmax; i++) { + k_shake_flag.h_view[i] = shake_flag_tmp[i]; + k_shake_atom.h_view(i,0) = shake_atom_tmp[i][0]; + k_shake_atom.h_view(i,1) = shake_atom_tmp[i][1]; + k_shake_atom.h_view(i,2) = shake_atom_tmp[i][2]; + k_shake_atom.h_view(i,3) = shake_atom_tmp[i][3]; + k_shake_type.h_view(i,0) = shake_type_tmp[i][0]; + k_shake_type.h_view(i,1) = shake_type_tmp[i][1]; + k_shake_type.h_view(i,2) = shake_type_tmp[i][2]; + } + + k_shake_flag.modify_host(); + k_shake_atom.modify_host(); + k_shake_type.modify_host(); + + k_bond_distance = DAT::tdual_float_1d("fix_shake:bond_distance",atom->nbondtypes+1); + k_angle_distance = DAT::tdual_float_1d("fix_shake:angle_distance",atom->nangletypes+1); + + d_bond_distance = k_bond_distance.view(); + d_angle_distance = k_angle_distance.view(); + + // use 1D view for scalars to reduce GPU memory operations + + d_scalars = typename AT::t_int_1d("neighbor:scalars",2); + h_scalars = HAT::t_int_1d("neighbor:scalars_mirror",2); + + d_error_flag = Kokkos::subview(d_scalars,0); + d_nlist = Kokkos::subview(d_scalars,1); + + h_error_flag = Kokkos::subview(h_scalars,0); + h_nlist = Kokkos::subview(h_scalars,1); +} + +/* ---------------------------------------------------------------------- */ + +template +FixShakeKokkos::~FixShakeKokkos() +{ + if (copymode) return; + + memoryKK->destroy_kokkos(k_shake_flag,shake_flag); + memoryKK->destroy_kokkos(k_shake_atom,shake_atom); + memoryKK->destroy_kokkos(k_shake_type,shake_type); + memoryKK->destroy_kokkos(k_xshake,xshake); + memoryKK->destroy_kokkos(k_list,list); + + memoryKK->destroy_kokkos(k_vatom,vatom); + + shake_flag = shake_flag_tmp; + shake_atom = shake_atom_tmp; + shake_type = shake_type_tmp; +} + +/* ---------------------------------------------------------------------- + set bond and angle distances + this init must happen after force->bond and force->angle inits +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::init() +{ + FixShake::init(); + + if (strstr(update->integrate_style,"respa")) + error->all(FLERR,"Cannot yet use respa with Kokkos"); + + if (rattle) + error->all(FLERR,"Cannot yet use KOKKOS package with fix rattle"); + + // set equilibrium bond distances + + for (int i = 1; i <= atom->nbondtypes; i++) + k_bond_distance.h_view[i] = bond_distance[i]; + + // set equilibrium angle distances + + for (int i = 1; i <= atom->nangletypes; i++) + k_angle_distance.h_view[i] = angle_distance[i]; + + k_bond_distance.modify_host(); + k_angle_distance.modify_host(); + + k_bond_distance.sync(); + k_angle_distance.sync(); +} + + +/* ---------------------------------------------------------------------- + build list of SHAKE clusters to constrain + if one or more atoms in cluster are on this proc, + this proc lists the cluster exactly once +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::pre_neighbor() +{ + // local copies of atom quantities + // used by SHAKE until next re-neighboring + + x = atom->x; + v = atom->v; + f = atom->f; + mass = atom->mass; + rmass = atom->rmass; + type = atom->type; + nlocal = atom->nlocal; + + k_shake_flag.sync(); + k_shake_atom.sync(); + + // extend size of SHAKE list if necessary + + if (nlocal > maxlist) { + maxlist = nlocal; + memoryKK->destroy_kokkos(k_list,list); + memoryKK->create_kokkos(k_list,list,maxlist,"shake:list"); + d_list = k_list.view(); + } + + // don't yet have atom_map_kokkos routines, so move data from host to device + + if (atom->map_style != Atom::MAP_ARRAY) + error->all(FLERR,"Must use atom map style array with Kokkos"); + + int* map_array_host = atom->get_map_array(); + int map_size = atom->get_map_size(); + int map_maxarray = atom->get_map_maxarray(); + if (map_maxarray > (int)k_map_array.extent(0)) + k_map_array = DAT::tdual_int_1d("NeighBond:map_array",map_maxarray); + for (int i=0; i(); + k_map_array.template sync(); + map_array = k_map_array.view(); + + // build list of SHAKE clusters I compute + + Kokkos::deep_copy(d_scalars,0); + + { + // local variables for lambda capture + + auto d_shake_flag = this->d_shake_flag; + auto d_shake_atom = this->d_shake_atom; + auto d_list = this->d_list; + auto d_error_flag = this->d_error_flag; + auto d_nlist = this->d_nlist; + auto map_array = this->map_array; + + Kokkos::parallel_for(nlocal, LAMMPS_LAMBDA(const int& i) { + if (d_shake_flag[i]) { + if (d_shake_flag[i] == 2) { + const int atom1 = map_array(d_shake_atom(i,0)); + const int atom2 = map_array(d_shake_atom(i,1)); + if (atom1 == -1 || atom2 == -1) { + d_error_flag() = 1; + } + if (i <= atom1 && i <= atom2) { + const int nlist = Kokkos::atomic_fetch_add(&d_nlist(),1); + d_list[nlist] = i; + } + } else if (d_shake_flag[i] % 2 == 1) { + const int atom1 = map_array(d_shake_atom(i,0)); + const int atom2 = map_array(d_shake_atom(i,1)); + const int atom3 = map_array(d_shake_atom(i,2)); + if (atom1 == -1 || atom2 == -1 || atom3 == -1) + d_error_flag() = 1; + if (i <= atom1 && i <= atom2 && i <= atom3) { + const int nlist = Kokkos::atomic_fetch_add(&d_nlist(),1); + d_list[nlist] = i; + } + } else { + const int atom1 = map_array(d_shake_atom(i,0)); + const int atom2 = map_array(d_shake_atom(i,1)); + const int atom3 = map_array(d_shake_atom(i,2)); + const int atom4 = map_array(d_shake_atom(i,3)); + if (atom1 == -1 || atom2 == -1 || atom3 == -1 || atom4 == -1) + d_error_flag() = 1; + if (i <= atom1 && i <= atom2 && i <= atom3 && i <= atom4) { + const int nlist = Kokkos::atomic_fetch_add(&d_nlist(),1); + d_list[nlist] = i; + } + } + } + }); + } + + Kokkos::deep_copy(h_scalars,d_scalars); + nlist = h_nlist(); + + if (h_error_flag() == 1) { + error->one(FLERR,fmt::format("Shake atoms missing on proc " + "{} at step {}",me,update->ntimestep)); + } +} + +/* ---------------------------------------------------------------------- + compute the force adjustment for SHAKE constraint +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::post_force(int vflag) +{ + copymode = 1; + + d_x = atomKK->k_x.view(); + d_f = atomKK->k_f.view(); + d_type = atomKK->k_type.view(); + d_rmass = atomKK->k_rmass.view(); + d_mass = atomKK->k_mass.view(); + nlocal = atomKK->nlocal; + + if (d_rmass.data()) + atomKK->sync(Device,X_MASK|F_MASK|RMASS_MASK); + else + atomKK->sync(Device,X_MASK|F_MASK|TYPE_MASK); + + k_shake_flag.sync(); + k_shake_atom.sync(); + k_shake_type.sync(); + + if (update->ntimestep == next_output) { + atomKK->sync(Host,X_MASK); + k_shake_flag.sync_host(); + k_shake_atom.sync_host(); + stats(); + } + + // xshake = unconstrained move with current v,f + // communicate results if necessary + + unconstrained_update(); + if (nprocs > 1) comm->forward_comm_fix(this); + k_xshake.sync(); + + // virial setup + + if (vflag) v_setup(vflag); + else evflag = 0; + + // reallocate per-atom arrays if necessary + + if (vflag_atom) { + memoryKK->destroy_kokkos(k_vatom,vatom); + memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"improper:vatom"); + d_vatom = k_vatom.template view(); + } + + need_dup = lmp->kokkos->need_dup(); + neighflag = lmp->kokkos->neighflag; + + // allocate duplicated memory + + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view(d_f); + dup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view(d_f); + ndup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } + + Kokkos::deep_copy(d_error_flag,0); + + update_domain_variables(); + + EV_FLOAT ev; + + // loop over clusters to add constraint forces + + if (neighflag == HALF) { + if (evflag) + Kokkos::parallel_reduce(Kokkos::RangePolicy >(0,nlist),*this,ev); + else + Kokkos::parallel_for(Kokkos::RangePolicy >(0,nlist),*this); + } else if (neighflag == HALFTHREAD) { + if (evflag) + Kokkos::parallel_reduce(Kokkos::RangePolicy >(0,nlist),*this,ev); + else + Kokkos::parallel_for(Kokkos::RangePolicy >(0,nlist),*this); + } + + copymode = 0; + + Kokkos::deep_copy(h_error_flag,d_error_flag); + + if (h_error_flag() == 2) + error->warning(FLERR,"Shake determinant < 0.0",0); + else if (h_error_flag() == 3) + error->one(FLERR,"Shake determinant = 0.0"); + + // store vflag for coordinate_constraints_end_of_step() + + vflag_post_force = vflag; + + // reduction over duplicated memory + + if (need_dup) + Kokkos::Experimental::contribute(d_f,dup_f); + + atomKK->modified(Device,F_MASK); + + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; + } + + if (vflag_atom) { + k_vatom.template modify(); + k_vatom.template sync(); + } + + // free duplicated memory + + if (need_dup) { + dup_f = decltype(dup_f)(); + dup_vatom = decltype(dup_vatom)(); + } +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::operator()(TagFixShakePostForce, const int &i, EV_FLOAT& ev) const { + const int m = d_list[i]; + if (d_shake_flag[m] == 2) shake(m,ev); + else if (d_shake_flag[m] == 3) shake3(m,ev); + else if (d_shake_flag[m] == 4) shake4(m,ev); + else shake3angle(m,ev); +} + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::operator()(TagFixShakePostForce, const int &i) const { + EV_FLOAT ev; + this->template operator()(TagFixShakePostForce(), i, ev); +} + +/* ---------------------------------------------------------------------- + count # of degrees-of-freedom removed by SHAKE for atoms in igroup +------------------------------------------------------------------------- */ + +template +int FixShakeKokkos::dof(int igroup) +{ + + d_mask = atomKK->k_mask.view(); + d_tag = atomKK->k_tag.view(); + nlocal = atom->nlocal; + + atomKK->sync(Device,MASK_MASK|TAG_MASK); + k_shake_flag.sync(); + k_shake_atom.sync(); + + // count dof in a cluster if and only if + // the central atom is in group and atom i is the central atom + + int n = 0; + { + // local variables for lambda capture + + auto d_shake_flag = this->d_shake_flag; + auto d_shake_atom = this->d_shake_atom; + auto tag = this->d_tag; + auto mask = this->d_mask; + auto groupbit = group->bitmask[igroup]; + + Kokkos::parallel_reduce(nlocal, LAMMPS_LAMBDA(const int& i, int& n) { + if (!(mask[i] & groupbit)) return; + if (d_shake_flag[i] == 0) return; + if (d_shake_atom(i,0) != tag[i]) return; + if (d_shake_flag[i] == 1) n += 3; + else if (d_shake_flag[i] == 2) n += 1; + else if (d_shake_flag[i] == 3) n += 2; + else if (d_shake_flag[i] == 4) n += 3; + },n); + } + + int nall; + MPI_Allreduce(&n,&nall,1,MPI_INT,MPI_SUM,world); + return nall; +} + + +/* ---------------------------------------------------------------------- + assumes NVE update, seems to be accurate enough for NVT,NPT,NPH as well +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::unconstrained_update() +{ + d_x = atomKK->k_x.view(); + d_v = atomKK->k_v.view(); + d_f = atomKK->k_f.view(); + d_type = atomKK->k_type.view(); + d_rmass = atomKK->k_rmass.view(); + d_mass = atomKK->k_mass.view(); + nlocal = atom->nlocal; + + if (d_rmass.data()) + atomKK->sync(Device,X_MASK|V_MASK|F_MASK|RMASS_MASK); + else + atomKK->sync(Device,X_MASK|V_MASK|F_MASK|TYPE_MASK); + + + k_shake_flag.sync(); + k_xshake.sync(); + + { + // local variables for lambda capture + + auto d_shake_flag = this->d_shake_flag; + auto d_xshake = this->d_xshake; + auto x = this->d_x; + auto v = this->d_v; + auto f = this->d_f; + auto dtfsq = this->dtfsq; + auto dtv = this->dtv; + + if (d_rmass.data()) { + + auto rmass = this->d_rmass; + + Kokkos::parallel_for(nlocal, LAMMPS_LAMBDA(const int& i) { + if (d_shake_flag[i]) { + const double dtfmsq = dtfsq / rmass[i]; + d_xshake(i,0) = x(i,0) + dtv*v(i,0) + dtfmsq*f(i,0); + d_xshake(i,1) = x(i,1) + dtv*v(i,1) + dtfmsq*f(i,1); + d_xshake(i,2) = x(i,2) + dtv*v(i,2) + dtfmsq*f(i,2); + } else d_xshake(i,2) = d_xshake(i,1) = d_xshake(i,0) = 0.0; + }); + } else { + + auto mass = this->d_mass; + auto type = this->d_type; + + Kokkos::parallel_for(nlocal, LAMMPS_LAMBDA(const int& i) { + if (d_shake_flag[i]) { + const double dtfmsq = dtfsq / mass[type[i]]; + d_xshake(i,0) = x(i,0) + dtv*v(i,0) + dtfmsq*f(i,0); + d_xshake(i,1) = x(i,1) + dtv*v(i,1) + dtfmsq*f(i,1); + d_xshake(i,2) = x(i,2) + dtv*v(i,2) + dtfmsq*f(i,2); + } else d_xshake(i,2) = d_xshake(i,1) = d_xshake(i,0) = 0.0; + }); + } + } + + k_xshake.modify(); +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::shake(int m, EV_FLOAT& ev) const +{ + + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access::value>(); + + int nlist,list[2]; + double v[6]; + double invmass0,invmass1; + + // local atom IDs and constraint distances + + int i0 = map_array(d_shake_atom(m,0)); + int i1 = map_array(d_shake_atom(m,1)); + double bond1 = d_bond_distance[d_shake_type(m,0)]; + + // r01 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = d_x(i0,0) - d_x(i1,0); + r01[1] = d_x(i0,1) - d_x(i1,1); + r01[2] = d_x(i0,2) - d_x(i1,2); + minimum_image(r01); + + // s01 = distance vec after unconstrained update, with PBC + // use Domain::minimum_image_once(), not minimum_image() + // b/c xshake values might be huge, due to e.g. fix gcmc + + double s01[3]; + s01[0] = d_xshake(i0,0) - d_xshake(i1,0); + s01[1] = d_xshake(i0,1) - d_xshake(i1,1); + s01[2] = d_xshake(i0,2) - d_xshake(i1,2); + minimum_image_once(s01); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + + // a,b,c = coeffs in quadratic equation for lamda + + if (d_rmass.data()) { + invmass0 = 1.0/d_rmass[i0]; + invmass1 = 1.0/d_rmass[i1]; + } else { + invmass0 = 1.0/d_mass[d_type[i0]]; + invmass1 = 1.0/d_mass[d_type[i1]]; + } + + double a = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double b = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double c = s01sq - bond1*bond1; + + // error check + + double determ = b*b - 4.0*a*c; + if (determ < 0.0) { + //error->warning(FLERR,"Shake determinant < 0.0",0); + d_error_flag() = 2; + determ = 0.0; + } + + // exact quadratic solution for lamda + + double lamda,lamda1,lamda2; + lamda1 = (-b+sqrt(determ)) / (2.0*a); + lamda2 = (-b-sqrt(determ)) / (2.0*a); + + if (fabs(lamda1) <= fabs(lamda2)) lamda = lamda1; + else lamda = lamda2; + + // update forces if atom is owned by this processor + + lamda /= dtfsq; + + if (i0 < nlocal) { + a_f(i0,0) += lamda*r01[0]; + a_f(i0,1) += lamda*r01[1]; + a_f(i0,2) += lamda*r01[2]; + } + + if (i1 < nlocal) { + a_f(i1,0) -= lamda*r01[0]; + a_f(i1,1) -= lamda*r01[1]; + a_f(i1,2) -= lamda*r01[2]; + } + + if (EVFLAG) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + + v[0] = lamda*r01[0]*r01[0]; + v[1] = lamda*r01[1]*r01[1]; + v[2] = lamda*r01[2]*r01[2]; + v[3] = lamda*r01[0]*r01[1]; + v[4] = lamda*r01[0]*r01[2]; + v[5] = lamda*r01[1]*r01[2]; + + v_tally(ev,nlist,list,2.0,v); + } +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::shake3(int m, EV_FLOAT& ev) const +{ + + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access::value>(); + + int nlist,list[3]; + double v[6]; + double invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = map_array(d_shake_atom(m,0)); + int i1 = map_array(d_shake_atom(m,1)); + int i2 = map_array(d_shake_atom(m,2)); + double bond1 = d_bond_distance[d_shake_type(m,0)]; + double bond2 = d_bond_distance[d_shake_type(m,1)]; + + // r01,r02 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = d_x(i0,0) - d_x(i1,0); + r01[1] = d_x(i0,1) - d_x(i1,1); + r01[2] = d_x(i0,2) - d_x(i1,2); + minimum_image(r01); + + double r02[3]; + r02[0] = d_x(i0,0) - d_x(i2,0); + r02[1] = d_x(i0,1) - d_x(i2,1); + r02[2] = d_x(i0,2) - d_x(i2,2); + minimum_image(r02); + + // s01,s02 = distance vec after unconstrained update, with PBC + // use Domain::minimum_image_once(), not minimum_image() + // b/c xshake values might be huge, due to e.g. fix gcmc + + double s01[3]; + s01[0] = d_xshake(i0,0) - d_xshake(i1,0); + s01[1] = d_xshake(i0,1) - d_xshake(i1,1); + s01[2] = d_xshake(i0,2) - d_xshake(i1,2); + minimum_image_once(s01); + + double s02[3]; + s02[0] = d_xshake(i0,0) - d_xshake(i2,0); + s02[1] = d_xshake(i0,1) - d_xshake(i2,1); + s02[2] = d_xshake(i0,2) - d_xshake(i2,2); + minimum_image_once(s02); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2]; + + // matrix coeffs and rhs for lamda equations + + if (d_rmass.data()) { + invmass0 = 1.0/d_rmass[i0]; + invmass1 = 1.0/d_rmass[i1]; + invmass2 = 1.0/d_rmass[i2]; + } else { + invmass0 = 1.0/d_mass[d_type[i0]]; + invmass1 = 1.0/d_mass[d_type[i1]]; + invmass2 = 1.0/d_mass[d_type[i2]]; + } + + double a11 = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double a12 = 2.0 * invmass0 * + (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]); + double a21 = 2.0 * invmass0 * + (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]); + double a22 = 2.0 * (invmass0+invmass2) * + (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]); + + // inverse of matrix + + double determ = a11*a22 - a12*a21; + if (determ == 0.0) d_error_flag() = 3; + //error->one(FLERR,"Shake determinant = 0.0"); + double determinv = 1.0/determ; + + double a11inv = a22*determinv; + double a12inv = -a12*determinv; + double a21inv = -a21*determinv; + double a22inv = a11*determinv; + + // quadratic correction coeffs + + double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]); + + double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double quad1_0202 = invmass0*invmass0 * r02sq; + double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102; + + double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + double quad2_0101 = invmass0*invmass0 * r01sq; + double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102; + + // iterate until converged + + double lamda01 = 0.0; + double lamda02 = 0.0; + int niter = 0; + int done = 0; + + double quad1,quad2,b1,b2,lamda01_new,lamda02_new; + + while (!done && niter < max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + quad1_0202 * lamda02*lamda02 + + quad1_0102 * lamda01*lamda02; + quad2 = quad2_0101 * lamda01*lamda01 + quad2_0202 * lamda02*lamda02 + + quad2_0102 * lamda01*lamda02; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + + lamda01_new = a11inv*b1 + a12inv*b2; + lamda02_new = a21inv*b1 + a22inv*b2; + + done = 1; + if (fabs(lamda01_new-lamda01) > tolerance) done = 0; + if (fabs(lamda02_new-lamda02) > tolerance) done = 0; + + lamda01 = lamda01_new; + lamda02 = lamda02_new; + + // stop iterations before we have a floating point overflow + // max double is < 1.0e308, so 1e150 is a reasonable cutoff + + if (fabs(lamda01) > 1e150 || fabs(lamda02) > 1e150) done = 1; + + niter++; + } + + // update forces if atom is owned by this processor + + lamda01 = lamda01/dtfsq; + lamda02 = lamda02/dtfsq; + + if (i0 < nlocal) { + a_f(i0,0) += lamda01*r01[0] + lamda02*r02[0]; + a_f(i0,1) += lamda01*r01[1] + lamda02*r02[1]; + a_f(i0,2) += lamda01*r01[2] + lamda02*r02[2]; + } + + if (i1 < nlocal) { + a_f(i1,0) -= lamda01*r01[0]; + a_f(i1,1) -= lamda01*r01[1]; + a_f(i1,2) -= lamda01*r01[2]; + } + + if (i2 < nlocal) { + a_f(i2,0) -= lamda02*r02[0]; + a_f(i2,1) -= lamda02*r02[1]; + a_f(i2,2) -= lamda02*r02[2]; + } + + if (EVFLAG) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + if (i2 < nlocal) list[nlist++] = i2; + + v[0] = lamda01*r01[0]*r01[0] + lamda02*r02[0]*r02[0]; + v[1] = lamda01*r01[1]*r01[1] + lamda02*r02[1]*r02[1]; + v[2] = lamda01*r01[2]*r01[2] + lamda02*r02[2]*r02[2]; + v[3] = lamda01*r01[0]*r01[1] + lamda02*r02[0]*r02[1]; + v[4] = lamda01*r01[0]*r01[2] + lamda02*r02[0]*r02[2]; + v[5] = lamda01*r01[1]*r01[2] + lamda02*r02[1]*r02[2]; + + v_tally(ev,nlist,list,3.0,v); + } +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::shake4(int m, EV_FLOAT& ev) const +{ + + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access::value>(); + + int nlist,list[4]; + double v[6]; + double invmass0,invmass1,invmass2,invmass3; + + // local atom IDs and constraint distances + + int i0 = map_array(d_shake_atom(m,0)); + int i1 = map_array(d_shake_atom(m,1)); + int i2 = map_array(d_shake_atom(m,2)); + int i3 = map_array(d_shake_atom(m,3)); + double bond1 = d_bond_distance[d_shake_type(m,0)]; + double bond2 = d_bond_distance[d_shake_type(m,1)]; + double bond3 = d_bond_distance[d_shake_type(m,2)]; + + // r01,r02,r03 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = d_x(i0,0) - d_x(i1,0); + r01[1] = d_x(i0,1) - d_x(i1,1); + r01[2] = d_x(i0,2) - d_x(i1,2); + minimum_image(r01); + + double r02[3]; + r02[0] = d_x(i0,0) - d_x(i2,0); + r02[1] = d_x(i0,1) - d_x(i2,1); + r02[2] = d_x(i0,2) - d_x(i2,2); + minimum_image(r02); + + double r03[3]; + r03[0] = d_x(i0,0) - d_x(i3,0); + r03[1] = d_x(i0,1) - d_x(i3,1); + r03[2] = d_x(i0,2) - d_x(i3,2); + minimum_image(r03); + + // s01,s02,s03 = distance vec after unconstrained update, with PBC + // use Domain::minimum_image_once(), not minimum_image() + // b/c xshake values might be huge, due to e.g. fix gcmc + + double s01[3]; + s01[0] = d_xshake(i0,0) - d_xshake(i1,0); + s01[1] = d_xshake(i0,1) - d_xshake(i1,1); + s01[2] = d_xshake(i0,2) - d_xshake(i1,2); + minimum_image_once(s01); + + double s02[3]; + s02[0] = d_xshake(i0,0) - d_xshake(i2,0); + s02[1] = d_xshake(i0,1) - d_xshake(i2,1); + s02[2] = d_xshake(i0,2) - d_xshake(i2,2); + minimum_image_once(s02); + + double s03[3]; + s03[0] = d_xshake(i0,0) - d_xshake(i3,0); + s03[1] = d_xshake(i0,1) - d_xshake(i3,1); + s03[2] = d_xshake(i0,2) - d_xshake(i3,2); + minimum_image_once(s03); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2]; + double r03sq = r03[0]*r03[0] + r03[1]*r03[1] + r03[2]*r03[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2]; + double s03sq = s03[0]*s03[0] + s03[1]*s03[1] + s03[2]*s03[2]; + + // matrix coeffs and rhs for lamda equations + + if (d_rmass.data()) { + invmass0 = 1.0/d_rmass[i0]; + invmass1 = 1.0/d_rmass[i1]; + invmass2 = 1.0/d_rmass[i2]; + invmass3 = 1.0/d_rmass[i3]; + } else { + invmass0 = 1.0/d_mass[d_type[i0]]; + invmass1 = 1.0/d_mass[d_type[i1]]; + invmass2 = 1.0/d_mass[d_type[i2]]; + invmass3 = 1.0/d_mass[d_type[i3]]; + } + + double a11 = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double a12 = 2.0 * invmass0 * + (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]); + double a13 = 2.0 * invmass0 * + (s01[0]*r03[0] + s01[1]*r03[1] + s01[2]*r03[2]); + double a21 = 2.0 * invmass0 * + (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]); + double a22 = 2.0 * (invmass0+invmass2) * + (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]); + double a23 = 2.0 * invmass0 * + (s02[0]*r03[0] + s02[1]*r03[1] + s02[2]*r03[2]); + double a31 = 2.0 * invmass0 * + (s03[0]*r01[0] + s03[1]*r01[1] + s03[2]*r01[2]); + double a32 = 2.0 * invmass0 * + (s03[0]*r02[0] + s03[1]*r02[1] + s03[2]*r02[2]); + double a33 = 2.0 * (invmass0+invmass3) * + (s03[0]*r03[0] + s03[1]*r03[1] + s03[2]*r03[2]); + + // inverse of matrix; + + double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == 0.0) d_error_flag() = 3; + //error->one(FLERR,"Shake determinant = 0.0"); + double determinv = 1.0/determ; + + double a11inv = determinv * (a22*a33 - a23*a32); + double a12inv = -determinv * (a12*a33 - a13*a32); + double a13inv = determinv * (a12*a23 - a13*a22); + double a21inv = -determinv * (a21*a33 - a23*a31); + double a22inv = determinv * (a11*a33 - a13*a31); + double a23inv = -determinv * (a11*a23 - a13*a21); + double a31inv = determinv * (a21*a32 - a22*a31); + double a32inv = -determinv * (a11*a32 - a12*a31); + double a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]); + double r0103 = (r01[0]*r03[0] + r01[1]*r03[1] + r01[2]*r03[2]); + double r0203 = (r02[0]*r03[0] + r02[1]*r03[1] + r02[2]*r03[2]); + + double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double quad1_0202 = invmass0*invmass0 * r02sq; + double quad1_0303 = invmass0*invmass0 * r03sq; + double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102; + double quad1_0103 = 2.0 * (invmass0+invmass1)*invmass0 * r0103; + double quad1_0203 = 2.0 * invmass0*invmass0 * r0203; + + double quad2_0101 = invmass0*invmass0 * r01sq; + double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + double quad2_0303 = invmass0*invmass0 * r03sq; + double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102; + double quad2_0103 = 2.0 * invmass0*invmass0 * r0103; + double quad2_0203 = 2.0 * (invmass0+invmass2)*invmass0 * r0203; + + double quad3_0101 = invmass0*invmass0 * r01sq; + double quad3_0202 = invmass0*invmass0 * r02sq; + double quad3_0303 = (invmass0+invmass3)*(invmass0+invmass3) * r03sq; + double quad3_0102 = 2.0 * invmass0*invmass0 * r0102; + double quad3_0103 = 2.0 * (invmass0+invmass3)*invmass0 * r0103; + double quad3_0203 = 2.0 * (invmass0+invmass3)*invmass0 * r0203; + + // iterate until converged + + double lamda01 = 0.0; + double lamda02 = 0.0; + double lamda03 = 0.0; + int niter = 0; + int done = 0; + + double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda03_new; + + while (!done && niter < max_iter) { + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_0303 * lamda03*lamda03 + + quad1_0102 * lamda01*lamda02 + + quad1_0103 * lamda01*lamda03 + + quad1_0203 * lamda02*lamda03; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_0303 * lamda03*lamda03 + + quad2_0102 * lamda01*lamda02 + + quad2_0103 * lamda01*lamda03 + + quad2_0203 * lamda02*lamda03; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_0303 * lamda03*lamda03 + + quad3_0102 * lamda01*lamda02 + + quad3_0103 * lamda01*lamda03 + + quad3_0203 * lamda02*lamda03; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond3*bond3 - s03sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda03_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done = 1; + if (fabs(lamda01_new-lamda01) > tolerance) done = 0; + if (fabs(lamda02_new-lamda02) > tolerance) done = 0; + if (fabs(lamda03_new-lamda03) > tolerance) done = 0; + + lamda01 = lamda01_new; + lamda02 = lamda02_new; + lamda03 = lamda03_new; + + // stop iterations before we have a floating point overflow + // max double is < 1.0e308, so 1e150 is a reasonable cutoff + + if (fabs(lamda01) > 1e150 || fabs(lamda02) > 1e150 + || fabs(lamda03) > 1e150) done = 1; + + niter++; + } + + // update forces if atom is owned by this processor + + lamda01 = lamda01/dtfsq; + lamda02 = lamda02/dtfsq; + lamda03 = lamda03/dtfsq; + + if (i0 < nlocal) { + a_f(i0,0) += lamda01*r01[0] + lamda02*r02[0] + lamda03*r03[0]; + a_f(i0,1) += lamda01*r01[1] + lamda02*r02[1] + lamda03*r03[1]; + a_f(i0,2) += lamda01*r01[2] + lamda02*r02[2] + lamda03*r03[2]; + } + + if (i1 < nlocal) { + a_f(i1,0) -= lamda01*r01[0]; + a_f(i1,1) -= lamda01*r01[1]; + a_f(i1,2) -= lamda01*r01[2]; + } + + if (i2 < nlocal) { + a_f(i2,0) -= lamda02*r02[0]; + a_f(i2,1) -= lamda02*r02[1]; + a_f(i2,2) -= lamda02*r02[2]; + } + + if (i3 < nlocal) { + a_f(i3,0) -= lamda03*r03[0]; + a_f(i3,1) -= lamda03*r03[1]; + a_f(i3,2) -= lamda03*r03[2]; + } + + if (EVFLAG) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + if (i2 < nlocal) list[nlist++] = i2; + if (i3 < nlocal) list[nlist++] = i3; + + v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda03*r03[0]*r03[0]; + v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda03*r03[1]*r03[1]; + v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda03*r03[2]*r03[2]; + v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda03*r03[0]*r03[1]; + v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda03*r03[0]*r03[2]; + v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda03*r03[1]*r03[2]; + + v_tally(ev,nlist,list,4.0,v); + } +} + +/* ---------------------------------------------------------------------- */ + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::shake3angle(int m, EV_FLOAT& ev) const +{ + + // The f array is duplicated for OpenMP, atomic for CUDA, and neither for Serial + + auto v_f = ScatterViewHelper::value,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access::value>(); + + int nlist,list[3]; + double v[6]; + double invmass0,invmass1,invmass2; + + // local atom IDs and constraint distances + + int i0 = map_array(d_shake_atom(m,0)); + int i1 = map_array(d_shake_atom(m,1)); + int i2 = map_array(d_shake_atom(m,2)); + double bond1 = d_bond_distance[d_shake_type(m,0)]; + double bond2 = d_bond_distance[d_shake_type(m,1)]; + double bond12 = d_angle_distance[d_shake_type(m,2)]; + + // r01,r02,r12 = distance vec between atoms, with PBC + + double r01[3]; + r01[0] = d_x(i0,0) - d_x(i1,0); + r01[1] = d_x(i0,1) - d_x(i1,1); + r01[2] = d_x(i0,2) - d_x(i1,2); + minimum_image(r01); + + double r02[3]; + r02[0] = d_x(i0,0) - d_x(i2,0); + r02[1] = d_x(i0,1) - d_x(i2,1); + r02[2] = d_x(i0,2) - d_x(i2,2); + minimum_image(r02); + + double r12[3]; + r12[0] = d_x(i1,0) - d_x(i2,0); + r12[1] = d_x(i1,1) - d_x(i2,1); + r12[2] = d_x(i1,2) - d_x(i2,2); + minimum_image(r12); + + // s01,s02,s12 = distance vec after unconstrained update, with PBC + // use Domain::minimum_image_once(), not minimum_image() + // b/c xshake values might be huge, due to e.g. fix gcmc + + double s01[3]; + s01[0] = d_xshake(i0,0) - d_xshake(i1,0); + s01[1] = d_xshake(i0,1) - d_xshake(i1,1); + s01[2] = d_xshake(i0,2) - d_xshake(i1,2); + minimum_image_once(s01); + + double s02[3]; + s02[0] = d_xshake(i0,0) - d_xshake(i2,0); + s02[1] = d_xshake(i0,1) - d_xshake(i2,1); + s02[2] = d_xshake(i0,2) - d_xshake(i2,2); + minimum_image_once(s02); + + double s12[3]; + s12[0] = d_xshake(i1,0) - d_xshake(i2,0); + s12[1] = d_xshake(i1,1) - d_xshake(i2,1); + s12[2] = d_xshake(i1,2) - d_xshake(i2,2); + minimum_image_once(s12); + + // scalar distances between atoms + + double r01sq = r01[0]*r01[0] + r01[1]*r01[1] + r01[2]*r01[2]; + double r02sq = r02[0]*r02[0] + r02[1]*r02[1] + r02[2]*r02[2]; + double r12sq = r12[0]*r12[0] + r12[1]*r12[1] + r12[2]*r12[2]; + double s01sq = s01[0]*s01[0] + s01[1]*s01[1] + s01[2]*s01[2]; + double s02sq = s02[0]*s02[0] + s02[1]*s02[1] + s02[2]*s02[2]; + double s12sq = s12[0]*s12[0] + s12[1]*s12[1] + s12[2]*s12[2]; + + // matrix coeffs and rhs for lamda equations + + if (d_rmass.data()) { + invmass0 = 1.0/d_rmass[i0]; + invmass1 = 1.0/d_rmass[i1]; + invmass2 = 1.0/d_rmass[i2]; + } else { + invmass0 = 1.0/d_mass[d_type[i0]]; + invmass1 = 1.0/d_mass[d_type[i1]]; + invmass2 = 1.0/d_mass[d_type[i2]]; + } + + double a11 = 2.0 * (invmass0+invmass1) * + (s01[0]*r01[0] + s01[1]*r01[1] + s01[2]*r01[2]); + double a12 = 2.0 * invmass0 * + (s01[0]*r02[0] + s01[1]*r02[1] + s01[2]*r02[2]); + double a13 = - 2.0 * invmass1 * + (s01[0]*r12[0] + s01[1]*r12[1] + s01[2]*r12[2]); + double a21 = 2.0 * invmass0 * + (s02[0]*r01[0] + s02[1]*r01[1] + s02[2]*r01[2]); + double a22 = 2.0 * (invmass0+invmass2) * + (s02[0]*r02[0] + s02[1]*r02[1] + s02[2]*r02[2]); + double a23 = 2.0 * invmass2 * + (s02[0]*r12[0] + s02[1]*r12[1] + s02[2]*r12[2]); + double a31 = - 2.0 * invmass1 * + (s12[0]*r01[0] + s12[1]*r01[1] + s12[2]*r01[2]); + double a32 = 2.0 * invmass2 * + (s12[0]*r02[0] + s12[1]*r02[1] + s12[2]*r02[2]); + double a33 = 2.0 * (invmass1+invmass2) * + (s12[0]*r12[0] + s12[1]*r12[1] + s12[2]*r12[2]); + + // inverse of matrix + + double determ = a11*a22*a33 + a12*a23*a31 + a13*a21*a32 - + a11*a23*a32 - a12*a21*a33 - a13*a22*a31; + if (determ == 0.0) d_error_flag() = 3; + //error->one(FLERR,"Shake determinant = 0.0"); + double determinv = 1.0/determ; + + double a11inv = determinv * (a22*a33 - a23*a32); + double a12inv = -determinv * (a12*a33 - a13*a32); + double a13inv = determinv * (a12*a23 - a13*a22); + double a21inv = -determinv * (a21*a33 - a23*a31); + double a22inv = determinv * (a11*a33 - a13*a31); + double a23inv = -determinv * (a11*a23 - a13*a21); + double a31inv = determinv * (a21*a32 - a22*a31); + double a32inv = -determinv * (a11*a32 - a12*a31); + double a33inv = determinv * (a11*a22 - a12*a21); + + // quadratic correction coeffs + + double r0102 = (r01[0]*r02[0] + r01[1]*r02[1] + r01[2]*r02[2]); + double r0112 = (r01[0]*r12[0] + r01[1]*r12[1] + r01[2]*r12[2]); + double r0212 = (r02[0]*r12[0] + r02[1]*r12[1] + r02[2]*r12[2]); + + double quad1_0101 = (invmass0+invmass1)*(invmass0+invmass1) * r01sq; + double quad1_0202 = invmass0*invmass0 * r02sq; + double quad1_1212 = invmass1*invmass1 * r12sq; + double quad1_0102 = 2.0 * (invmass0+invmass1)*invmass0 * r0102; + double quad1_0112 = - 2.0 * (invmass0+invmass1)*invmass1 * r0112; + double quad1_0212 = - 2.0 * invmass0*invmass1 * r0212; + + double quad2_0101 = invmass0*invmass0 * r01sq; + double quad2_0202 = (invmass0+invmass2)*(invmass0+invmass2) * r02sq; + double quad2_1212 = invmass2*invmass2 * r12sq; + double quad2_0102 = 2.0 * (invmass0+invmass2)*invmass0 * r0102; + double quad2_0112 = 2.0 * invmass0*invmass2 * r0112; + double quad2_0212 = 2.0 * (invmass0+invmass2)*invmass2 * r0212; + + double quad3_0101 = invmass1*invmass1 * r01sq; + double quad3_0202 = invmass2*invmass2 * r02sq; + double quad3_1212 = (invmass1+invmass2)*(invmass1+invmass2) * r12sq; + double quad3_0102 = - 2.0 * invmass1*invmass2 * r0102; + double quad3_0112 = - 2.0 * (invmass1+invmass2)*invmass1 * r0112; + double quad3_0212 = 2.0 * (invmass1+invmass2)*invmass2 * r0212; + + // iterate until converged + + double lamda01 = 0.0; + double lamda02 = 0.0; + double lamda12 = 0.0; + int niter = 0; + int done = 0; + + double quad1,quad2,quad3,b1,b2,b3,lamda01_new,lamda02_new,lamda12_new; + + while (!done && niter < max_iter) { + + quad1 = quad1_0101 * lamda01*lamda01 + + quad1_0202 * lamda02*lamda02 + + quad1_1212 * lamda12*lamda12 + + quad1_0102 * lamda01*lamda02 + + quad1_0112 * lamda01*lamda12 + + quad1_0212 * lamda02*lamda12; + + quad2 = quad2_0101 * lamda01*lamda01 + + quad2_0202 * lamda02*lamda02 + + quad2_1212 * lamda12*lamda12 + + quad2_0102 * lamda01*lamda02 + + quad2_0112 * lamda01*lamda12 + + quad2_0212 * lamda02*lamda12; + + quad3 = quad3_0101 * lamda01*lamda01 + + quad3_0202 * lamda02*lamda02 + + quad3_1212 * lamda12*lamda12 + + quad3_0102 * lamda01*lamda02 + + quad3_0112 * lamda01*lamda12 + + quad3_0212 * lamda02*lamda12; + + b1 = bond1*bond1 - s01sq - quad1; + b2 = bond2*bond2 - s02sq - quad2; + b3 = bond12*bond12 - s12sq - quad3; + + lamda01_new = a11inv*b1 + a12inv*b2 + a13inv*b3; + lamda02_new = a21inv*b1 + a22inv*b2 + a23inv*b3; + lamda12_new = a31inv*b1 + a32inv*b2 + a33inv*b3; + + done = 1; + if (fabs(lamda01_new-lamda01) > tolerance) done = 0; + if (fabs(lamda02_new-lamda02) > tolerance) done = 0; + if (fabs(lamda12_new-lamda12) > tolerance) done = 0; + + lamda01 = lamda01_new; + lamda02 = lamda02_new; + lamda12 = lamda12_new; + + // stop iterations before we have a floating point overflow + // max double is < 1.0e308, so 1e150 is a reasonable cutoff + + if (fabs(lamda01) > 1e150 || fabs(lamda02) > 1e150 + || fabs(lamda12) > 1e150) done = 1; + + niter++; + } + + // update forces if atom is owned by this processor + + lamda01 = lamda01/dtfsq; + lamda02 = lamda02/dtfsq; + lamda12 = lamda12/dtfsq; + + if (i0 < nlocal) { + a_f(i0,0) += lamda01*r01[0] + lamda02*r02[0]; + a_f(i0,1) += lamda01*r01[1] + lamda02*r02[1]; + a_f(i0,2) += lamda01*r01[2] + lamda02*r02[2]; + } + + if (i1 < nlocal) { + a_f(i1,0) -= lamda01*r01[0] - lamda12*r12[0]; + a_f(i1,1) -= lamda01*r01[1] - lamda12*r12[1]; + a_f(i1,2) -= lamda01*r01[2] - lamda12*r12[2]; + } + + if (i2 < nlocal) { + a_f(i2,0) -= lamda02*r02[0] + lamda12*r12[0]; + a_f(i2,1) -= lamda02*r02[1] + lamda12*r12[1]; + a_f(i2,2) -= lamda02*r02[2] + lamda12*r12[2]; + } + + if (EVFLAG) { + nlist = 0; + if (i0 < nlocal) list[nlist++] = i0; + if (i1 < nlocal) list[nlist++] = i1; + if (i2 < nlocal) list[nlist++] = i2; + + v[0] = lamda01*r01[0]*r01[0]+lamda02*r02[0]*r02[0]+lamda12*r12[0]*r12[0]; + v[1] = lamda01*r01[1]*r01[1]+lamda02*r02[1]*r02[1]+lamda12*r12[1]*r12[1]; + v[2] = lamda01*r01[2]*r01[2]+lamda02*r02[2]*r02[2]+lamda12*r12[2]*r12[2]; + v[3] = lamda01*r01[0]*r01[1]+lamda02*r02[0]*r02[1]+lamda12*r12[0]*r12[1]; + v[4] = lamda01*r01[0]*r01[2]+lamda02*r02[0]*r02[2]+lamda12*r12[0]*r12[2]; + v[5] = lamda01*r01[1]*r01[2]+lamda02*r02[1]*r02[2]+lamda12*r12[1]*r12[2]; + + v_tally(ev,nlist,list,3.0,v); + } +} + +/* ---------------------------------------------------------------------- + allocate local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::grow_arrays(int nmax) +{ + memoryKK->grow_kokkos(k_shake_flag,shake_flag,nmax,"shake:shake_flag"); + memoryKK->grow_kokkos(k_shake_atom,shake_atom,nmax,4,"shake:shake_atom"); + memoryKK->grow_kokkos(k_shake_type,shake_type,nmax,3,"shake:shake_type"); + memoryKK->destroy_kokkos(k_xshake,xshake); + memoryKK->create_kokkos(k_xshake,xshake,nmax,"shake:xshake"); + + d_shake_flag = k_shake_flag.view(); + d_shake_atom = k_shake_atom.view(); + d_shake_type = k_shake_type.view(); + d_xshake = k_xshake.view(); + + memory->destroy(ftmp); + memory->create(ftmp,nmax,3,"shake:ftmp"); + memory->destroy(vtmp); + memory->create(vtmp,nmax,3,"shake:vtmp"); +} + +/* ---------------------------------------------------------------------- + copy values within local atom-based arrays +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::copy_arrays(int i, int j, int delflag) +{ + k_shake_flag.sync_host(); + k_shake_atom.sync_host(); + k_shake_type.sync_host(); + + FixShake::copy_arrays(i,j,delflag); + + k_shake_flag.modify_host(); + k_shake_atom.modify_host(); + k_shake_type.modify_host(); +} + +/* ---------------------------------------------------------------------- + initialize one atom's array values, called when atom is created +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::set_arrays(int i) +{ + k_shake_flag.sync_host(); + + shake_flag[i] = 0; + + k_shake_flag.modify_host(); +} + +/* ---------------------------------------------------------------------- + update one atom's array values + called when molecule is created from fix gcmc +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::update_arrays(int i, int atom_offset) +{ + k_shake_flag.sync_host(); + k_shake_atom.sync_host(); + + FixShake::update_arrays(i,atom_offset); + + k_shake_flag.modify_host(); + k_shake_atom.modify_host(); +} + +/* ---------------------------------------------------------------------- + pack values in local atom-based arrays for exchange with another proc +------------------------------------------------------------------------- */ + +template +int FixShakeKokkos::pack_exchange(int i, double *buf) +{ + k_shake_flag.sync_host(); + k_shake_atom.sync_host(); + k_shake_type.sync_host(); + + int m = FixShake::pack_exchange(i,buf); + + k_shake_flag.modify_host(); + k_shake_atom.modify_host(); + k_shake_type.modify_host(); + + return m; +} + +/* ---------------------------------------------------------------------- + unpack values in local atom-based arrays from exchange with another proc +------------------------------------------------------------------------- */ + +template +int FixShakeKokkos::unpack_exchange(int nlocal, double *buf) +{ + k_shake_flag.sync_host(); + k_shake_atom.sync_host(); + k_shake_type.sync_host(); + + int m = FixShake::unpack_exchange(nlocal,buf); + + k_shake_flag.modify_host(); + k_shake_atom.modify_host(); + k_shake_type.modify_host(); + + return m; +} + +/* ---------------------------------------------------------------------- */ + +template +int FixShakeKokkos::pack_forward_comm_fix_kokkos(int n, DAT::tdual_int_2d k_sendlist, + int iswap_in, DAT::tdual_xfloat_1d &k_buf, + int pbc_flag, int* pbc) +{ + d_sendlist = k_sendlist.view(); + iswap = iswap_in; + d_buf = k_buf.view(); + + if (domain->triclinic == 0) { + dx = pbc[0]*domain->xprd; + dy = pbc[1]*domain->yprd; + dz = pbc[2]*domain->zprd; + } else { + dx = pbc[0]*domain->xprd + pbc[5]*domain->xy + pbc[4]*domain->xz; + dy = pbc[1]*domain->yprd + pbc[3]*domain->yz; + dz = pbc[2]*domain->zprd; + } + + if (pbc_flag) + Kokkos::parallel_for(Kokkos::RangePolicy >(0,n),*this); + else + Kokkos::parallel_for(Kokkos::RangePolicy >(0,n),*this); + return n*3; +} + +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::operator()(TagFixShakePackForwardComm, const int &i) const { + const int j = d_sendlist(iswap, i); + + if (PBC_FLAG == 0) { + d_buf[3*i] = d_xshake(j,0); + d_buf[3*i+1] = d_xshake(j,1); + d_buf[3*i+2] = d_xshake(j,2); + } else { + d_buf[3*i] = d_xshake(j,0) + dx; + d_buf[3*i+1] = d_xshake(j,1) + dy; + d_buf[3*i+2] = d_xshake(j,2) + dz; + } +} + +/* ---------------------------------------------------------------------- */ + +template +int FixShakeKokkos::pack_forward_comm(int n, int *list, double *buf, + int pbc_flag, int *pbc) +{ + k_xshake.sync_host(); + + int m = FixShake::pack_forward_comm(n,list,buf,pbc_flag,pbc); + + k_xshake.modify_host(); + + return m; +} + +/* ---------------------------------------------------------------------- */ + +template +void FixShakeKokkos::unpack_forward_comm_fix_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf) +{ + first = first_in; + d_buf = buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); +} + +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::operator()(TagFixShakeUnpackForwardComm, const int &i) const { + d_xshake(i + first,0) = d_buf[3*i]; + d_xshake(i + first,1) = d_buf[3*i+1]; + d_xshake(i + first,2) = d_buf[3*i+2]; +} + +/* ---------------------------------------------------------------------- */ + +template +void FixShakeKokkos::unpack_forward_comm(int n, int first, double *buf) +{ + k_xshake.sync_host(); + + FixShake::unpack_forward_comm(n,first,buf); + + k_xshake.modify_host(); +} + +/* ---------------------------------------------------------------------- + add coordinate constraining forces + this method is called at the end of a timestep +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::shake_end_of_step(int vflag) { + dtv = update->dt; + dtfsq = 0.5 * update->dt * update->dt * force->ftm2v; + FixShakeKokkos::post_force(vflag); + if (!rattle) dtfsq = update->dt * update->dt * force->ftm2v; +} + +/* ---------------------------------------------------------------------- + calculate constraining forces based on the current configuration + change coordinates +------------------------------------------------------------------------- */ + +template +void FixShakeKokkos::correct_coordinates(int vflag) { + atomKK->sync(Host,X_MASK|V_MASK|F_MASK); + + // save current forces and velocities so that you + // initialize them to zero such that FixShake::unconstrained_coordinate_update has no effect + + for (int j=0; jmodified(Host,V_MASK|F_MASK); + + // call SHAKE to correct the coordinates which were updated without constraints + // IMPORTANT: use 1 as argument and thereby enforce velocity Verlet + + dtfsq = 0.5 * update->dt * update->dt * force->ftm2v; + FixShakeKokkos::post_force(vflag); + + atomKK->sync(Host,X_MASK|F_MASK); + + // integrate coordinates: x' = xnp1 + dt^2/2m_i * f, where f is the constraining force + // NOTE: After this command, the coordinates geometry of the molecules will be correct! + + double dtfmsq; + if (rmass) { + for (int i = 0; i < nlocal; i++) { + dtfmsq = dtfsq/ rmass[i]; + x[i][0] = x[i][0] + dtfmsq*f[i][0]; + x[i][1] = x[i][1] + dtfmsq*f[i][1]; + x[i][2] = x[i][2] + dtfmsq*f[i][2]; + } + } + else { + for (int i = 0; i < nlocal; i++) { + dtfmsq = dtfsq / mass[type[i]]; + x[i][0] = x[i][0] + dtfmsq*f[i][0]; + x[i][1] = x[i][1] + dtfmsq*f[i][1]; + x[i][2] = x[i][2] + dtfmsq*f[i][2]; + } + } + + // copy forces and velocities back + + for (int j=0; jdt * update->dt * force->ftm2v; + + // communicate changes + // NOTE: for compatibility xshake is temporarily set to x, such that pack/unpack_forward + // can be used for communicating the coordinates. + + double **xtmp = xshake; + xshake = x; + if (nprocs > 1) { + forward_comm_device = 0; + comm->forward_comm_fix(this); + forward_comm_device = 1; + } + xshake = xtmp; + + atomKK->modified(Host,X_MASK|V_MASK|F_MASK); +} + +/* ---------------------------------------------------------------------- + tally virial into global and per-atom accumulators + n = # of local owned atoms involved, with local indices in list + v = total virial for the interaction involving total atoms + increment global virial by n/total fraction + increment per-atom virial of each atom in list by 1/total fraction + this method can be used when fix computes forces in post_force() + e.g. fix shake, fix rigid: compute virial only on owned atoms + whether newton_bond is on or off + other procs will tally left-over fractions for atoms they own +------------------------------------------------------------------------- */ +template +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::v_tally(EV_FLOAT &ev, int n, int *list, double total, + double *v) const +{ + int m; + + if (vflag_global) { + double fraction = n/total; + ev.v[0] += fraction*v[0]; + ev.v[1] += fraction*v[1]; + ev.v[2] += fraction*v[2]; + ev.v[3] += fraction*v[3]; + ev.v[4] += fraction*v[4]; + ev.v[5] += fraction*v[5]; + } + + if (vflag_atom) { + double fraction = 1.0/total; + for (int i = 0; i < n; i++) { + auto v_vatom = ScatterViewHelper::value,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access::value>(); + m = list[i]; + a_vatom(m,0) += fraction*v[0]; + a_vatom(m,1) += fraction*v[1]; + a_vatom(m,2) += fraction*v[2]; + a_vatom(m,3) += fraction*v[3]; + a_vatom(m,4) += fraction*v[4]; + a_vatom(m,5) += fraction*v[5]; + } + } +} + +/* ---------------------------------------------------------------------- */ + +template +void FixShakeKokkos::update_domain_variables() +{ + triclinic = domain->triclinic; + xperiodic = domain->xperiodic; + xprd_half = domain->xprd_half; + xprd = domain->xprd; + yperiodic = domain->yperiodic; + yprd_half = domain->yprd_half; + yprd = domain->yprd; + zperiodic = domain->zperiodic; + zprd_half = domain->zprd_half; + zprd = domain->zprd; + xy = domain->xy; + xz = domain->xz; + yz = domain->yz; +} + +/* ---------------------------------------------------------------------- + minimum image convention in periodic dimensions + use 1/2 of box size as test + for triclinic, also add/subtract tilt factors in other dims as needed + changed "if" to "while" to enable distance to + far-away ghost atom returned by atom->map() to be wrapped back into box + could be problem for looking up atom IDs when cutoff > boxsize + this should not be used if atom has moved infinitely far outside box + b/c while could iterate forever + e.g. fix shake prediction of new position with highly overlapped atoms + use minimum_image_once() instead + copied from domain.cpp +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::minimum_image(double *delta) const +{ + if (triclinic == 0) { + if (xperiodic) { + while (fabs(delta[0]) > xprd_half) { + if (delta[0] < 0.0) delta[0] += xprd; + else delta[0] -= xprd; + } + } + if (yperiodic) { + while (fabs(delta[1]) > yprd_half) { + if (delta[1] < 0.0) delta[1] += yprd; + else delta[1] -= yprd; + } + } + if (zperiodic) { + while (fabs(delta[2]) > zprd_half) { + if (delta[2] < 0.0) delta[2] += zprd; + else delta[2] -= zprd; + } + } + + } else { + if (zperiodic) { + while (fabs(delta[2]) > zprd_half) { + if (delta[2] < 0.0) { + delta[2] += zprd; + delta[1] += yz; + delta[0] += xz; + } else { + delta[2] -= zprd; + delta[1] -= yz; + delta[0] -= xz; + } + } + } + if (yperiodic) { + while (fabs(delta[1]) > yprd_half) { + if (delta[1] < 0.0) { + delta[1] += yprd; + delta[0] += xy; + } else { + delta[1] -= yprd; + delta[0] -= xy; + } + } + } + if (xperiodic) { + while (fabs(delta[0]) > xprd_half) { + if (delta[0] < 0.0) delta[0] += xprd; + else delta[0] -= xprd; + } + } + } +} + +/* ---------------------------------------------------------------------- + minimum image convention in periodic dimensions + use 1/2 of box size as test + for triclinic, also add/subtract tilt factors in other dims as needed + only shift by one box length in each direction + this should not be used if multiple box shifts are required + copied from domain.cpp +------------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void FixShakeKokkos::minimum_image_once(double *delta) const +{ + if (triclinic == 0) { + if (xperiodic) { + if (fabs(delta[0]) > xprd_half) { + if (delta[0] < 0.0) delta[0] += xprd; + else delta[0] -= xprd; + } + } + if (yperiodic) { + if (fabs(delta[1]) > yprd_half) { + if (delta[1] < 0.0) delta[1] += yprd; + else delta[1] -= yprd; + } + } + if (zperiodic) { + if (fabs(delta[2]) > zprd_half) { + if (delta[2] < 0.0) delta[2] += zprd; + else delta[2] -= zprd; + } + } + + } else { + if (zperiodic) { + if (fabs(delta[2]) > zprd_half) { + if (delta[2] < 0.0) { + delta[2] += zprd; + delta[1] += yz; + delta[0] += xz; + } else { + delta[2] -= zprd; + delta[1] -= yz; + delta[0] -= xz; + } + } + } + if (yperiodic) { + if (fabs(delta[1]) > yprd_half) { + if (delta[1] < 0.0) { + delta[1] += yprd; + delta[0] += xy; + } else { + delta[1] -= yprd; + delta[0] -= xy; + } + } + } + if (xperiodic) { + if (fabs(delta[0]) > xprd_half) { + if (delta[0] < 0.0) delta[0] += xprd; + else delta[0] -= xprd; + } + } + } +} + +/* ---------------------------------------------------------------------- */ + +namespace LAMMPS_NS { +template class FixShakeKokkos; +#ifdef LMP_KOKKOS_GPU +template class FixShakeKokkos; +#endif +} + diff --git a/src/KOKKOS/fix_shake_kokkos.h b/src/KOKKOS/fix_shake_kokkos.h new file mode 100644 index 0000000000..40aecb4a44 --- /dev/null +++ b/src/KOKKOS/fix_shake_kokkos.h @@ -0,0 +1,210 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ + +#ifdef FIX_CLASS + +FixStyle(shake/kk,FixShakeKokkos) +FixStyle(shake/kk/device,FixShakeKokkos) +FixStyle(shake/kk/host,FixShakeKokkos) + +#else + +#ifndef LMP_FIX_SHAKE_KOKKOS_H +#define LMP_FIX_SHAKE_KOKKOS_H + +#include "fix_shake.h" +#include "kokkos_type.h" +#include "kokkos_base.h" + +namespace LAMMPS_NS { + +template +struct TagFixShakePostForce{}; + +template +struct TagFixShakePackForwardComm{}; + +struct TagFixShakeUnpackForwardComm{}; + +template +class FixShakeKokkos : public FixShake, public KokkosBase { + + //friend class FixEHEX; + + public: + typedef DeviceType device_type; + typedef EV_FLOAT value_type; + typedef ArrayTypes AT; + + FixShakeKokkos(class LAMMPS *, int, char **); + virtual ~FixShakeKokkos(); + void init(); + void pre_neighbor(); + void post_force(int); + + void grow_arrays(int); + void copy_arrays(int, int, int); + void set_arrays(int); + void update_arrays(int, int); + + int pack_exchange(int, double *); + int unpack_exchange(int, double *); + int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&, + int, int *); + void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_xfloat_1d&); + int pack_forward_comm(int, int *, double *, int, int *); + void unpack_forward_comm(int, int, double *); + + void shake_end_of_step(int vflag); + void correct_coordinates(int vflag); + + int dof(int); + + void unconstrained_update(); + + template + KOKKOS_INLINE_FUNCTION + void operator()(TagFixShakePostForce, const int&, EV_FLOAT&) const; + + template + KOKKOS_INLINE_FUNCTION + void operator()(TagFixShakePostForce, const int&) const; + + template + KOKKOS_INLINE_FUNCTION + void operator()(TagFixShakePackForwardComm, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagFixShakeUnpackForwardComm, const int&) const; + + protected: + + typename AT::t_x_array d_x; + typename AT::t_v_array d_v; + typename AT::t_f_array d_f; + typename AT::t_float_1d d_rmass; + typename AT::t_float_1d d_mass; + typename AT::t_tagint_1d_randomread d_tag; + typename AT::t_int_1d d_type; + typename AT::t_int_1d d_mask; + + DAT::tdual_efloat_1d k_eatom; + typename AT::t_efloat_1d d_eatom; + + DAT::tdual_virial_array k_vatom; + typename AT::t_virial_array d_vatom; + + DAT::tdual_float_1d k_bond_distance; // constraint distances + typename AT::t_float_1d d_bond_distance; + DAT::tdual_float_1d k_angle_distance; + typename AT::t_float_1d d_angle_distance; + + // atom-based arrays + DAT::tdual_int_1d k_shake_flag; + typename AT::t_int_1d d_shake_flag; // 0 if atom not in SHAKE cluster + // 1 = size 3 angle cluster + // 2,3,4 = size of bond-only cluster + DAT::tdual_tagint_2d k_shake_atom; + typename AT::t_tagint_2d d_shake_atom; // global IDs of atoms in cluster + // central atom is 1st + // lowest global ID is 1st for size 2 + DAT::tdual_int_2d k_shake_type; + typename AT::t_int_2d d_shake_type; // bondtype of each bond in cluster + // for angle cluster, 3rd value + // is angletype + DAT::tdual_x_array k_xshake; + typename AT::t_x_array d_xshake; // unconstrained atom coords + + DAT::tdual_int_1d k_list; + typename AT::t_int_1d d_list; // list of clusters to SHAKE + + DAT::tdual_int_scalar k_error_flag; + DAT::tdual_int_scalar k_nlist; + + + template + KOKKOS_INLINE_FUNCTION + void shake(int, EV_FLOAT&) const; + + template + KOKKOS_INLINE_FUNCTION + void shake3(int, EV_FLOAT&) const; + + template + KOKKOS_INLINE_FUNCTION + void shake4(int, EV_FLOAT&) const; + + template + KOKKOS_INLINE_FUNCTION + void shake3angle(int, EV_FLOAT&) const; + + typedef typename KKDevice::value KKDeviceType; + Kokkos::Experimental::ScatterView::value,typename Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_f; + Kokkos::Experimental::ScatterView::value,typename Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_eatom; + Kokkos::Experimental::ScatterView::value,typename Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterDuplicated> dup_vatom; + + Kokkos::Experimental::ScatterView::value,typename Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_f; + Kokkos::Experimental::ScatterView::value,typename Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_eatom; + Kokkos::Experimental::ScatterView::value,typename Kokkos::Experimental::ScatterSum,Kokkos::Experimental::ScatterNonDuplicated> ndup_vatom; + + int neighflag,need_dup; + + typename AT::t_int_1d d_scalars; + HAT::t_int_1d h_scalars; + typename AT::t_int_scalar d_error_flag; + typename AT::t_int_scalar d_nlist; + HAT::t_int_scalar h_error_flag; + HAT::t_int_scalar h_nlist; + + template + KOKKOS_INLINE_FUNCTION + void v_tally(EV_FLOAT&, int, int *, double, double *) const; + + DAT::tdual_int_1d k_map_array; + typename AT::t_int_1d_randomread map_array; + + int iswap; + int first; + typename AT::t_int_2d d_sendlist; + typename AT::t_xfloat_1d_um d_buf; + X_FLOAT dx,dy,dz; + + int *shake_flag_tmp; + tagint **shake_atom_tmp; + int **shake_type_tmp; + + // copied from Domain + + KOKKOS_INLINE_FUNCTION + void minimum_image(double *) const; + + KOKKOS_INLINE_FUNCTION + void minimum_image_once(double *) const; + + void update_domain_variables(); + + int triclinic; + int xperiodic,yperiodic,zperiodic; + X_FLOAT xprd_half,yprd_half,zprd_half; + X_FLOAT xprd,yprd,zprd; + X_FLOAT xy,xz,yz; +}; + +} + +#endif +#endif + +/* ERROR/WARNING messages: + +*/ diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h index 1d8f1e93e2..adacaf6ad8 100644 --- a/src/KOKKOS/kokkos_base.h +++ b/src/KOKKOS/kokkos_base.h @@ -28,6 +28,13 @@ class KokkosBase { int, int *) {return 0;}; virtual void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {} + // Fix + virtual int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_2d, + int, DAT::tdual_xfloat_1d &, + int, int *) {return 0;}; + virtual void unpack_forward_comm_fix_kokkos(int, int, DAT::tdual_xfloat_1d &) {} + + // Region virtual void match_all_kokkos(int, DAT::tdual_int_1d) {} }; diff --git a/src/RIGID/fix_shake.cpp b/src/RIGID/fix_shake.cpp index 1244cd36fb..365fc16fd7 100644 --- a/src/RIGID/fix_shake.cpp +++ b/src/RIGID/fix_shake.cpp @@ -240,6 +240,8 @@ FixShake::FixShake(LAMMPS *lmp, int narg, char **arg) : FixShake::~FixShake() { + if (copymode) return; + // unregister callbacks to this fix from Atom class atom->delete_callback(id,Atom::GROW); diff --git a/src/RIGID/fix_shake.h b/src/RIGID/fix_shake.h index ee9f520fea..53d61397bf 100644 --- a/src/RIGID/fix_shake.h +++ b/src/RIGID/fix_shake.h @@ -34,14 +34,14 @@ class FixShake : public Fix { virtual int setmask(); virtual void init(); void setup(int); - void pre_neighbor(); + virtual void pre_neighbor(); virtual void post_force(int); virtual void post_force_respa(int, int, int); virtual double memory_usage(); virtual void grow_arrays(int); virtual void copy_arrays(int, int, int); - void set_arrays(int); + virtual void set_arrays(int); virtual void update_arrays(int, int); void set_molecule(int, tagint, int, double *, double *, double *); @@ -54,7 +54,7 @@ class FixShake : public Fix { virtual void correct_coordinates(int vflag); virtual void correct_velocities(); - int dof(int); + virtual int dof(int); virtual void reset_dt(); void *extract(const char *, int &); @@ -126,7 +126,7 @@ class FixShake : public Fix { void shake_info(int *, tagint **, int **); int masscheck(double); - void unconstrained_update(); + virtual void unconstrained_update(); void unconstrained_update_respa(int); void shake(int); void shake3(int); diff --git a/src/fix.cpp b/src/fix.cpp index 16628e5374..905374ee05 100644 --- a/src/fix.cpp +++ b/src/fix.cpp @@ -111,6 +111,7 @@ Fix::Fix(LAMMPS *lmp, int /*narg*/, char **arg) : datamask_modify = ALL_MASK; kokkosable = 0; + forward_comm_device = 0; copymode = 0; } diff --git a/src/fix.h b/src/fix.h index 07a222063c..1a28e5c924 100644 --- a/src/fix.h +++ b/src/fix.h @@ -107,6 +107,7 @@ class Fix : protected Pointers { // KOKKOS host/device flag and data masks int kokkosable; // 1 if Kokkos fix + int forward_comm_device; // 1 if forward comm on Device ExecutionSpace execution_space; unsigned int datamask_read,datamask_modify;