diff --git a/doc/src/Commands_pair.rst b/doc/src/Commands_pair.rst index f902b40b29..11334ea2d1 100644 --- a/doc/src/Commands_pair.rst +++ b/doc/src/Commands_pair.rst @@ -194,7 +194,7 @@ OPT. * :doc:`lubricateU/poly ` * :doc:`mdpd ` * :doc:`mdpd/rhosum ` - * :doc:`meam ` + * :doc:`meam (k) ` * :doc:`meam/spline (o) ` * :doc:`meam/sw/spline ` * :doc:`mesocnt ` diff --git a/doc/src/package.rst b/doc/src/package.rst index 8f9a65b2cc..6ff34515cf 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -71,7 +71,7 @@ Syntax *no_affinity* values = none *kokkos* args = keyword value ... zero or more keyword/value pairs may be appended - keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* *comm/pair/forward* *comm/fix/forward* or *comm/reverse* or *gpu/aware* or *pair/only* + keywords = *neigh* or *neigh/qeq* or *neigh/thread* or *newton* or *binsize* or *comm* or *comm/exchange* or *comm/forward* *comm/pair/forward* *comm/fix/forward* or *comm/reverse* or *comm/pair/reverse* or *gpu/aware* or *pair/only* *neigh* value = *full* or *half* full = full neighbor list half = half neighbor list built in thread-safe manner @@ -96,6 +96,7 @@ Syntax *comm/pair/forward* value = *no* or *device* *comm/fix/forward* value = *no* or *device* *comm/reverse* value = *no* or *host* or *device* + *comm/pair/reverse* value = *no* or *device* no = perform communication pack/unpack in non-KOKKOS mode host = perform pack/unpack on host (e.g. with OpenMP threading) device = perform pack/unpack on device (e.g. on GPU) @@ -500,7 +501,7 @@ rule of thumb may give too large a binsize and the default should be overridden with a smaller value. The *comm* and *comm/exchange* and *comm/forward* and *comm/pair/forward* -and *comm/fix/forward* and comm/reverse* +and *comm/fix/forward* and *comm/reverse* and *comm/pair/reverse* keywords determine whether the host or device performs the packing and unpacking of data when communicating per-atom data between processors. "Exchange" communication happens only on timesteps that neighbor lists @@ -521,9 +522,16 @@ packing/unpacking data for the communication. A value of *host* means to use the host, typically a multi-core CPU, and perform the packing/unpacking in parallel with threads. A value of *device* means to use the device, typically a GPU, to perform the packing/unpacking -operation. If a value of *host* is used for the *comm/pair/forward* or -*comm/fix/forward* keyword, it will be automatically be changed to *no* -since these keywords don't support *host* mode. +operation. + +For the *comm/pair/forward* or *comm/fix/forward* or *comm/pair/reverse* +keywords, if a value of *host* is used it will be automatically +be changed to *no* since these keywords don't support *host* mode. The +value of *no* will also always be used when running on the CPU, i.e. setting +the value to *device* will have no effect if the pair/fix style is +running on the CPU. For the *comm/fix/forward* or *comm/pair/reverse* +keywords, not all styles support *device* mode and in that case will run +in *no* mode instead. The optimal choice for these keywords depends on the input script and the hardware used. The *no* value is useful for verifying that the diff --git a/doc/src/pair_meam.rst b/doc/src/pair_meam.rst index afa89e10ca..eac8449b21 100644 --- a/doc/src/pair_meam.rst +++ b/doc/src/pair_meam.rst @@ -1,8 +1,11 @@ .. index:: pair_style meam +.. index:: pair_style meam/kk pair_style meam command ========================= +Accelerator Variants: *meam/kk* + Syntax """""" @@ -347,6 +350,12 @@ Most published MEAM parameter sets use the default values *attrac* = *repulse* = Setting *repuls* = *attrac* = *delta* corresponds to the form used in several recent published MEAM parameter sets, such as :ref:`(Valone) ` +---------- + +.. include:: accel_styles.rst + +---------- + .. note:: The default form of the *erose* expression in LAMMPS was corrected diff --git a/src/KOKKOS/Install.sh b/src/KOKKOS/Install.sh index 952f21e403..314205ea3e 100755 --- a/src/KOKKOS/Install.sh +++ b/src/KOKKOS/Install.sh @@ -182,6 +182,13 @@ action kokkos_base.h action kokkos_base_fft.h fft3d.h action kokkos_few.h action kokkos_type.h +action meam_kokkos.h meam.h +action meam_dens_final_kokkos.h meam_dens_final.cpp +action meam_dens_init_kokkos.h meam_dens_init.cpp +action meam_force_kokkos.h meam_force.cpp +action meam_funcs_kokkos.h meam_funcs.cpp +action meam_impl_kokkos.h meam_impl.cpp +action meam_setup_done_kokkos.h meam_setup_done.cpp action memory_kokkos.h action modify_kokkos.cpp action modify_kokkos.h @@ -207,13 +214,6 @@ action nbin_ssa_kokkos.cpp nbin_ssa.cpp action nbin_ssa_kokkos.h nbin_ssa.h action math_special_kokkos.cpp action math_special_kokkos.h -action meam_setup_done_kokkos.h meam_setup_done.cpp -action meam_kokkos.h meam.h -action meam_impl_kokkos.h meam_impl.cpp -action meam_funcs_kokkos.h meam_funcs.cpp -action meam_force_kokkos.h meam_force.cpp -action meam_dens_init_kokkos.h meam_dens_init.cpp -action meam_dens_final_kokkos.h meam_dens_final.cpp action min_cg_kokkos.cpp action min_cg_kokkos.h action min_kokkos.cpp @@ -294,8 +294,8 @@ action pair_lj_gromacs_kokkos.cpp pair_lj_gromacs.cpp action pair_lj_gromacs_kokkos.h pair_lj_gromacs.h action pair_lj_sdk_kokkos.cpp pair_lj_sdk.cpp action pair_lj_sdk_kokkos.h pair_lj_sdk.h -action pair_meam_kokkos.h pair_meam.h action pair_meam_kokkos.cpp pair_meam.cpp +action pair_meam_kokkos.h pair_meam.h action pair_morse_kokkos.cpp action pair_morse_kokkos.h action pair_multi_lucy_rx_kokkos.cpp pair_multi_lucy_rx.cpp diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index 4883838273..fea5864225 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -109,6 +109,7 @@ void CommKokkos::init() exchange_comm_classic = lmp->kokkos->exchange_comm_classic; forward_comm_classic = lmp->kokkos->forward_comm_classic; forward_pair_comm_classic = lmp->kokkos->forward_pair_comm_classic; + reverse_pair_comm_classic = lmp->kokkos->reverse_pair_comm_classic; forward_fix_comm_classic = lmp->kokkos->forward_fix_comm_classic; reverse_comm_classic = lmp->kokkos->reverse_comm_classic; exchange_comm_on_host = lmp->kokkos->exchange_comm_on_host; @@ -478,12 +479,13 @@ void CommKokkos::forward_comm_device(Pair *pair) int nsize = pair->comm_forward; KokkosBase* pairKKBase = dynamic_cast(pair); + int nmax = max_buf_pair; for (iswap = 0; iswap < nswap; iswap++) { - int n = MAX(max_buf_pair,nsize*sendnum[iswap]); - n = MAX(n,nsize*recvnum[iswap]); - if (n > max_buf_pair) - grow_buf_pair(n); + nmax = MAX(nmax,nsize*sendnum[iswap]); + nmax = MAX(nmax,nsize*recvnum[iswap]); } + if (nmax > max_buf_pair) + grow_buf_pair(nmax); for (iswap = 0; iswap < nswap; iswap++) { @@ -545,8 +547,76 @@ void CommKokkos::grow_buf_fix(int n) { void CommKokkos::reverse_comm(Pair *pair) { - k_sendlist.sync(); - CommBrick::reverse_comm(pair); + if (pair->execution_space == Host || !pair->reverse_comm_device || reverse_pair_comm_classic) { + k_sendlist.sync(); + CommBrick::reverse_comm(pair); + } else { + k_sendlist.sync(); + reverse_comm_device(pair); + } +} + +template +void CommKokkos::reverse_comm_device(Pair *pair) +{ + int iswap,n; + MPI_Request request; + DAT::tdual_xfloat_1d k_buf_tmp; + + KokkosBase* pairKKBase = dynamic_cast(pair); + + int nsize = MAX(pair->comm_reverse,pair->comm_reverse_off); + + int nmax = max_buf_pair; + for (iswap = 0; iswap < nswap; iswap++) { + nmax = MAX(nmax,nsize*sendnum[iswap]); + nmax = MAX(nmax,nsize*recvnum[iswap]); + } + if (nmax > max_buf_pair) + grow_buf_pair(nmax); + + for (iswap = nswap-1; iswap >= 0; iswap--) { + + // pack buffer + + n = pairKKBase->pack_reverse_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send_pair); + DeviceType().fence(); + + // exchange with another proc + // if self, set recv buffer to send buffer + + double* buf_send_pair; + double* buf_recv_pair; + if (lmp->kokkos->gpu_aware_flag) { + buf_send_pair = k_buf_send_pair.view().data(); + buf_recv_pair = k_buf_recv_pair.view().data(); + } else { + k_buf_send_pair.modify(); + k_buf_send_pair.sync(); + buf_send_pair = k_buf_send_pair.h_view.data(); + buf_recv_pair = k_buf_recv_pair.h_view.data(); + } + + if (sendproc[iswap] != me) { + if (sendnum[iswap]) + MPI_Irecv(buf_recv_pair,nsize*sendnum[iswap],MPI_DOUBLE,sendproc[iswap],0,world,&request); + if (recvnum[iswap]) + MPI_Send(buf_send_pair,n,MPI_DOUBLE,recvproc[iswap],0,world); + if (sendnum[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); + + if (!lmp->kokkos->gpu_aware_flag) { + k_buf_recv_pair.modify(); + k_buf_recv_pair.sync(); + } + k_buf_tmp = k_buf_recv_pair; + } else k_buf_tmp = k_buf_send_pair; + + // unpack buffer + + pairKKBase->unpack_reverse_comm_kokkos(sendnum[iswap],k_sendlist, + iswap,k_buf_tmp); + DeviceType().fence(); + } } void CommKokkos::forward_comm(Dump *dump) diff --git a/src/KOKKOS/comm_kokkos.h b/src/KOKKOS/comm_kokkos.h index 3fcce82e2c..80736c1f92 100644 --- a/src/KOKKOS/comm_kokkos.h +++ b/src/KOKKOS/comm_kokkos.h @@ -27,6 +27,7 @@ class CommKokkos : public CommBrick { bool exchange_comm_classic; bool forward_comm_classic; bool forward_pair_comm_classic; + bool reverse_pair_comm_classic; bool forward_fix_comm_classic; bool reverse_comm_classic; bool exchange_comm_on_host; @@ -58,6 +59,7 @@ class CommKokkos : public CommBrick { template void forward_comm_device(int dummy); template void reverse_comm_device(); template void forward_comm_device(Pair *pair); + template void reverse_comm_device(Pair *pair); template void forward_comm_device(Fix *fix, int size=0); template void exchange_device(); template void borders_device(); diff --git a/src/KOKKOS/kokkos.cpp b/src/KOKKOS/kokkos.cpp index 18f05fa281..3ecce0d75d 100644 --- a/src/KOKKOS/kokkos.cpp +++ b/src/KOKKOS/kokkos.cpp @@ -91,6 +91,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) exchange_comm_changed = 0; forward_comm_changed = 0; forward_pair_comm_changed = 0; + reverse_pair_comm_changed = 0; forward_fix_comm_changed = 0; reverse_comm_changed = 0; @@ -239,7 +240,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 0; exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; - forward_pair_comm_classic = forward_fix_comm_classic = 0; + forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else { @@ -253,7 +254,7 @@ KokkosLMP::KokkosLMP(LAMMPS *lmp, int narg, char **arg) : Pointers(lmp) newtonflag = 1; exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; - forward_pair_comm_classic = forward_fix_comm_classic = 1; + forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } @@ -394,17 +395,17 @@ void KokkosLMP::accelerator(int narg, char **arg) if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) { exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 1; - forward_pair_comm_classic = forward_fix_comm_classic = 1; + forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else if (strcmp(arg[iarg+1],"host") == 0) { exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; - forward_pair_comm_classic = forward_fix_comm_classic = 1; + forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 1; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 1; } else if (strcmp(arg[iarg+1],"device") == 0) { exchange_comm_classic = forward_comm_classic = reverse_comm_classic = 0; - forward_pair_comm_classic = forward_fix_comm_classic = 0; + forward_pair_comm_classic = reverse_pair_comm_classic = forward_fix_comm_classic = 0; exchange_comm_on_host = forward_comm_on_host = reverse_comm_on_host = 0; } else error->all(FLERR,"Illegal package kokkos command"); @@ -441,6 +442,14 @@ void KokkosLMP::accelerator(int narg, char **arg) else error->all(FLERR,"Illegal package kokkos command"); forward_pair_comm_changed = 0; iarg += 2; + } else if (strcmp(arg[iarg],"comm/pair/reverse") == 0) { + if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); + if (strcmp(arg[iarg+1],"no") == 0) reverse_pair_comm_classic = 1; + else if (strcmp(arg[iarg+1],"host") == 0) reverse_pair_comm_classic = 1; + else if (strcmp(arg[iarg+1],"device") == 0) reverse_pair_comm_classic = 0; + else error->all(FLERR,"Illegal package kokkos command"); + reverse_pair_comm_changed = 0; + iarg += 2; } else if (strcmp(arg[iarg],"comm/fix/forward") == 0) { if (iarg+2 > narg) error->all(FLERR,"Illegal package kokkos command"); if (strcmp(arg[iarg+1],"no") == 0) forward_fix_comm_classic = 1; @@ -515,6 +524,10 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_pair_comm_classic = 1; forward_pair_comm_changed = 1; } + if (reverse_pair_comm_classic == 0) { + reverse_pair_comm_classic = 1; + reverse_pair_comm_changed = 1; + } if (forward_fix_comm_classic == 0) { forward_fix_comm_classic = 1; forward_fix_comm_changed = 1; @@ -540,6 +553,10 @@ void KokkosLMP::accelerator(int narg, char **arg) forward_pair_comm_classic = 0; forward_pair_comm_changed = 0; } + if (reverse_pair_comm_changed) { + reverse_pair_comm_classic = 0; + reverse_pair_comm_changed = 0; + } if (forward_fix_comm_changed) { forward_fix_comm_classic = 0; forward_fix_comm_changed = 0; diff --git a/src/KOKKOS/kokkos.h b/src/KOKKOS/kokkos.h index 07d172e524..1cecd69c5f 100644 --- a/src/KOKKOS/kokkos.h +++ b/src/KOKKOS/kokkos.h @@ -30,6 +30,7 @@ class KokkosLMP : protected Pointers { int exchange_comm_classic; int forward_comm_classic; int forward_pair_comm_classic; + int reverse_pair_comm_classic; int forward_fix_comm_classic; int reverse_comm_classic; int exchange_comm_on_host; @@ -38,6 +39,7 @@ class KokkosLMP : protected Pointers { int exchange_comm_changed; int forward_comm_changed; int forward_pair_comm_changed; + int reverse_pair_comm_changed; int forward_fix_comm_changed; int reverse_comm_changed; int nthreads,ngpus; diff --git a/src/KOKKOS/kokkos_base.h b/src/KOKKOS/kokkos_base.h index f18d55eec2..c593f0ae60 100644 --- a/src/KOKKOS/kokkos_base.h +++ b/src/KOKKOS/kokkos_base.h @@ -29,6 +29,10 @@ class KokkosBase { int, int *) {return 0;}; virtual void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {} + virtual int pack_reverse_comm_kokkos(int, int, DAT::tdual_xfloat_1d &) {return 0;}; + virtual void unpack_reverse_comm_kokkos(int, DAT::tdual_int_2d, + int, DAT::tdual_xfloat_1d &) {} + // Fix virtual int pack_forward_comm_fix_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d &, diff --git a/src/KOKKOS/math_special_kokkos.cpp b/src/KOKKOS/math_special_kokkos.cpp index 89f6586581..eca64fbb4e 100644 --- a/src/KOKKOS/math_special_kokkos.cpp +++ b/src/KOKKOS/math_special_kokkos.cpp @@ -1,11 +1,202 @@ // clang-format off - #include "math_special_kokkos.h" + #include -#include +#include // IWYU pragma: keep +#include using namespace LAMMPS_NS; +static constexpr int nmaxfactorial = 167; + +/* ---------------------------------------------------------------------- + factorial n table, size nmaxfactorial+1 +------------------------------------------------------------------------- */ + +static const double nfac_table[] = { + 1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880, + 3628800, + 39916800, + 479001600, + 6227020800, + 87178291200, + 1307674368000, + 20922789888000, + 355687428096000, + 6.402373705728e+15, + 1.21645100408832e+17, + 2.43290200817664e+18, + 5.10909421717094e+19, + 1.12400072777761e+21, + 2.5852016738885e+22, + 6.20448401733239e+23, + 1.5511210043331e+25, + 4.03291461126606e+26, + 1.08888694504184e+28, + 3.04888344611714e+29, + 8.8417619937397e+30, + 2.65252859812191e+32, + 8.22283865417792e+33, + 2.63130836933694e+35, + 8.68331761881189e+36, + 2.95232799039604e+38, + 1.03331479663861e+40, + 3.71993326789901e+41, + 1.37637530912263e+43, + 5.23022617466601e+44, + 2.03978820811974e+46, + 8.15915283247898e+47, + 3.34525266131638e+49, + 1.40500611775288e+51, + 6.04152630633738e+52, + 2.65827157478845e+54, + 1.1962222086548e+56, + 5.50262215981209e+57, + 2.58623241511168e+59, + 1.24139155925361e+61, + 6.08281864034268e+62, + 3.04140932017134e+64, + 1.55111875328738e+66, + 8.06581751709439e+67, + 4.27488328406003e+69, + 2.30843697339241e+71, + 1.26964033536583e+73, + 7.10998587804863e+74, + 4.05269195048772e+76, + 2.35056133128288e+78, + 1.3868311854569e+80, + 8.32098711274139e+81, + 5.07580213877225e+83, + 3.14699732603879e+85, + 1.98260831540444e+87, + 1.26886932185884e+89, + 8.24765059208247e+90, + 5.44344939077443e+92, + 3.64711109181887e+94, + 2.48003554243683e+96, + 1.71122452428141e+98, + 1.19785716699699e+100, + 8.50478588567862e+101, + 6.12344583768861e+103, + 4.47011546151268e+105, + 3.30788544151939e+107, + 2.48091408113954e+109, + 1.88549470166605e+111, + 1.45183092028286e+113, + 1.13242811782063e+115, + 8.94618213078297e+116, + 7.15694570462638e+118, + 5.79712602074737e+120, + 4.75364333701284e+122, + 3.94552396972066e+124, + 3.31424013456535e+126, + 2.81710411438055e+128, + 2.42270953836727e+130, + 2.10775729837953e+132, + 1.85482642257398e+134, + 1.65079551609085e+136, + 1.48571596448176e+138, + 1.3520015276784e+140, + 1.24384140546413e+142, + 1.15677250708164e+144, + 1.08736615665674e+146, + 1.03299784882391e+148, + 9.91677934870949e+149, + 9.61927596824821e+151, + 9.42689044888324e+153, + 9.33262154439441e+155, + 9.33262154439441e+157, + 9.42594775983835e+159, + 9.61446671503512e+161, + 9.90290071648618e+163, + 1.02990167451456e+166, + 1.08139675824029e+168, + 1.14628056373471e+170, + 1.22652020319614e+172, + 1.32464181945183e+174, + 1.44385958320249e+176, + 1.58824554152274e+178, + 1.76295255109024e+180, + 1.97450685722107e+182, + 2.23119274865981e+184, + 2.54355973347219e+186, + 2.92509369349301e+188, + 3.3931086844519e+190, + 3.96993716080872e+192, + 4.68452584975429e+194, + 5.5745857612076e+196, + 6.68950291344912e+198, + 8.09429852527344e+200, + 9.8750442008336e+202, + 1.21463043670253e+205, + 1.50614174151114e+207, + 1.88267717688893e+209, + 2.37217324288005e+211, + 3.01266001845766e+213, + 3.8562048236258e+215, + 4.97450422247729e+217, + 6.46685548922047e+219, + 8.47158069087882e+221, + 1.118248651196e+224, + 1.48727070609069e+226, + 1.99294274616152e+228, + 2.69047270731805e+230, + 3.65904288195255e+232, + 5.01288874827499e+234, + 6.91778647261949e+236, + 9.61572319694109e+238, + 1.34620124757175e+241, + 1.89814375907617e+243, + 2.69536413788816e+245, + 3.85437071718007e+247, + 5.5502938327393e+249, + 8.04792605747199e+251, + 1.17499720439091e+254, + 1.72724589045464e+256, + 2.55632391787286e+258, + 3.80892263763057e+260, + 5.71338395644585e+262, + 8.62720977423323e+264, + 1.31133588568345e+267, + 2.00634390509568e+269, + 3.08976961384735e+271, + 4.78914290146339e+273, + 7.47106292628289e+275, + 1.17295687942641e+278, + 1.85327186949373e+280, + 2.94670227249504e+282, + 4.71472363599206e+284, + 7.59070505394721e+286, + 1.22969421873945e+289, + 2.0044015765453e+291, + 3.28721858553429e+293, + 5.42391066613159e+295, + 9.00369170577843e+297, + 1.503616514865e+300, // nmaxfactorial = 167 +}; + +/* ---------------------------------------------------------------------- + factorial n vial lookup from precomputed table +------------------------------------------------------------------------- */ + +double MathSpecialKokkos::factorial(const int n) +{ + if (n < 0 || n > nmaxfactorial) + return std::numeric_limits::quiet_NaN(); + + return nfac_table[n]; +} + + /* Library libcerf: * Compute complex error functions, based on a new implementation of * Faddeeva's w_of_z. Also provide Dawson and Voigt functions. @@ -477,59 +668,3 @@ double MathSpecialKokkos::erfcx_y100(const double y100) return 1.0; } /* erfcx_y100 */ -/* optimizer friendly implementation of exp2(x). - * - * strategy: - * - * split argument into an integer part and a fraction: - * ipart = floor(x+0.5); - * fpart = x - ipart; - * - * compute exp2(ipart) from setting the ieee754 exponent - * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[ - * - * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart) - */ - -/* IEEE 754 double precision floating point data manipulation */ -typedef union -{ - double f; - uint64_t u; - struct {int32_t i0,i1;} s; -} udi_t; - -static const double fm_exp2_q[] = { -/* 1.00000000000000000000e0, */ - 2.33184211722314911771e2, - 4.36821166879210612817e3 -}; -static const double fm_exp2_p[] = { - 2.30933477057345225087e-2, - 2.02020656693165307700e1, - 1.51390680115615096133e3 -}; - -double MathSpecialKokkos::exp2_x86(double x) -{ - double ipart, fpart, px, qx; - udi_t epart; - - ipart = floor(x+0.5); - fpart = x - ipart; - epart.s.i0 = 0; - epart.s.i1 = (((int) ipart) + 1023) << 20; - - x = fpart*fpart; - - px = fm_exp2_p[0]; - px = px*x + fm_exp2_p[1]; - qx = x + fm_exp2_q[0]; - px = px*x + fm_exp2_p[2]; - qx = qx*x + fm_exp2_q[1]; - - px = px * fpart; - - x = 1.0 + 2.0*(px/(qx-px)); - return epart.f*x; -} diff --git a/src/KOKKOS/math_special_kokkos.h b/src/KOKKOS/math_special_kokkos.h index f2d23a65eb..0264cb4134 100644 --- a/src/KOKKOS/math_special_kokkos.h +++ b/src/KOKKOS/math_special_kokkos.h @@ -1,4 +1,3 @@ -// clang-format off /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator https://www.lammps.org/, Sandia National Laboratories @@ -15,98 +14,240 @@ #ifndef LMP_MATH_SPECIAL_KOKKOS_H #define LMP_MATH_SPECIAL_KOKKOS_H -#include #include "kokkos_type.h" +#include namespace LAMMPS_NS { namespace MathSpecialKokkos { + /*! Fast tabulated factorial function + * + * This function looks up pre-computed factorial values for arguments of n = 0 + * to a maximum of 167, which is the maximal value representable by a double + * precision floating point number. For other values of n a NaN value is returned. + * + * \param n argument (valid: 0 <= n <= 167) + * \return value of n! as double precision number or NaN */ + + extern double factorial(const int n); + + /* optimizer friendly implementation of exp2(x). + * + * strategy: + * + * split argument into an integer part and a fraction: + * ipart = floor(x+0.5); + * fpart = x - ipart; + * + * compute exp2(ipart) from setting the ieee754 exponent + * compute exp2(fpart) using a pade' approximation for x in [-0.5;0.5[ + * + * the result becomes: exp2(x) = exp2(ipart) * exp2(fpart) + */ + + /* IEEE 754 double precision floating point data manipulation */ + typedef union + { + double f; + uint64_t u; + struct {int32_t i0,i1;} s; + } udi_t; + + /* double precision constants */ + #define FM_DOUBLE_LOG2OFE 1.4426950408889634074 + + /*! Fast implementation of 2^x without argument checks for little endian CPUs + * + * This function implements an optimized version of pow(2.0, x) that does not + * check for valid arguments and thus may only be used where arguments are well + * behaved. The implementation makes assumptions about the layout of double + * precision floating point numbers in memory and thus will only work on little + * endian CPUs. If little endian cannot be safely detected, the result of + * calling pow(2.0, x) will be returned. This function also is the basis for + * the fast exponential fm_exp(x). + * + * \param x argument + * \return value of 2^x as double precision number */ + + KOKKOS_INLINE_FUNCTION + static double exp2_x86(double x) + { + #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + double ipart, fpart, px, qx; + udi_t epart; + + const double fm_exp2_q[2] = { + /* 1.00000000000000000000e0, */ + 2.33184211722314911771e2, + 4.36821166879210612817e3 + }; + const double fm_exp2_p[3] = { + 2.30933477057345225087e-2, + 2.02020656693165307700e1, + 1.51390680115615096133e3 + }; + + ipart = floor(x+0.5); + fpart = x - ipart; + epart.s.i0 = 0; + epart.s.i1 = (((int) ipart) + 1023) << 20; + + x = fpart*fpart; + + px = fm_exp2_p[0]; + px = px*x + fm_exp2_p[1]; + qx = x + fm_exp2_q[0]; + px = px*x + fm_exp2_p[2]; + qx = qx*x + fm_exp2_q[1]; + + px = px * fpart; + + x = 1.0 + 2.0*(px/(qx-px)); + return epart.f*x; + #else + return pow(2.0, x); + #endif + } + + /*! Fast implementation of exp(x) for little endian CPUs + * + * This function implements an optimized version of exp(x) for little endian CPUs. + * It calls the exp2_x86(x) function with a suitable prefactor to x to return exp(x). + * The implementation makes assumptions about the layout of double + * precision floating point numbers in memory and thus will only work on little + * endian CPUs. If little endian cannot be safely detected, the result of + * calling the exp(x) implementation in the standard math library will be returned. + * + * \param x argument + * \return value of e^x as double precision number */ + + KOKKOS_INLINE_FUNCTION + static double fm_exp(double x) + { + #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) + if (x < -1022.0/FM_DOUBLE_LOG2OFE) return 0; + if (x > 1023.0/FM_DOUBLE_LOG2OFE) return INFINITY; + return exp2_x86(FM_DOUBLE_LOG2OFE * x); + #else + return ::exp(x); + #endif + } + // support function for scaled error function complement extern double erfcx_y100(const double y100); - // fast 2**x function without argument checks for little endian CPUs - extern double exp2_x86(double x); - - // scaled error function complement exp(x*x)*erfc(x) for coul/long styles + /*! Fast scaled error function complement exp(x*x)*erfc(x) for coul/long styles + * + * This is a portable fast implementation of exp(x*x)*erfc(x) that can be used + * in coul/long pair styles as a replacement for the polynomial expansion that + * is/was widely used. Unlike the polynomial expansion, that is only accurate + * at the level of single precision floating point it provides full double precision + * accuracy, but at comparable speed (unlike the erfc() implementation shipped + * with GNU standard math library). + * + * \param x argument + * \return value of e^(x*x)*erfc(x) */ static inline double my_erfcx(const double x) { - if (x >= 0.0) return erfcx_y100(400.0/(4.0+x)); - else return 2.0*exp(x*x) - erfcx_y100(400.0/(4.0-x)); + if (x >= 0.0) + return erfcx_y100(400.0 / (4.0 + x)); + else + return 2.0 * exp(x * x) - erfcx_y100(400.0 / (4.0 - x)); } - // exp(-x*x) for coul/long styles + /*! Fast implementation of exp(-x*x) for little endian CPUs for coul/long styles + * + * This function implements an optimized version of exp(-x*x) based on exp2_x86() + * for use with little endian CPUs. If little endian cannot be safely detected, + * the result of calling the exp(-x*x) implementation in the standard math + * library will be returned. + * + * \param x argument + * \return value of e^(-x*x) as double precision number */ static inline double expmsq(double x) { x *= x; - x *= 1.4426950408889634074; // log_2(e) -#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + x *= 1.4426950408889634074; // log_2(e) +#if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ return (x < 1023.0) ? exp2_x86(-x) : 0.0; #else return (x < 1023.0) ? exp2(-x) : 0.0; #endif } -#define FM_DOUBLE_LOG2OFE 1.4426950408889634074 - // fast e**x function for little endian CPUs, falls back to libc on other platforms - KOKKOS_INLINE_FUNCTION - static double fm_exp(double x) - { -#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) - return exp2_x86(FM_DOUBLE_LOG2OFE * x); -#else - return ::exp(x); -#endif - } + /*! Fast inline version of pow(x, 2.0) + * + * \param x argument + * \return x*x */ - // x**2, use instead of pow(x,2.0) KOKKOS_INLINE_FUNCTION - static double square(const double &x) { return x*x; } + static double square(const double &x) { return x * x; } + + /*! Fast inline version of pow(x, 3.0) + * + * \param x argument + * \return x*x */ - // x**3, use instead of pow(x,3.0) KOKKOS_INLINE_FUNCTION - static double cube(const double &x) { return x*x*x; } + static double cube(const double &x) { return x * x * x; } + + /* Fast inline version of pow(-1.0, n) + * + * \param n argument (integer) + * \return -1 if n is odd, 1.0 if n is even */ - // return -1.0 for odd n, 1.0 for even n, like pow(-1.0,n) KOKKOS_INLINE_FUNCTION static double powsign(const int n) { return (n & 1) ? -1.0 : 1.0; } - // optimized version of pow(x,n) with n being integer - // up to 10x faster than pow(x,y) + /* Fast inline version of pow(x,n) for integer n + * + * This is a version of pow(x,n) optimized for n being integer. + * Speedups of up to 10x faster than pow(x,y) have been measured. + * + * \param n argument (integer) + * \return value of x^n */ KOKKOS_INLINE_FUNCTION - static double powint(const double &x, const int n) { - double yy,ww; + static double powint(const double &x, const int n) + { + double yy, ww; if (x == 0.0) return 0.0; int nn = (n > 0) ? n : -n; ww = x; - for (yy = 1.0; nn != 0; nn >>= 1, ww *=ww) + for (yy = 1.0; nn != 0; nn >>= 1, ww *= ww) if (nn & 1) yy *= ww; - return (n > 0) ? yy : 1.0/yy; + return (n > 0) ? yy : 1.0 / yy; } - // optimized version of (sin(x)/x)**n with n being a _positive_ integer + /* Fast inline version of (sin(x)/x)^n as used by PPPM kspace styles + * + * This is an optimized function to compute (sin(x)/x)^n as frequently used by PPPM. + * + * \param n argument (integer). Expected to be positive. + * \return value of (sin(x)/x)^n */ KOKKOS_INLINE_FUNCTION - static double powsinxx(const double &x, int n) { - double yy,ww; + static double powsinxx(const double &x, int n) + { + double yy, ww; if (x == 0.0) return 1.0; - ww = sin(x)/x; + ww = sin(x) / x; - for (yy = 1.0; n != 0; n >>= 1, ww *=ww) + for (yy = 1.0; n != 0; n >>= 1, ww *= ww) if (n & 1) yy *= ww; return yy; } -} -} +} // namespace MathSpecialKokkos +} // namespace LAMMPS_NS #endif diff --git a/src/KOKKOS/meam_dens_final_kokkos.h b/src/KOKKOS/meam_dens_final_kokkos.h index b44a8af3bd..4a212d3597 100644 --- a/src/KOKKOS/meam_dens_final_kokkos.h +++ b/src/KOKKOS/meam_dens_final_kokkos.h @@ -3,179 +3,148 @@ using namespace LAMMPS_NS; +/* ---------------------------------------------------------------------- */ + template void -MEAMKokkos::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom, double* eng_vdwl, - typename ArrayTypes::t_efloat_1d eatom, int ntype, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, int& errorflag) +MEAMKokkos::meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom, + typename ArrayTypes::t_efloat_1d eatom, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_int_2d d_scale, int& errorflag, EV_FLOAT &ev_all) { EV_FLOAT ev; - ev.evdwl = *eng_vdwl; this->eflag_either = eflag_either; this->eflag_global = eflag_global; this->eflag_atom = eflag_atom; this->d_eatom = eatom; this->ntype = ntype; this->type = type; - this->fmap = fmap; + this->d_map = d_map; + this->d_scale = d_scale; - // Complete the calculation of density + Kokkos::deep_copy(d_errorflag,0); + // Complete the calculation of density + + copymode = 1; Kokkos::parallel_reduce(Kokkos::RangePolicy(0,nlocal),*this,ev); - *eng_vdwl = ev.evdwl; + ev_all.evdwl =+ ev.evdwl; + copymode = 0; + + auto h_errorflag = Kokkos::create_mirror_view_and_copy(LMPHostType(),d_errorflag); + errorflag = h_errorflag(); } +/* ---------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void MEAMKokkos::operator()(TagMEAMDensFinal, const int &i, EV_FLOAT& ev) const { - lattice_t elti; - int m; - int errorflag; F_FLOAT rhob, G, dG, Gbar, dGbar, gam, shp[3], Z; - F_FLOAT B, denom, rho_bkgd; + F_FLOAT denom, rho_bkgd, Fl; + double scaleii; - // may or may not be legal - elti = static_cast(fmap[type[i]]); - if (elti >= 0) { - d_rho1[i] = 0.0; - d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i]; - d_rho3[i] = 0.0; - for (m = 0; m < 3; m++) { - d_rho1[i] = d_rho1[i] + d_arho1(i,m) * d_arho1(i,m); - d_rho3[i] = d_rho3[i] - 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m); - } - for (m = 0; m < 6; m++) { - d_rho2[i] = d_rho2[i] + v2D[m] * d_arho2(i,m) * d_arho2(i,m); - } - for (m = 0; m < 10; m++) { - d_rho3[i] = d_rho3[i] + v3D[m] * d_arho3(i,m) * d_arho3(i,m); - } + int elti = d_map[type[i]]; + if (elti >= 0) { + scaleii = d_scale(type[i],type[i]); + d_rho1[i] = 0.0; + d_rho2[i] = -1.0 / 3.0 * d_arho2b[i] * d_arho2b[i]; + d_rho3[i] = 0.0; + for (int m = 0; m < 3; m++) { + d_rho1[i] += d_arho1(i,m) * d_arho1(i,m); + d_rho3[i] -= 3.0 / 5.0 * d_arho3b(i,m) * d_arho3b(i,m); + } + for (int m = 0; m < 6; m++) + d_rho2[i] += v2D[m] * d_arho2(i,m) * d_arho2(i,m); + for (int m = 0; m < 10; m++) + d_rho3[i] += v3D[m] * d_arho3(i,m) * d_arho3(i,m); - if (d_rho0[i] > 0.0) { - if (this->ialloy == 1) { - d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0)); - d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1)); - d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2)); - } else if (this->ialloy == 2) { - d_t_ave(i,0) = this->t1_meam[elti]; - d_t_ave(i,1) = this->t2_meam[elti]; - d_t_ave(i,2) = this->t3_meam[elti]; - } else { - d_t_ave(i,0) = d_t_ave(i,0) / d_rho0[i]; - d_t_ave(i,1) = d_t_ave(i,1) / d_rho0[i]; - d_t_ave(i,2) = d_t_ave(i,2) / d_rho0[i]; - } - } - - d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i]; - - if (d_rho0[i] > 0.0) { - d_gamma[i] = d_gamma[i] / (d_rho0[i] * d_rho0[i]); - } - - // need to double check, the analogous function is - // Z = get_Zij(this->lattce_meam[elti][elti]); in the non-KOKKOS version? - Z = get_Zij(elti); - - G = G_gam(d_gamma[i], this->ibar_meam[elti], errorflag); - if (errorflag != 0) - { - //char str[128]; - //sprintf(str,"MEAMKokkos library error %d",errorflag); - //error->one(FLERR,str); - return; - } - get_shpfcn(this->lattce_meam[elti][elti], shp); - if (this->ibar_meam[elti] <= 0) { - Gbar = 1.0; - dGbar = 0.0; + if (d_rho0[i] > 0.0) { + if (ialloy == 1) { + d_t_ave(i,0) = fdiv_zero_kk(d_t_ave(i,0), d_tsq_ave(i,0)); + d_t_ave(i,1) = fdiv_zero_kk(d_t_ave(i,1), d_tsq_ave(i,1)); + d_t_ave(i,2) = fdiv_zero_kk(d_t_ave(i,2), d_tsq_ave(i,2)); + } else if (ialloy == 2) { + d_t_ave(i,0) = t1_meam[elti]; + d_t_ave(i,1) = t2_meam[elti]; + d_t_ave(i,2) = t3_meam[elti]; } else { - if (this->mix_ref_t == 1) { - gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z); - } else { - gam = (this->t1_meam[elti] * shp[0] + this->t2_meam[elti] * shp[1] + this->t3_meam[elti] * shp[2]) / - (Z * Z); - } - Gbar = G_gam(gam, this->ibar_meam[elti], errorflag); - } - d_rho[i] = d_rho0[i] * G; - - if (this->mix_ref_t == 1) { - if (this->ibar_meam[elti] <= 0) { - Gbar = 1.0; - dGbar = 0.0; - } else { - gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z); - Gbar = dG_gam(gam, this->ibar_meam[elti], dGbar); - } - rho_bkgd = this->rho0_meam[elti] * Z * Gbar; - } else { - if (this->bkgd_dyn == 1) { - rho_bkgd = this->rho0_meam[elti] * Z; - } else { - rho_bkgd = this->rho_ref_meam[elti]; - } - } - rhob = d_rho[i] / rho_bkgd; - denom = 1.0 / rho_bkgd; - - G = dG_gam(d_gamma[i], this->ibar_meam[elti], dG); - - d_dgamma1[i] = (G - 2 * dG * d_gamma[i]) * denom; - - if (!iszero_kk(d_rho0[i])) { - d_dgamma2[i] = (dG / d_rho0[i]) * denom; - } else { - d_dgamma2[i] = 0.0; - } - - // dgamma3 is nonzero only if we are using the "mixed" rule for - // computing t in the reference system (which is not correct, but - // included for backward compatibility - if (this->mix_ref_t == 1) { - d_dgamma3[i] = d_rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; - } else { - d_dgamma3[i] = 0.0; - } - - B = this->A_meam[elti] * this->Ec_meam[elti][elti]; - - if (!iszero_kk(rhob)) { - if (this->emb_lin_neg == 1 && rhob <= 0) { - d_frhop[i] = -B; - } else { - d_frhop[i] = B * (log(rhob) + 1.0); - } - if (eflag_either != 0) { - if (eflag_global != 0) { - if (this->emb_lin_neg == 1 && rhob <= 0) { - //*eng_vdwl = *eng_vdwl - B * rhob; - ev.evdwl = ev.evdwl - B * rhob; - } else { - //*eng_vdwl = *eng_vdwl + B * rhob * log(rhob); - ev.evdwl = ev.evdwl + B * rhob * log(rhob); - } - } - if (eflag_atom != 0) { - if (this->emb_lin_neg == 1 && rhob <= 0) { - d_eatom[i] = d_eatom[i] - B * rhob; - } else { - d_eatom[i] = d_eatom[i] + B * rhob * log(rhob); - } - } - } - } else { - if (this->emb_lin_neg == 1) { - d_frhop[i] = -B; - } else { - d_frhop[i] = B; - } + d_t_ave(i,0) /= d_rho0[i]; + d_t_ave(i,1) /= d_rho0[i]; + d_t_ave(i,2) /= d_rho0[i]; } } - if (errorflag) - { - //char str[128]; - //sprintf(str,"MEAMKokkos library error %d",errorflag); - //error->one(FLERR,str); + + d_gamma[i] = d_t_ave(i,0) * d_rho1[i] + d_t_ave(i,1) * d_rho2[i] + d_t_ave(i,2) * d_rho3[i]; + + if (d_rho0[i] > 0.0) + d_gamma[i] /= (d_rho0[i] * d_rho0[i]); + + Z = get_Zij(lattce_meam[elti][elti]); + + G = G_gam(d_gamma[i], ibar_meam[elti], d_errorflag()); + if (d_errorflag() != 0) + return; + + get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shp); + if (ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + if (mix_ref_t == 1) + gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z); + else + gam = (t1_meam[elti] * shp[0] + t2_meam[elti] * shp[1] + t3_meam[elti] * shp[2]) / + (Z * Z); + Gbar = G_gam(gam, ibar_meam[elti], d_errorflag()); } + d_rho[i] = d_rho0[i] * G; + + if (mix_ref_t == 1) { + if (ibar_meam[elti] <= 0) { + Gbar = 1.0; + dGbar = 0.0; + } else { + gam = (d_t_ave(i,0) * shp[0] + d_t_ave(i,1) * shp[1] + d_t_ave(i,2) * shp[2]) / (Z * Z); + Gbar = dG_gam(gam, ibar_meam[elti], dGbar); + } + rho_bkgd = rho0_meam[elti] * Z * Gbar; + } else { + if (bkgd_dyn == 1) + rho_bkgd = rho0_meam[elti] * Z; + else + rho_bkgd = rho_ref_meam[elti]; + } + rhob = d_rho[i] / rho_bkgd; + denom = 1.0 / rho_bkgd; + + G = dG_gam(d_gamma[i], ibar_meam[elti], dG); + + d_dgamma1[i] = (G - 2 * dG * d_gamma[i]) * denom; + + if (!iszero_kk(d_rho0[i])) + d_dgamma2[i] = (dG / d_rho0[i]) * denom; + else + d_dgamma2[i] = 0.0; + + // dgamma3 is nonzero only if we are using the "mixed" rule for + // computing t in the reference system (which is not correct, but + // included for backward compatibility + if (mix_ref_t == 1) + d_dgamma3[i] = d_rho0[i] * G * dGbar / (Gbar * Z * Z) * denom; + else + d_dgamma3[i] = 0.0; + + Fl = embedding(A_meam[elti], Ec_meam[elti][elti], rhob, d_frhop[i]); + + if (eflag_either) { + Fl *= scaleii; + if (eflag_global) { + ev.evdwl += Fl; + } + if (eflag_atom) { + d_eatom[i] += Fl; + } + } + } } + diff --git a/src/KOKKOS/meam_dens_init_kokkos.h b/src/KOKKOS/meam_dens_init_kokkos.h index c3dfdbbcb8..bcf0e7433e 100644 --- a/src/KOKKOS/meam_dens_init_kokkos.h +++ b/src/KOKKOS/meam_dens_init_kokkos.h @@ -1,41 +1,45 @@ #include "meam_kokkos.h" -#include "math_special.h" -#include "mpi.h" +#include "math_special_kokkos.h" using namespace LAMMPS_NS; using namespace MathSpecialKokkos; +/* ---------------------------------------------------------------------- */ + template template KOKKOS_INLINE_FUNCTION -void MEAMKokkos::operator()(TagMEAMDensInit, const int &i, EV_FLOAT &ev) const { +void MEAMKokkos::operator()(TagMEAMDensInit, const int &i) const { int ii, offsetval; ii = d_ilist_half[i]; offsetval = d_offset[i]; - // Compute screening function and derivatives + // compute screening function and derivatives this->template getscreen(ii, offsetval, x, d_numneigh_half, - d_numneigh_full, ntype, type, fmap); + d_numneigh_full, ntype, type, d_map); - // Calculate intermediate density terms to be communicated - this->template calc_rho1(ii, ntype, type, fmap, x, d_numneigh_half, offsetval); - ev.evdwl += d_numneigh_half[i]; + // calculate intermediate density terms to be communicated + this->template calc_rho1(ii, ntype, type, d_map, x, d_numneigh_half, offsetval); } +/* ---------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -void MEAMKokkos::operator()(TagMEAMInitialize, const int &i) const { - d_rho0[i] = 0.0; - d_arho2b[i] = 0.0; - d_arho1(i,0) = d_arho1(i,1) = d_arho1(i,2) = 0.0; - for (int j = 0; j < 6; j++) - d_arho2(i,j) = 0.0; - for (int j = 0; j < 10; j++) - d_arho3(i,j) = 0.0; - d_arho3b(i,0) = d_arho3b(i,1) = d_arho3b(i,2) = 0.0; - d_t_ave(i,0) = d_t_ave(i,1) = d_t_ave(i,2) = 0.0; - d_tsq_ave(i,0) = d_tsq_ave(i,1) = d_tsq_ave(i,2) = 0.0; +void MEAMKokkos::operator()(TagMEAMZero, const int &i) const { + d_rho0[i] = 0.0; + d_arho2b[i] = 0.0; + d_arho1(i,0) = d_arho1(i,1) = d_arho1(i,2) = 0.0; + for (int j = 0; j < 6; j++) + d_arho2(i,j) = 0.0; + for (int j = 0; j < 10; j++) + d_arho3(i,j) = 0.0; + d_arho3b(i,0) = d_arho3b(i,1) = d_arho3b(i,2) = 0.0; + d_t_ave(i,0) = d_t_ave(i,1) = d_t_ave(i,2) = 0.0; + d_tsq_ave(i,0) = d_tsq_ave(i,1) = d_tsq_ave(i,2) = 0.0; } +/* ---------------------------------------------------------------------- */ + template void MEAMKokkos::meam_dens_setup(int atom_nmax, int nall, int n_neigh) @@ -155,20 +159,21 @@ MEAMKokkos::meam_dens_setup(int atom_nmax, int nall, int n_neigh) // zero out local arrays - Kokkos::parallel_for(Kokkos::RangePolicy(0, nall),*this); + copymode = 1; + Kokkos::parallel_for(Kokkos::RangePolicy(0, nall),*this); + copymode = 0; } +/* ---------------------------------------------------------------------- */ + template void -MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread d_numneigh_half, typename AT::t_int_1d_randomread d_numneigh_full, - int *fnoffset, typename AT::t_int_1d_randomread d_ilist_half, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, typename AT::t_int_1d_randomread d_offset, int neighflag) +MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh_half, typename AT::t_int_1d d_numneigh_full, + typename AT::t_int_1d d_ilist_half, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, typename AT::t_int_1d d_offset, int neighflag, int need_dup) { - EV_FLOAT ev; - - ev.evdwl = 0; this->ntype = ntype; this->type = type; - this->fmap = fmap; + this->d_map = d_map; this->x = x; this->d_numneigh_half = d_numneigh_half; this->d_numneigh_full = d_numneigh_full; @@ -176,298 +181,325 @@ MEAMKokkos::meam_dens_init(int inum_half, int ntype, typename AT::t_ this->d_neighbors_half = d_neighbors_half; this->d_neighbors_full = d_neighbors_full; this->d_offset = d_offset; - if (neighflag == FULL) - Kokkos::parallel_reduce(Kokkos::RangePolicy >(0,inum_half),*this, ev); - else if (neighflag == HALF) - Kokkos::parallel_reduce(Kokkos::RangePolicy >(0,inum_half),*this, ev); + this->nlocal = nlocal; + + if (need_dup) { + dup_rho0 = Kokkos::Experimental::create_scatter_view(d_rho0); + dup_arho2b = Kokkos::Experimental::create_scatter_view(d_arho2b); + dup_arho1 = Kokkos::Experimental::create_scatter_view(d_arho1); + dup_arho2 = Kokkos::Experimental::create_scatter_view(d_arho2); + dup_arho3 = Kokkos::Experimental::create_scatter_view(d_arho3); + dup_arho3b = Kokkos::Experimental::create_scatter_view(d_arho3b); + dup_t_ave = Kokkos::Experimental::create_scatter_view(d_t_ave); + dup_tsq_ave = Kokkos::Experimental::create_scatter_view(d_tsq_ave); + } else { + ndup_rho0 = Kokkos::Experimental::create_scatter_view(d_rho0); + ndup_arho2b = Kokkos::Experimental::create_scatter_view(d_arho2b); + ndup_arho1 = Kokkos::Experimental::create_scatter_view(d_arho1); + ndup_arho2 = Kokkos::Experimental::create_scatter_view(d_arho2); + ndup_arho3 = Kokkos::Experimental::create_scatter_view(d_arho3); + ndup_arho3b = Kokkos::Experimental::create_scatter_view(d_arho3b); + ndup_t_ave = Kokkos::Experimental::create_scatter_view(d_t_ave); + ndup_tsq_ave = Kokkos::Experimental::create_scatter_view(d_tsq_ave); + } + + copymode = 1; + if (neighflag == HALF) + Kokkos::parallel_for(Kokkos::RangePolicy >(0,inum_half),*this); else if (neighflag == HALFTHREAD) - Kokkos::parallel_reduce(Kokkos::RangePolicy >(0,inum_half),*this, ev); - *fnoffset = (int)ev.evdwl; + Kokkos::parallel_for(Kokkos::RangePolicy >(0,inum_half),*this); + copymode = 0; + + if (need_dup) { + Kokkos::Experimental::contribute(d_rho0, dup_rho0); + Kokkos::Experimental::contribute(d_arho2b, dup_arho2b); + Kokkos::Experimental::contribute(d_arho1, dup_arho1); + Kokkos::Experimental::contribute(d_arho2, dup_arho2); + Kokkos::Experimental::contribute(d_arho3, dup_arho3); + Kokkos::Experimental::contribute(d_arho3b, dup_arho3b); + Kokkos::Experimental::contribute(d_t_ave, dup_t_ave); + Kokkos::Experimental::contribute(d_tsq_ave, dup_tsq_ave); + + // free duplicated memory + dup_rho0 = decltype(dup_rho0)(); + dup_arho2b = decltype(dup_arho2b)(); + dup_arho1 = decltype(dup_arho1)(); + dup_arho2 = decltype(dup_arho2)(); + dup_arho3 = decltype(dup_arho3)(); + dup_arho3b = decltype(dup_arho3b)(); + dup_t_ave = decltype(dup_t_ave)(); + dup_tsq_ave = decltype(dup_tsq_ave)(); + } } -// ccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc +/* ---------------------------------------------------------------------- */ + template template KOKKOS_INLINE_FUNCTION void -MEAMKokkos::getscreen(int i, int offset, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread numneigh_half, - typename AT::t_int_1d_randomread numneigh_full, int /*ntype*/, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap) +MEAMKokkos::getscreen(int i, int offset, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh_half, + typename AT::t_int_1d d_numneigh_full, int /*ntype*/, typename AT::t_int_1d type, typename AT::t_int_1d d_map) const { - int jn, j, kn, k; - int elti, eltj, eltk; - X_FLOAT xitmp, yitmp, zitmp, delxij, delyij, delzij; - F_FLOAT rij2, rij; - X_FLOAT xjtmp, yjtmp, zjtmp, delxik, delyik, delzik; - F_FLOAT rik2 /*,rik*/; - X_FLOAT xktmp, yktmp, zktmp, delxjk, delyjk, delzjk; - F_FLOAT rjk2 /*,rjk*/; - X_FLOAT xik, xjk; - F_FLOAT sij, fcij, sfcij, dfcij, sikj, dfikj, cikj; - F_FLOAT Cmin, Cmax, delc, /*ebound,*/ a, coef1, coef2; - F_FLOAT dCikj; - F_FLOAT rnorm, fc, dfc, drinv; - - drinv = 1.0 / this->delr_meam; - elti = fmap[type[i]]; + const double drinv = 1.0 / delr_meam; + const int elti = d_map[type[i]]; if (elti < 0) return; - xitmp = x(i,0); - yitmp = x(i,1); - zitmp = x(i,2); + const double xitmp = x(i,0); + const double yitmp = x(i,1); + const double zitmp = x(i,2); - for (jn = 0; jn < numneigh_half[i]; jn++) { - //j = firstneigh[jn]; - j = d_neighbors_half(i,jn); + for (int jn = 0; jn < d_numneigh_half[i]; jn++) { + const int j = d_neighbors_half(i,jn); - eltj = fmap[type[j]]; + const int eltj = d_map[type[j]]; if (eltj < 0) continue; - // First compute screening function itself, sij - xjtmp = x(j,0); - yjtmp = x(j,1); - zjtmp = x(j,2); - delxij = xjtmp - xitmp; - delyij = yjtmp - yitmp; - delzij = zjtmp - zitmp; + // First compute screening function itself, sij + const double xjtmp = x(j,0); + const double yjtmp = x(j,1); + const double zjtmp = x(j,2); + const double delxij = xjtmp - xitmp; + const double delyij = yjtmp - yitmp; + const double delzij = zjtmp - zitmp; - rij2 = delxij * delxij + delyij * delyij + delzij * delzij; - rij = sqrt(rij2); + const double rij2 = delxij * delxij + delyij * delyij + delzij * delzij; - double rbound = this->ebound_meam[elti][eltj] * rij2; - if (rij > this->rc_meam) { - fcij = 0.0; - dfcij = 0.0; - sij = 0.0; - } else { - rnorm = (this->rc_meam - rij) * drinv; - sij = 1.0; - - // if rjk2 > ebound*rijsq, atom k is definitely outside the ellipse - for (kn = 0; kn < numneigh_full[i]; kn++) { - //k = firstneigh_full[kn]; - k = d_neighbors_full(i,kn); - eltk = fmap[type[k]]; - if (eltk < 0) continue; - if (k == j) continue; - - delxjk = x(k,0) - xjtmp; - delyjk = x(k,1) - yjtmp; - delzjk = x(k,2) - zjtmp; - rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk; - if (rjk2 > rbound) continue; - - delxik = x(k,0) - xitmp; - delyik = x(k,1) - yitmp; - delzik = x(k,2) - zitmp; - rik2 = delxik * delxik + delyik * delyik + delzik * delzik; - if (rik2 > rbound) continue; - - xik = rik2 / rij2; - xjk = rjk2 / rij2; - a = 1 - (xik - xjk) * (xik - xjk); - // if a < 0, then ellipse equation doesn't describe this case and - // atom k can't possibly screen i-j - if (a <= 0.0) continue; - - cikj = (2.0 * (xik + xjk) + a - 2.0) / a; - Cmax = this->Cmax_meam[elti][eltj][eltk]; - Cmin = this->Cmin_meam[elti][eltj][eltk]; - if (cikj >= Cmax) continue; - // note that cikj may be slightly negative (within numerical - // tolerance) if atoms are colinear, so don't reject that case here - // (other negative cikj cases were handled by the test on "a" above) - else if (cikj <= Cmin) { - sij = 0.0; - break; - } else { - delc = Cmax - Cmin; - cikj = (cikj - Cmin) / delc; - sikj = fcut(cikj); - } - sij *= sikj; - } - - fc = dfcut(rnorm, dfc); - fcij = fc; - dfcij = dfc * drinv; - } - // Now compute derivatives - d_dscrfcn[offset+jn] = 0.0; - sfcij = sij * fcij; - if (iszero_kk(sfcij) || iszero_kk(sfcij - 1.0)) - { - d_scrfcn[offset+jn] = sij; - d_fcpair[offset+jn] = fcij; + if (rij2 > cutforcesq) { + d_dscrfcn[offset+jn] = 0.0; + d_scrfcn[offset+jn] = 0.0; + d_fcpair[offset+jn] = 0.0; continue; - //goto LABEL_100; } - for (kn = 0; kn < numneigh_full[i]; kn++) { - //k = firstneigh_full[kn]; - k = d_neighbors_full(i,kn); + // Now compute derivatives + const double rbound = ebound_meam[elti][eltj] * rij2; + const double rij = sqrt(rij2); + const double rnorm = (cutforce - rij) * drinv; + double sij = 1.0; + + // if rjk2 > ebound*rijsq, atom k is definitely outside the ellipse + for (int kn = 0; kn < d_numneigh_full[i]; kn++) { + int k = d_neighbors_full(i,kn); if (k == j) continue; - eltk = fmap[type[k]]; + int eltk = d_map[type[k]]; if (eltk < 0) continue; - xktmp = x(k,0); - yktmp = x(k,1); - zktmp = x(k,2); - delxjk = xktmp - xjtmp; - delyjk = yktmp - yjtmp; - delzjk = zktmp - zjtmp; - rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk; + const double xktmp = x(k,0); + const double yktmp = x(k,1); + const double zktmp = x(k,2); + + const double delxjk = xktmp - xjtmp; + const double delyjk = yktmp - yjtmp; + const double delzjk = zktmp - zjtmp; + const double rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk; if (rjk2 > rbound) continue; - delxik = xktmp - xitmp; - delyik = yktmp - yitmp; - delzik = zktmp - zitmp; - rik2 = delxik * delxik + delyik * delyik + delzik * delzik; + const double delxik = xktmp - xitmp; + const double delyik = yktmp - yitmp; + const double delzik = zktmp - zitmp; + const double rik2 = delxik * delxik + delyik * delyik + delzik * delzik; if (rik2 > rbound) continue; - xik = rik2 / rij2; - xjk = rjk2 / rij2; - a = 1 - (xik - xjk) * (xik - xjk); - // if a < 0, then ellipse equation doesn't describe this case and - // atom k can't possibly screen i-j + const double xik = rik2 / rij2; + const double xjk = rjk2 / rij2; + const double a = 1 - (xik - xjk) * (xik - xjk); + // if a < 0, then ellipse equation doesn't describe this case and + // atom k can't possibly screen i-j if (a <= 0.0) continue; - cikj = (2.0 * (xik + xjk) + a - 2.0) / a; - Cmax = this->Cmax_meam[elti][eltj][eltk]; - Cmin = this->Cmin_meam[elti][eltj][eltk]; - if (cikj >= Cmax) { - continue; - // Note that cikj may be slightly negative (within numerical - // tolerance) if atoms are colinear, so don't reject that case - // here - // (other negative cikj cases were handled by the test on "a" - // above) - // Note that we never have 0= Cmax) continue; + // note that cikj may be slightly negative (within numerical + // tolerance) if atoms are colinear, so don't reject that case here + // (other negative cikj cases were handled by the test on "a" above) + else if (cikj <= Cmin) { + sij = 0.0; + break; } else { - delc = Cmax - Cmin; + const double delc = Cmax - Cmin; cikj = (cikj - Cmin) / delc; - sikj = dfcut(cikj, dfikj); - coef1 = dfikj / (delc * sikj); - dCikj = dCfunc(rij2, rik2, rjk2); - d_dscrfcn[offset+jn] = d_dscrfcn[offset+jn] + coef1 * dCikj; + sikj = fcut(cikj); } + sij *= sikj; + } + + double dfc; + const double fc = dfcut(rnorm, dfc); + const double fcij = fc; + const double dfcij = dfc * drinv; + + // Now compute derivatives + d_dscrfcn[offset+jn] = 0.0; + const double sfcij = sij * fcij; + if (!iszero_kk(sfcij) && !isone_kk(sfcij)) { + for (int kn = 0; kn < d_numneigh_full[i]; kn++) { + const int k = d_neighbors_full(i,kn); + if (k == j) continue; + const int eltk = d_map[type[k]]; + if (eltk < 0) continue; + + const double delxjk = x(k,0) - xjtmp; + const double delyjk = x(k,1) - yjtmp; + const double delzjk = x(k,2) - zjtmp; + const double rjk2 = delxjk * delxjk + delyjk * delyjk + delzjk * delzjk; + if (rjk2 > rbound) continue; + + const double delxik = x(k,0) - xitmp; + const double delyik = x(k,1) - yitmp; + const double delzik = x(k,2) - zitmp; + const double rik2 = delxik * delxik + delyik * delyik + delzik * delzik; + if (rik2 > rbound) continue; + + const double xik = rik2 / rij2; + const double xjk = rjk2 / rij2; + const double a = 1 - (xik - xjk) * (xik - xjk); + // if a < 0, then ellipse equation doesn't describe this case and + // atom k can't possibly screen i-j + if (a <= 0.0) continue; + + double cikj = (2.0 * (xik + xjk) + a - 2.0) / a; + const double Cmax = Cmax_meam[elti][eltj][eltk]; + const double Cmin = Cmin_meam[elti][eltj][eltk]; + if (cikj >= Cmax) { + continue; + // Note that cikj may be slightly negative (within numerical + // tolerance) if atoms are colinear, so don't reject that case + // here + // (other negative cikj cases were handled by the test on "a" + // above) + // Note that we never have 0 template KOKKOS_INLINE_FUNCTION void -MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread numneigh, +MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d d_numneigh, int offset) const { - int jn, j, m, n, p, elti, eltj; - int nv2, nv3; - X_FLOAT xtmp, ytmp, ztmp, delij[3]; - F_FLOAT rij2, rij, sij; - F_FLOAT ai, aj, rhoa0j, rhoa1j, rhoa2j, rhoa3j, A1j, A2j, A3j; - // double G,Gbar,gam,shp[3+1]; - F_FLOAT ro0i, ro0j; - F_FLOAT rhoa0i, rhoa1i, rhoa2i, rhoa3i, A1i, A2i, A3i; + // The rho0, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial - // Likely to do: replace with duplicated view for OpenMP, atomic view for GPU abstraction - Kokkos::View::value> > a_rho0 = d_rho0; - Kokkos::View::value> > a_arho2b = d_arho2b; - Kokkos::View::value> > a_t_ave = d_t_ave; - Kokkos::View::value> > a_tsq_ave = d_tsq_ave; - Kokkos::View::value> > a_arho1 = d_arho1; - Kokkos::View::value> > a_arho2 = d_arho2; - Kokkos::View::value> > a_arho3 = d_arho3; - Kokkos::View::value> > a_arho3b = d_arho3b; + auto v_rho0 = ScatterViewHelper,decltype(dup_rho0),decltype(ndup_rho0)>::get(dup_rho0,ndup_rho0); + auto a_rho0 = v_rho0.template access>(); + auto v_arho2b = ScatterViewHelper,decltype(dup_arho2b),decltype(ndup_arho2b)>::get(dup_arho2b,ndup_arho2b); + auto a_arho2b = v_arho2b.template access>(); + auto v_arho1 = ScatterViewHelper,decltype(dup_arho1),decltype(ndup_arho1)>::get(dup_arho1,ndup_arho1); + auto a_arho1 = v_arho1.template access>(); + auto v_arho2 = ScatterViewHelper,decltype(dup_arho2),decltype(ndup_arho2)>::get(dup_arho2,ndup_arho2); + auto a_arho2 = v_arho2.template access>(); + auto v_arho3 = ScatterViewHelper,decltype(dup_arho3),decltype(ndup_arho3)>::get(dup_arho3,ndup_arho3); + auto a_arho3 = v_arho3.template access>(); + auto v_arho3b = ScatterViewHelper,decltype(dup_arho3b),decltype(ndup_arho3b)>::get(dup_arho3b,ndup_arho3b); + auto a_arho3b = v_arho3b.template access>(); + auto v_t_ave = ScatterViewHelper,decltype(dup_t_ave),decltype(ndup_t_ave)>::get(dup_t_ave,ndup_t_ave); + auto a_t_ave = v_t_ave.template access>(); + auto v_tsq_ave = ScatterViewHelper,decltype(dup_tsq_ave),decltype(ndup_tsq_ave)>::get(dup_tsq_ave,ndup_tsq_ave); + auto a_tsq_ave = v_tsq_ave.template access>(); - elti = fmap[type[i]]; - xtmp = x(i,0); - ytmp = x(i,1); - ztmp = x(i,2); - for (jn = 0; jn < numneigh[i]; jn++) { + const int elti = d_map[type[i]]; + const double xtmp = x(i,0); + const double ytmp = x(i,1); + const double ztmp = x(i,2); + for (int jn = 0; jn < d_numneigh[i]; jn++) { if (!iszero_kk(d_scrfcn[offset+jn])) { - //j = firstneigh[jn]; - j = d_neighbors_half(i,jn); - sij = d_scrfcn[offset+jn] * d_fcpair[offset+jn]; + const int j = d_neighbors_half(i,jn); + const double sij = d_scrfcn[offset+jn] * d_fcpair[offset+jn]; + double delij[3]; delij[0] = x(j,0) - xtmp; delij[1] = x(j,1) - ytmp; delij[2] = x(j,2) - ztmp; - rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2]; - if (rij2 < this->cutforcesq) { - eltj = fmap[type[j]]; - rij = sqrt(rij2); - ai = rij / this->re_meam[elti][elti] - 1.0; - aj = rij / this->re_meam[eltj][eltj] - 1.0; - ro0i = this->rho0_meam[elti]; - ro0j = this->rho0_meam[eltj]; - rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-this->beta0_meam[eltj] * aj) * sij; - rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-this->beta1_meam[eltj] * aj) * sij; - rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-this->beta2_meam[eltj] * aj) * sij; - rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-this->beta3_meam[eltj] * aj) * sij; - rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-this->beta0_meam[elti] * ai) * sij; - rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-this->beta1_meam[elti] * ai) * sij; - rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-this->beta2_meam[elti] * ai) * sij; - rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-this->beta3_meam[elti] * ai) * sij; - if (this->ialloy == 1) { - rhoa1j = rhoa1j * this->t1_meam[eltj]; - rhoa2j = rhoa2j * this->t2_meam[eltj]; - rhoa3j = rhoa3j * this->t3_meam[eltj]; - rhoa1i = rhoa1i * this->t1_meam[elti]; - rhoa2i = rhoa2i * this->t2_meam[elti]; - rhoa3i = rhoa3i * this->t3_meam[elti]; + const double rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2]; + if (rij2 < cutforcesq) { + const int eltj = d_map[type[j]]; + const double rij = sqrt(rij2); + const double ai = rij / re_meam[elti][elti] - 1.0; + const double aj = rij / re_meam[eltj][eltj] - 1.0; + const double ro0i = rho0_meam[elti]; + const double ro0j = rho0_meam[eltj]; + const double rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-beta0_meam[eltj] * aj) * sij; + double rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-beta1_meam[eltj] * aj) * sij; + double rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-beta2_meam[eltj] * aj) * sij; + double rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-beta3_meam[eltj] * aj) * sij; + const double rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-beta0_meam[elti] * ai) * sij; + double rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai) * sij; + double rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai) * sij; + double rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai) * sij; + if (ialloy == 1) { + rhoa1j *= t1_meam[eltj]; + rhoa2j *= t2_meam[eltj]; + rhoa3j *= t3_meam[eltj]; + rhoa1i *= t1_meam[elti]; + rhoa2i *= t2_meam[elti]; + rhoa3i *= t3_meam[elti]; } a_rho0[i] += rhoa0j; a_rho0[j] += rhoa0i; // For ialloy = 2, use single-element value (not average) - if (this->ialloy != 2) { - a_t_ave(i,0) = a_t_ave(i,0) + this->t1_meam[eltj] * rhoa0j; - a_t_ave(i,1) = a_t_ave(i,1) + this->t2_meam[eltj] * rhoa0j; - a_t_ave(i,2) = a_t_ave(i,2) + this->t3_meam[eltj] * rhoa0j; - a_t_ave(j,0) = a_t_ave(j,0) + this->t1_meam[elti] * rhoa0i; - a_t_ave(j,1) = a_t_ave(j,1) + this->t2_meam[elti] * rhoa0i; - a_t_ave(j,2) = a_t_ave(j,2) + this->t3_meam[elti] * rhoa0i; + if (ialloy != 2) { + a_t_ave(i,0) += t1_meam[eltj] * rhoa0j; + a_t_ave(i,1) += t2_meam[eltj] * rhoa0j; + a_t_ave(i,2) += t3_meam[eltj] * rhoa0j; + a_t_ave(j,0) += t1_meam[elti] * rhoa0i; + a_t_ave(j,1) += t2_meam[elti] * rhoa0i; + a_t_ave(j,2) += t3_meam[elti] * rhoa0i; } - if (this->ialloy == 1) { - a_tsq_ave(i,0) = a_tsq_ave(i,0) + this->t1_meam[eltj] * this->t1_meam[eltj] * rhoa0j; - a_tsq_ave(i,1) = a_tsq_ave(i,1) + this->t2_meam[eltj] * this->t2_meam[eltj] * rhoa0j; - a_tsq_ave(i,2) = a_tsq_ave(i,2) + this->t3_meam[eltj] * this->t3_meam[eltj] * rhoa0j; - a_tsq_ave(j,0) = a_tsq_ave(j,0) + this->t1_meam[elti] * this->t1_meam[elti] * rhoa0i; - a_tsq_ave(j,1) = a_tsq_ave(j,1) + this->t2_meam[elti] * this->t2_meam[elti] * rhoa0i; - a_tsq_ave(j,2) = a_tsq_ave(j,2) + this->t3_meam[elti] * this->t3_meam[elti] * rhoa0i; + if (ialloy == 1) { + a_tsq_ave(i,0) += t1_meam[eltj] * t1_meam[eltj] * rhoa0j; + a_tsq_ave(i,1) += t2_meam[eltj] * t2_meam[eltj] * rhoa0j; + a_tsq_ave(i,2) += t3_meam[eltj] * t3_meam[eltj] * rhoa0j; + a_tsq_ave(j,0) += t1_meam[elti] * t1_meam[elti] * rhoa0i; + a_tsq_ave(j,1) += t2_meam[elti] * t2_meam[elti] * rhoa0i; + a_tsq_ave(j,2) += t3_meam[elti] * t3_meam[elti] * rhoa0i; } a_arho2b[i] += rhoa2j; a_arho2b[j] += rhoa2i; - A1j = rhoa1j / rij; - A2j = rhoa2j / rij2; - A3j = rhoa3j / (rij2 * rij); - A1i = rhoa1i / rij; - A2i = rhoa2i / rij2; - A3i = rhoa3i / (rij2 * rij); - nv2 = 0; - nv3 = 0; - for (m = 0; m < 3; m++) { + const double A1j = rhoa1j / rij; + const double A2j = rhoa2j / rij2; + const double A3j = rhoa3j / (rij2 * rij); + const double A1i = rhoa1i / rij; + const double A2i = rhoa2i / rij2; + const double A3i = rhoa3i / (rij2 * rij); + int nv2 = 0; + int nv3 = 0; + for (int m = 0; m < 3; m++) { a_arho1(i,m) += A1j * delij[m]; - a_arho1(j,m) += (-A1i * delij[m]); + a_arho1(j,m) += -A1i * delij[m]; a_arho3b(i,m) += rhoa3j * delij[m] / rij; - a_arho3b(j,m) += (- rhoa3i * delij[m] / rij); - for (n = m; n < 3; n++) { + a_arho3b(j,m) += -rhoa3i * delij[m] / rij; + for (int n = m; n < 3; n++) { a_arho2(i,nv2) += A2j * delij[m] * delij[n]; a_arho2(j,nv2) += A2i * delij[m] * delij[n]; - nv2 = nv2 + 1; - for (p = n; p < 3; p++) { + nv2++; + for (int p = n; p < 3; p++) { a_arho3(i,nv3) += A3j * delij[m] * delij[n] * delij[p]; - a_arho3(j,nv3) += (- A3i * delij[m] * delij[n] * delij[p]); - nv3 = nv3 + 1; + a_arho3(j,nv3) += -A3i * delij[m] * delij[n] * delij[p]; + nv3++; } } } @@ -476,79 +508,81 @@ MEAMKokkos::calc_rho1(int i, int /*ntype*/, typename AT::t_int_1d_ra } } +/* ---------------------------------------------------------------------- */ + //Cutoff function and derivative template KOKKOS_INLINE_FUNCTION -double MEAMKokkos::dfcut(const double xi, double& dfc) const { - double a, a3, a4, a1m4; - if (xi >= 1.0) { - dfc = 0.0; - return 1.0; - } else if (xi <= 0.0) { - dfc = 0.0; - return 0.0; - } else { - a = 1.0 - xi; - a3 = a * a * a; - a4 = a * a3; - a1m4 = 1.0-a4; +double MEAMKokkos::dfcut(const double xi, double& dfc) const +{ + if (xi >= 1.0) { + dfc = 0.0; + return 1.0; + } else if (xi <= 0.0) { + dfc = 0.0; + return 0.0; + } else { + const double a = 1.0 - xi; + const double a3 = a * a * a; + const double a4 = a * a3; + const double a1m4 = 1.0 - a4; - dfc = 8 * a1m4 * a3; - return a1m4*a1m4; - } + dfc = 8 * a1m4 * a3; + return a1m4*a1m4; } +} //----------------------------------------------------------------------------- - // // Derivative of Cikj w.r.t. rij - // // Inputs: rij,rij2,rik2,rjk2 - // // - // + // Derivative of Cikj w.r.t. rij + // Inputs: rij,rij2,rik2,rjk2 template KOKKOS_INLINE_FUNCTION double MEAMKokkos::dCfunc(const double rij2, const double rik2, const double rjk2) const { - double rij4, a, asq, b,denom; - - rij4 = rij2 * rij2; - a = rik2 - rjk2; - b = rik2 + rjk2; - asq = a*a; - denom = rij4 - asq; - denom = denom * denom; - return -4 * (-2 * rij2 * asq + rij4 * b + asq * b) / denom; + const double rij4 = rij2 * rij2; + const double a = rik2 - rjk2; + const double b = rik2 + rjk2; + const double asq = a*a; + double denom = rij4 - asq; + denom = denom * denom; + return -4 * (-2 * rij2 * asq + rij4 * b + asq * b) / denom; } +/* ---------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void MEAMKokkos::dCfunc2(const double rij2, const double rik2, const double rjk2, double& dCikj1, double& dCikj2) const { - double rij4, rik4, rjk4, a, denom; - - rij4 = rij2 * rij2; - rik4 = rik2 * rik2; - rjk4 = rjk2 * rjk2; - a = rik2 - rjk2; - denom = rij4 - a * a; - denom = denom * denom; - dCikj1 = 4 * rij2 * (rij4 + rik4 + 2 * rik2 * rjk2 - 3 * rjk4 - 2 * rij2 * a) / denom; - dCikj2 = 4 * rij2 * (rij4 - 3 * rik4 + 2 * rik2 * rjk2 + rjk4 + 2 * rij2 * a) / denom; - + const double rij4 = rij2 * rij2; + const double rik4 = rik2 * rik2; + const double rjk4 = rjk2 * rjk2; + const double a = rik2 - rjk2; + double denom = rij4 - a * a; + denom = denom * denom; + dCikj1 = 4 * rij2 * (rij4 + rik4 + 2 * rik2 * rjk2 - 3 * rjk4 - 2 * rij2 * a) / denom; + dCikj2 = 4 * rij2 * (rij4 - 3 * rik4 + 2 * rik2 * rjk2 + rjk4 + 2 * rij2 * a) / denom; } + +/* ---------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION -double MEAMKokkos::fcut(const double xi) const{ - double a; - if (xi >= 1.0) - return 1.0; - else if (xi <= 0.0) - return 0.0; - else { - a = 1.0 - xi; - a *= a; a *= a; - a = 1.0 - a; - return a * a; - } +double MEAMKokkos::fcut(const double xi) const +{ + double a; + if (xi >= 1.0) + return 1.0; + else if (xi <= 0.0) + return 0.0; + else { + // ( 1.d0 - (1.d0 - xi)**4 )**2, but with better codegen + a = 1.0 - xi; + a *= a; a *= a; + a = 1.0 - a; + return a * a; } +} diff --git a/src/KOKKOS/meam_force_kokkos.h b/src/KOKKOS/meam_force_kokkos.h index 4e061c9043..aafcf80473 100644 --- a/src/KOKKOS/meam_force_kokkos.h +++ b/src/KOKKOS/meam_force_kokkos.h @@ -5,106 +5,129 @@ using namespace LAMMPS_NS; using namespace MathSpecialKokkos; - template void -MEAMKokkos::meam_force(int inum_half, int eflag_either, int eflag_global, int eflag_atom, int vflag_atom, double* eng_vdwl, - typename ArrayTypes::t_efloat_1d eatom, int ntype, typename AT::t_int_1d_randomread type, typename AT::t_int_1d_randomread fmap, typename AT::t_x_array_randomread x, typename AT::t_int_1d_randomread numneigh, - typename AT::t_int_1d_randomread numneigh_full, typename AT::t_f_array f, typename ArrayTypes::t_virial_array vatom, typename AT::t_int_1d_randomread d_ilist_half, typename AT::t_int_1d_randomread d_offset, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, int neighflag) +MEAMKokkos::meam_force(int inum_half, int eflag_global, int eflag_atom, int vflag_global, int vflag_atom, + typename ArrayTypes::t_efloat_1d eatom, int ntype, typename AT::t_int_1d type, typename AT::t_int_1d d_map, typename AT::t_x_array x, typename AT::t_int_1d numneigh, + typename AT::t_int_1d numneigh_full, typename AT::t_f_array f, typename ArrayTypes::t_virial_array vatom, typename AT::t_int_1d d_ilist_half, typename AT::t_int_1d d_offset, typename AT::t_neighbors_2d d_neighbors_half, typename AT::t_neighbors_2d d_neighbors_full, int neighflag, int need_dup, EV_FLOAT &ev_all) { - EV_FLOAT ev; - ev.evdwl = *eng_vdwl; + EV_FLOAT ev; - this->eflag_either = eflag_either; - this->eflag_global = eflag_global; - this->eflag_atom = eflag_atom; - this->vflag_atom = vflag_atom; - this->d_eatom = eatom; - this->ntype = ntype; - this->type = type; - this->fmap = fmap; - this->x = x; - this->d_numneigh_half = numneigh; - this->d_numneigh_full = numneigh_full; - this->d_neighbors_half = d_neighbors_half; - this->d_neighbors_full = d_neighbors_full; - this->f = f; - this->d_vatom = vatom; - this->d_ilist_half = d_ilist_half; - this->d_offset = d_offset; - - if (neighflag == FULL) - Kokkos::parallel_reduce(Kokkos::RangePolicy>(0,inum_half),*this,ev); - else if (neighflag == HALF) - Kokkos::parallel_reduce(Kokkos::RangePolicy>(0,inum_half),*this,ev); - else if (neighflag == HALFTHREAD) - Kokkos::parallel_reduce(Kokkos::RangePolicy>(0,inum_half),*this,ev); - *eng_vdwl = ev.evdwl; + this->eflag_either = eflag_either; + this->eflag_global = eflag_global; + this->eflag_atom = eflag_atom; + this->vflag_global = vflag_global; + this->vflag_atom = vflag_atom; + eflag_either = eflag_atom || eflag_global; + vflag_either = vflag_atom || vflag_global; + this->d_eatom = eatom; + this->ntype = ntype; + this->type = type; + this->d_map = d_map; + this->x = x; + this->d_numneigh_half = numneigh; + this->d_numneigh_full = numneigh_full; + this->d_neighbors_half = d_neighbors_half; + this->d_neighbors_full = d_neighbors_full; + this->f = f; + this->d_vatom = vatom; + this->d_ilist_half = d_ilist_half; + this->d_offset = d_offset; + + if (need_dup) { + dup_f = Kokkos::Experimental::create_scatter_view(f); + if (eflag_atom) dup_eatom = Kokkos::Experimental::create_scatter_view(d_eatom); + if (vflag_atom) dup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } else { + ndup_f = Kokkos::Experimental::create_scatter_view(f); + if (eflag_atom) ndup_eatom = Kokkos::Experimental::create_scatter_view(d_eatom); + if (vflag_atom) ndup_vatom = Kokkos::Experimental::create_scatter_view(d_vatom); + } + + copymode = 1; + if (neighflag == HALF) + Kokkos::parallel_reduce(Kokkos::RangePolicy>(0,inum_half),*this,ev); + else if (neighflag == HALFTHREAD) + Kokkos::parallel_reduce(Kokkos::RangePolicy>(0,inum_half),*this,ev); + ev_all += ev; + copymode = 0; + + if (need_dup) { + Kokkos::Experimental::contribute(f, dup_f); + if (eflag_atom) Kokkos::Experimental::contribute(d_eatom, dup_eatom); + if (vflag_atom) Kokkos::Experimental::contribute(d_vatom, dup_vatom); + + // free duplicated memory + dup_f = decltype(dup_f)(); + if (eflag_atom) dup_eatom = decltype(dup_eatom)(); + if (vflag_atom) dup_vatom = decltype(dup_vatom)(); + } } template template KOKKOS_INLINE_FUNCTION void -MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FLOAT& ev) const { +MEAMKokkos::operator()(TagMEAMForce, const int &ii, EV_FLOAT& ev) const { int i, j, jn, k, kn, kk, m, n, p, q; int nv2, nv3, elti, eltj, eltk, ind; X_FLOAT xitmp, yitmp, zitmp, delij[3]; - F_FLOAT rij2, rij, rij3; - F_FLOAT v[6], fi[3], fj[3]; - F_FLOAT third, sixth; - F_FLOAT pp, dUdrij, dUdsij, dUdrijm[3], force, forcem; - F_FLOAT r, recip, phi, phip; - F_FLOAT sij; - F_FLOAT a1, a1i, a1j, a2, a2i, a2j; - F_FLOAT a3i, a3j; - F_FLOAT shpi[3], shpj[3]; - F_FLOAT ai, aj, ro0i, ro0j, invrei, invrej; - F_FLOAT rhoa0j, drhoa0j, rhoa0i, drhoa0i; - F_FLOAT rhoa1j, drhoa1j, rhoa1i, drhoa1i; - F_FLOAT rhoa2j, drhoa2j, rhoa2i, drhoa2i; - F_FLOAT a3, a3a, rhoa3j, drhoa3j, rhoa3i, drhoa3i; - F_FLOAT drho0dr1, drho0dr2, drho0ds1, drho0ds2; - F_FLOAT drho1dr1, drho1dr2, drho1ds1, drho1ds2; - F_FLOAT drho1drm1[3], drho1drm2[3]; - F_FLOAT drho2dr1, drho2dr2, drho2ds1, drho2ds2; - F_FLOAT drho2drm1[3], drho2drm2[3]; - F_FLOAT drho3dr1, drho3dr2, drho3ds1, drho3ds2; - F_FLOAT drho3drm1[3], drho3drm2[3]; - F_FLOAT dt1dr1, dt1dr2, dt1ds1, dt1ds2; - F_FLOAT dt2dr1, dt2dr2, dt2ds1, dt2ds2; - F_FLOAT dt3dr1, dt3dr2, dt3ds1, dt3ds2; - F_FLOAT drhodr1, drhodr2, drhods1, drhods2, drhodrm1[3], drhodrm2[3]; - F_FLOAT arg; - F_FLOAT arg1i1, arg1j1, arg1i2, arg1j2, arg1i3, arg1j3, arg3i3, arg3j3; - F_FLOAT dsij1, dsij2, force1, force2; - F_FLOAT t1i, t2i, t3i, t1j, t2j, t3j; + double rij2, rij, rij3; + double v[6], fi[3], fj[3]; + double third, sixth; + double pp, dUdrij, dUdsij, dUdrijm[3], force, forcem; + double recip, phi, phip; + double sij; + double a1, a1i, a1j, a2, a2i, a2j; + double a3i, a3j; + double shpi[3], shpj[3]; + double ai, aj, ro0i, ro0j, invrei, invrej; + double rhoa0j, drhoa0j, rhoa0i, drhoa0i; + double rhoa1j, drhoa1j, rhoa1i, drhoa1i; + double rhoa2j, drhoa2j, rhoa2i, drhoa2i; + double a3, a3a, rhoa3j, drhoa3j, rhoa3i, drhoa3i; + double drho0dr1, drho0dr2, drho0ds1, drho0ds2; + double drho1dr1, drho1dr2, drho1ds1, drho1ds2; + double drho1drm1[3], drho1drm2[3]; + double drho2dr1, drho2dr2, drho2ds1, drho2ds2; + double drho2drm1[3], drho2drm2[3]; + double drho3dr1, drho3dr2, drho3ds1, drho3ds2; + double drho3drm1[3], drho3drm2[3]; + double dt1dr1, dt1dr2, dt1ds1, dt1ds2; + double dt2dr1, dt2dr2, dt2ds1, dt2ds2; + double dt3dr1, dt3dr2, dt3ds1, dt3ds2; + double drhodr1, drhodr2, drhods1, drhods2, drhodrm1[3], drhodrm2[3]; + double arg; + double arg1i1, arg1j1, arg1i2, arg1j2, arg1i3, arg1j3, arg3i3, arg3j3; + double dsij1, dsij2, force1, force2; + double t1i, t2i, t3i, t1j, t2j, t3j; int fnoffset; - // To do: update with duplication for OpenMP, atomic for GPUs - Kokkos::View::value> > a_eatom = d_eatom; - Kokkos::View::value> > a_vatom = d_vatom; - Kokkos::View::value> > a_f = f; + // The f, etc. arrays are duplicated for OpenMP, atomic for CUDA, and neither for Serial + auto v_f = ScatterViewHelper,decltype(dup_f),decltype(ndup_f)>::get(dup_f,ndup_f); + auto a_f = v_f.template access>(); + auto v_eatom = ScatterViewHelper,decltype(dup_eatom),decltype(ndup_eatom)>::get(dup_eatom,ndup_eatom); + auto a_eatom = v_eatom.template access>(); + auto v_vatom = ScatterViewHelper,decltype(dup_vatom),decltype(ndup_vatom)>::get(dup_vatom,ndup_vatom); + auto a_vatom = v_vatom.template access>(); i = d_ilist_half[ii]; fnoffset = d_offset[i]; third = 1.0 / 3.0; sixth = 1.0 / 6.0; - // Compute forces atom i - elti = fmap[type[i]]; + elti = d_map[type[i]]; if (elti < 0) return; xitmp = x(i,0); yitmp = x(i,1); zitmp = x(i,2); - // Treat each pair + // Treat each pair for (jn = 0; jn < d_numneigh_half[i]; jn++) { - //j = firstneigh[jn]; j = d_neighbors_half(i,jn); - eltj = fmap[type[j]]; + eltj = d_map[type[j]]; if (!iszero_kk(d_scrfcn[fnoffset + jn]) && eltj >= 0) { @@ -113,60 +136,60 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL delij[1] = x(j,1) - yitmp; delij[2] = x(j,2) - zitmp; rij2 = delij[0] * delij[0] + delij[1] * delij[1] + delij[2] * delij[2]; - if (rij2 < this->cutforcesq) { + if (rij2 < cutforcesq) { rij = sqrt(rij2); - r = rij; + recip = 1.0 / rij; - // Compute phi and phip - ind = this->eltind[elti][eltj]; - pp = rij * this->rdrar; + // Compute phi and phip + ind = eltind[elti][eltj]; + pp = rij * rdrar; kk = (int)pp; - kk = (kk <= (this->nrar - 2))?kk:this->nrar - 2; + kk = (kk <= (nrar - 2))?kk:nrar - 2; pp = pp - kk; pp = (pp <= 1.0)?pp:1.0; - phi = ((this->d_phirar3(ind,kk) * pp + this->d_phirar2(ind,kk)) * pp + this->d_phirar1(ind,kk)) * pp + - this->d_phirar(ind,kk); - phip = (this->d_phirar6(ind,kk) * pp + this->d_phirar5(ind,kk)) * pp + this->d_phirar4(ind,kk); - recip = 1.0 / r; + phi = ((d_phirar3(ind,kk) * pp + d_phirar2(ind,kk)) * pp + d_phirar1(ind,kk)) * pp + + d_phirar(ind,kk); + phip = (d_phirar6(ind,kk) * pp + d_phirar5(ind,kk)) * pp + d_phirar4(ind,kk); - if (eflag_either != 0) { - if (eflag_global != 0) - //*eng_vdwl = *eng_vdwl + phi * sij; - ev.evdwl = ev.evdwl + phi * sij; - if (eflag_atom != 0) { - a_eatom[i] += (0.5 * phi * sij); - a_eatom[j] += (0.5 * phi * sij); + if (eflag_either) { + double scaleij = d_scale(type[i],type[i]); + double phi_sc = phi * scaleij; + if (eflag_global) + ev.evdwl += phi_sc * sij; + if (eflag_atom) { + a_eatom[i] += 0.5 * phi * sij; + a_eatom[j] += 0.5 * phi * sij; } } - // write(1,*) "force_meamf: phi: ",phi - // write(1,*) "force_meamf: phip: ",phip + // write(1,*) "force_meamf: phi: ",phi + // write(1,*) "force_meamf: phip: ",phip - // Compute pair densities and derivatives - invrei = 1.0 / this->re_meam[elti][elti]; + // Compute pair densities and derivatives + invrei = 1.0 / re_meam[elti][elti]; ai = rij * invrei - 1.0; - ro0i = this->rho0_meam[elti]; - rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-this->beta0_meam[elti] * ai); - drhoa0i = -this->beta0_meam[elti] * invrei * rhoa0i; - rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-this->beta1_meam[elti] * ai); - drhoa1i = -this->beta1_meam[elti] * invrei * rhoa1i; - rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-this->beta2_meam[elti] * ai); - drhoa2i = -this->beta2_meam[elti] * invrei * rhoa2i; - rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-this->beta3_meam[elti] * ai); - drhoa3i = -this->beta3_meam[elti] * invrei * rhoa3i; + ro0i = rho0_meam[elti]; + rhoa0i = ro0i * MathSpecialKokkos::fm_exp(-beta0_meam[elti] * ai); + drhoa0i = -beta0_meam[elti] * invrei * rhoa0i; + rhoa1i = ro0i * MathSpecialKokkos::fm_exp(-beta1_meam[elti] * ai); + drhoa1i = -beta1_meam[elti] * invrei * rhoa1i; + rhoa2i = ro0i * MathSpecialKokkos::fm_exp(-beta2_meam[elti] * ai); + drhoa2i = -beta2_meam[elti] * invrei * rhoa2i; + rhoa3i = ro0i * MathSpecialKokkos::fm_exp(-beta3_meam[elti] * ai); + drhoa3i = -beta3_meam[elti] * invrei * rhoa3i; if (elti != eltj) { - invrej = 1.0 / this->re_meam[eltj][eltj]; + invrej = 1.0 / re_meam[eltj][eltj]; aj = rij * invrej - 1.0; - ro0j = this->rho0_meam[eltj]; - rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-this->beta0_meam[eltj] * aj); - drhoa0j = -this->beta0_meam[eltj] * invrej * rhoa0j; - rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-this->beta1_meam[eltj] * aj); - drhoa1j = -this->beta1_meam[eltj] * invrej * rhoa1j; - rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-this->beta2_meam[eltj] * aj); - drhoa2j = -this->beta2_meam[eltj] * invrej * rhoa2j; - rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-this->beta3_meam[eltj] * aj); - drhoa3j = -this->beta3_meam[eltj] * invrej * rhoa3j; + ro0j = rho0_meam[eltj]; + rhoa0j = ro0j * MathSpecialKokkos::fm_exp(-beta0_meam[eltj] * aj); + drhoa0j = -beta0_meam[eltj] * invrej * rhoa0j; + rhoa1j = ro0j * MathSpecialKokkos::fm_exp(-beta1_meam[eltj] * aj); + drhoa1j = -beta1_meam[eltj] * invrej * rhoa1j; + rhoa2j = ro0j * MathSpecialKokkos::fm_exp(-beta2_meam[eltj] * aj); + drhoa2j = -beta2_meam[eltj] * invrej * rhoa2j; + rhoa3j = ro0j * MathSpecialKokkos::fm_exp(-beta3_meam[eltj] * aj); + drhoa3j = -beta3_meam[eltj] * invrej * rhoa3j; } else { rhoa0j = rhoa0i; drhoa0j = drhoa0i; @@ -178,14 +201,14 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL drhoa3j = drhoa3i; } - const double t1mi = this->t1_meam[elti]; - const double t2mi = this->t2_meam[elti]; - const double t3mi = this->t3_meam[elti]; - const double t1mj = this->t1_meam[eltj]; - const double t2mj = this->t2_meam[eltj]; - const double t3mj = this->t3_meam[eltj]; + const double t1mi = t1_meam[elti]; + const double t2mi = t2_meam[elti]; + const double t3mi = t3_meam[elti]; + const double t1mj = t1_meam[eltj]; + const double t2mj = t2_meam[eltj]; + const double t3mj = t3_meam[eltj]; - if (this->ialloy == 1) { + if (ialloy == 1) { rhoa1j *= t1mj; rhoa2j *= t2mj; rhoa3j *= t3mj; @@ -213,12 +236,12 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL for (n = 0; n < 3; n++) { for (p = n; p < 3; p++) { for (q = p; q < 3; q++) { - arg = delij[n] * delij[p] * delij[q] * this->v3D[nv3]; + arg = delij[n] * delij[p] * delij[q] * v3D[nv3]; arg1i3 = arg1i3 + d_arho3(i,nv3) * arg; arg1j3 = arg1j3 - d_arho3(j,nv3) * arg; nv3 = nv3 + 1; } - arg = delij[n] * delij[p] * this->v2D[nv2]; + arg = delij[n] * delij[p] * v2D[nv2]; arg1i2 = arg1i2 + d_arho2(i,nv2) * arg; arg1j2 = arg1j2 + d_arho2(j,nv2) * arg; nv2 = nv2 + 1; @@ -229,11 +252,11 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL arg3j3 = arg3j3 - d_arho3b(j,n) * delij[n]; } - // rho0 terms + // rho0 terms drho0dr1 = drhoa0j * sij; drho0dr2 = drhoa0i * sij; - // rho1 terms + // rho1 terms a1 = 2 * sij / rij; drho1dr1 = a1 * (drhoa1j - rhoa1j / rij) * arg1i1; drho1dr2 = a1 * (drhoa1i - rhoa1i / rij) * arg1j1; @@ -243,7 +266,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL drho1drm2[m] = -a1 * rhoa1i * d_arho1(j,m); } - // rho2 terms + // rho2 terms a2 = 2 * sij / rij2; drho2dr1 = a2 * (drhoa2j - 2 * rhoa2j / rij) * arg1i2 - 2.0 / 3.0 * d_arho2b[i] * drhoa2j * sij; drho2dr2 = a2 * (drhoa2i - 2 * rhoa2i / rij) * arg1j2 - 2.0 / 3.0 * d_arho2b[j] * drhoa2i * sij; @@ -252,14 +275,14 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL drho2drm1[m] = 0.0; drho2drm2[m] = 0.0; for (n = 0; n < 3; n++) { - drho2drm1[m] = drho2drm1[m] + d_arho2(i,this->vind2D[m][n]) * delij[n]; - drho2drm2[m] = drho2drm2[m] - d_arho2(j,this->vind2D[m][n]) * delij[n]; + drho2drm1[m] = drho2drm1[m] + d_arho2(i,vind2D[m][n]) * delij[n]; + drho2drm2[m] = drho2drm2[m] - d_arho2(j,vind2D[m][n]) * delij[n]; } drho2drm1[m] = a2 * rhoa2j * drho2drm1[m]; drho2drm2[m] = -a2 * rhoa2i * drho2drm2[m]; } - // rho3 terms + // rho3 terms rij3 = rij * rij2; a3 = 2 * sij / rij3; a3a = 6.0 / 5.0 * sij / rij; @@ -273,9 +296,9 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL nv2 = 0; for (n = 0; n < 3; n++) { for (p = n; p < 3; p++) { - arg = delij[n] * delij[p] * this->v2D[nv2]; - drho3drm1[m] = drho3drm1[m] + d_arho3(i,this->vind3D[m][n][p]) * arg; - drho3drm2[m] = drho3drm2[m] + d_arho3(j,this->vind3D[m][n][p]) * arg; + arg = delij[n] * delij[p] * v2D[nv2]; + drho3drm1[m] = drho3drm1[m] + d_arho3(i,vind3D[m][n][p]) * arg; + drho3drm2[m] = drho3drm2[m] + d_arho3(j,vind3D[m][n][p]) * arg; nv2 = nv2 + 1; } } @@ -283,7 +306,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL drho3drm2[m] = (-a3 * drho3drm2[m] + a3a * d_arho3b(j,m)) * rhoa3i; } - // Compute derivatives of weighting functions t wrt rij + // Compute derivatives of weighting functions t wrt rij t1i = d_t_ave(i,0); t2i = d_t_ave(i,1); t3i = d_t_ave(i,2); @@ -291,7 +314,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL t2j = d_t_ave(j,1); t3j = d_t_ave(j,2); - if (this->ialloy == 1) { + if (ialloy == 1) { a1i = fdiv_zero_kk(drhoa0j * sij, d_tsq_ave(i,0)); a1j = fdiv_zero_kk(drhoa0i * sij, d_tsq_ave(j,0)); @@ -307,7 +330,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL dt3dr1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj)); dt3dr2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi)); - } else if (this->ialloy == 2) { + } else if (ialloy == 2) { dt1dr1 = 0.0; dt1dr2 = 0.0; @@ -333,25 +356,27 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL dt3dr2 = aj * (t3mi - t3j); } - // Compute derivatives of total density wrt rij, sij and rij(3) - get_shpfcn(this->lattce_meam[elti][elti], shpi); - get_shpfcn(this->lattce_meam[eltj][eltj], shpj); - drhodr1 = dgamma1[i] * drho0dr1 + - dgamma2[i] * (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 + + // Compute derivatives of total density wrt rij, sij and rij(3) + get_shpfcn(lattce_meam[elti][elti], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpi); + get_shpfcn(lattce_meam[eltj][eltj], stheta_meam[elti][elti], ctheta_meam[elti][elti], shpj); + + drhodr1 = d_dgamma1[i] * drho0dr1 + + d_dgamma2[i] * (dt1dr1 * d_rho1[i] + t1i * drho1dr1 + dt2dr1 * d_rho2[i] + t2i * drho2dr1 + dt3dr1 * d_rho3[i] + t3i * drho3dr1) - - dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); - drhodr2 = dgamma1[j] * drho0dr2 + - dgamma2[j] * (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 + + d_dgamma3[i] * (shpi[0] * dt1dr1 + shpi[1] * dt2dr1 + shpi[2] * dt3dr1); + drhodr2 = d_dgamma1[j] * drho0dr2 + + d_dgamma2[j] * (dt1dr2 * d_rho1[j] + t1j * drho1dr2 + dt2dr2 * d_rho2[j] + t2j * drho2dr2 + dt3dr2 * d_rho3[j] + t3j * drho3dr2) - - dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); + d_dgamma3[j] * (shpj[0] * dt1dr2 + shpj[1] * dt2dr2 + shpj[2] * dt3dr2); for (m = 0; m < 3; m++) { drhodrm1[m] = 0.0; drhodrm2[m] = 0.0; - drhodrm1[m] = dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); - drhodrm2[m] = dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + drhodrm1[m] = d_dgamma2[i] * (t1i * drho1drm1[m] + t2i * drho2drm1[m] + t3i * drho3drm1[m]); + drhodrm2[m] = d_dgamma2[j] * (t1j * drho1drm2[m] + t2j * drho2drm2[m] + t3j * drho3drm2[m]); + } - // Compute derivatives wrt sij, but only if necessary + // Compute derivatives wrt sij, but only if necessary if (!iszero_kk(d_dscrfcn[fnoffset + jn])) { drho0ds1 = rhoa0j; drho0ds2 = rhoa0i; @@ -366,7 +391,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL drho3ds1 = a3 * rhoa3j * arg1i3 - a3a * rhoa3j * arg3i3; drho3ds2 = a3 * rhoa3i * arg1j3 - a3a * rhoa3i * arg3j3; - if (this->ialloy == 1) { + if (ialloy == 1) { a1i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i,0)); a1j = fdiv_zero_kk(rhoa0i, d_tsq_ave(j,0)); a2i = fdiv_zero_kk(rhoa0j, d_tsq_ave(i,1)); @@ -381,7 +406,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL dt3ds1 = a3i * (t3mj - t3i * MathSpecialKokkos::square(t3mj)); dt3ds2 = a3j * (t3mi - t3j * MathSpecialKokkos::square(t3mi)); - } else if (this->ialloy == 2) { + } else if (ialloy == 2) { dt1ds1 = 0.0; dt1ds2 = 0.0; @@ -417,7 +442,7 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL d_dgamma3[j] * (shpj[0] * dt1ds2 + shpj[1] * dt2ds2 + shpj[2] * dt3ds2); } - // Compute derivatives of energy wrt rij, sij and rij[3] + // Compute derivatives of energy wrt rij, sij and rij[3] dUdrij = phip * sij + d_frhop[i] * drhodr1 + d_frhop[j] * drhodr2; dUdsij = 0.0; if (!iszero_kk(d_dscrfcn[fnoffset + jn])) { @@ -427,17 +452,17 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL dUdrijm[m] = d_frhop[i] * drhodrm1[m] + d_frhop[j] * drhodrm2[m]; } - // Add the part of the force due to dUdrij and dUdsij + // Add the part of the force due to dUdrij and dUdsij force = dUdrij * recip + dUdsij * d_dscrfcn[fnoffset + jn]; for (m = 0; m < 3; m++) { forcem = delij[m] * force + dUdrijm[m]; a_f(i,m) += forcem; - a_f(j,m) -= -forcem; + a_f(j,m) -= forcem; } - // Tabulate per-atom virial as symmetrized stress tensor + // Tabulate per-atom virial as symmetrized stress tensor - if (vflag_atom != 0) { + if (vflag_either) { fi[0] = delij[0] * force + dUdrijm[0]; fi[1] = delij[1] * force + dUdrijm[1]; fi[2] = delij[2] * force + dUdrijm[2]; @@ -448,36 +473,41 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL v[4] = -0.25 * (delij[0] * fi[2] + delij[2] * fi[0]); v[5] = -0.25 * (delij[1] * fi[2] + delij[2] * fi[1]); - for (m = 0; m < 6; m++) { - a_vatom(i,m) += v[m]; - a_vatom(j,m) += v[m]; + if (vflag_global) + for (m = 0; m < 6; m++) + ev.v[m] += 2.0*v[m]; + + if (vflag_atom) { + for (m = 0; m < 6; m++) { + a_vatom(i,m) += v[m]; + a_vatom(j,m) += v[m]; + } } } - // Now compute forces on other atoms k due to change in sij + // Now compute forces on other atoms k due to change in sij - if (iszero_kk(sij) || iszero_kk(sij - 1.0)) continue; //: cont jn loop + if (iszero_kk(sij) || isone_kk(sij)) continue; //: cont jn loop double dxik(0), dyik(0), dzik(0); double dxjk(0), dyjk(0), dzjk(0); for (kn = 0; kn < d_numneigh_full[i]; kn++) { - //k = firstneigh_full[kn]; k = d_neighbors_full(i,kn); - eltk = fmap[type[k]]; + eltk = d_map[type[k]]; if (k != j && eltk >= 0) { double xik, xjk, cikj, sikj, dfc, a; double dCikj1, dCikj2; double delc, rik2, rjk2; sij = d_scrfcn[jn+fnoffset] * d_fcpair[jn+fnoffset]; - const double Cmax = this->Cmax_meam[elti][eltj][eltk]; - const double Cmin = this->Cmin_meam[elti][eltj][eltk]; + const double Cmax = Cmax_meam[elti][eltj][eltk]; + const double Cmin = Cmin_meam[elti][eltj][eltk]; dsij1 = 0.0; dsij2 = 0.0; - if (!iszero_kk(sij) && !iszero_kk(sij - 1.0)) { - const double rbound = rij2 * this->ebound_meam[elti][eltj]; + if (!iszero_kk(sij) && !isone_kk(sij)) { + const double rbound = rij2 * ebound_meam[elti][eltj]; delc = Cmax - Cmin; dxjk = x(k,0) - x(j,0); dyjk = x(k,1) - x(j,1); @@ -521,9 +551,9 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL a_f(k,1) -= force1 * dyik + force2 * dyjk; a_f(k,2) -= force1 * dzik + force2 * dzjk; - // Tabulate per-atom virial as symmetrized stress tensor + // Tabulate per-atom virial as symmetrized stress tensor - if (vflag_atom != 0) { + if (vflag_either) { fi[0] = force1 * dxik; fi[1] = force1 * dyik; fi[2] = force1 * dzik; @@ -537,18 +567,24 @@ MEAMKokkos::operator()(TagMEAMforce, const int &ii, EV_FL v[4] = -sixth * (dxik * fi[2] + dxjk * fj[2] + dzik * fi[0] + dzjk * fj[0]); v[5] = -sixth * (dyik * fi[2] + dyjk * fj[2] + dzik * fi[1] + dzjk * fj[1]); - for (m = 0; m < 6; m++) { - a_vatom(i,m) += v[m]; - a_vatom(j,m) += v[m]; - a_vatom(k,m) += v[m]; + if (vflag_global) + for (m = 0; m < 6; m++) + ev.v[m] += 3.0*v[m]; + + if (vflag_atom) { + for (m = 0; m < 6; m++) { + a_vatom(i,m) += v[m]; + a_vatom(j,m) += v[m]; + a_vatom(k,m) += v[m]; + } } } } } - // end of k loop + // end of k loop } } } - // end of j loop + // end of j loop } } diff --git a/src/KOKKOS/meam_funcs_kokkos.h b/src/KOKKOS/meam_funcs_kokkos.h index 9e41be9ea8..f02def0676 100644 --- a/src/KOKKOS/meam_funcs_kokkos.h +++ b/src/KOKKOS/meam_funcs_kokkos.h @@ -12,7 +12,7 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Sebastian Hütter (OvGU) + Contributing author: Naga Vydyanathan (NVIDIA) ------------------------------------------------------------------------- */ #include "math_special_kokkos.h" @@ -22,12 +22,12 @@ using namespace MathSpecialKokkos; //----------------------------------------------------------------------------- // Compute G(gamma) based on selection flag ibar: -// 0 => G = sqrt(1+gamma) -// 1 => G = exp(gamma/2) -// 2 => not implemented -// 3 => G = 2/(1+exp(-gamma)) -// 4 => G = sqrt(1+gamma) -// -5 => G = +-sqrt(abs(1+gamma)) +// 0 => G = sqrt(1+gamma) +// 1 => G = exp(gamma/2) +// 2 => not implemented +// 3 => G = 2/(1+exp(-gamma)) +// 4 => G = sqrt(1+gamma) +// -5 => G = +-sqrt(abs(1+gamma)) // template KOKKOS_INLINE_FUNCTION @@ -40,9 +40,9 @@ double MEAMKokkos::G_gam(const double gamma, const int ibar, int &er case 4: gsmooth_switchpoint = -gsmooth_factor / (gsmooth_factor + 1); if (gamma < gsmooth_switchpoint) { - // e.g. gsmooth_factor is 99, {: - // gsmooth_switchpoint = -0.99 - // G = 0.01*(-0.99/gamma)**99 + // e.g. gsmooth_factor is 99, {: + // gsmooth_switchpoint = -0.99 + // G = 0.01*(-0.99/gamma)**99 double G = 1 / (gsmooth_factor + 1) * pow((gsmooth_switchpoint / gamma), gsmooth_factor); return sqrt(G); } else { @@ -51,7 +51,7 @@ double MEAMKokkos::G_gam(const double gamma, const int ibar, int &er case 1: return MathSpecialKokkos::fm_exp(gamma / 2.0); case 3: - return 2.0 / (1.0 + exp(-gamma)); + return 2.0 / (1.0 + MathSpecialKokkos::fm_exp(-gamma)); case -5: if ((1.0 + gamma) >= 0) { return sqrt(1.0 + gamma); @@ -65,12 +65,12 @@ double MEAMKokkos::G_gam(const double gamma, const int ibar, int &er //----------------------------------------------------------------------------- // Compute G(gamma and dG(gamma) based on selection flag ibar: -// 0 => G = sqrt(1+gamma) -// 1 => G = exp(gamma/2) -// 2 => not implemented -// 3 => G = 2/(1+exp(-gamma)) -// 4 => G = sqrt(1+gamma) -// -5 => G = +-sqrt(abs(1+gamma)) +// 0 => G = sqrt(1+gamma) +// 1 => G = exp(gamma/2) +// 2 => not implemented +// 3 => G = 2/(1+exp(-gamma)) +// 4 => G = sqrt(1+gamma) +// -5 => G = +-sqrt(abs(1+gamma)) // template KOKKOS_INLINE_FUNCTION @@ -84,9 +84,9 @@ double MEAMKokkos::dG_gam(const double gamma, const int ibar, double case 4: gsmooth_switchpoint = -gsmooth_factor / (gsmooth_factor + 1); if (gamma < gsmooth_switchpoint) { - // e.g. gsmooth_factor is 99, {: - // gsmooth_switchpoint = -0.99 - // G = 0.01*(-0.99/gamma)**99 + // e.g. gsmooth_factor is 99, {: + // gsmooth_switchpoint = -0.99 + // G = 0.01*(-0.99/gamma)**99 G = 1 / (gsmooth_factor + 1) * pow((gsmooth_switchpoint / gamma), gsmooth_factor); G = sqrt(G); dG = -gsmooth_factor * G / (2.0 * gamma); @@ -144,6 +144,30 @@ double MEAMKokkos::zbl(const double r, const int z1, const int z2) c return result; } +//----------------------------------------------------------------------------- +// Compute embedding function F(rhobar) and derivative F'(rhobar), eqn I.5 +// +template +KOKKOS_INLINE_FUNCTION +double MEAMKokkos::embedding(const double A, const double Ec, const double rhobar, double& dF) const +{ + const double AEc = A * Ec; + + if (rhobar > 0.0) { + const double lrb = log(rhobar); + dF = AEc * (1.0 + lrb); + return AEc * rhobar * lrb; + } else { + if (emb_lin_neg == 0) { + dF = 0.0; + return 0.0; + } else { + dF = - AEc; + return - AEc * rhobar; + } + } +} + //----------------------------------------------------------------------------- // Compute Rose energy function, I.16 // @@ -178,7 +202,7 @@ double MEAMKokkos::erose(const double r, const double re, const doub // template KOKKOS_INLINE_FUNCTION -void MEAMKokkos::get_shpfcn(const lattice_t latt, double (&s)[3]) const +void MEAMKokkos::get_shpfcn(const lattice_t latt, const double sthe, const double cthe, double (&s)[3]) const { switch (latt) { case FCC: @@ -194,7 +218,9 @@ void MEAMKokkos::get_shpfcn(const lattice_t latt, double (&s)[3]) co s[1] = 0.0; s[2] = 1.0 / 3.0; break; + case CH4: // CH4 actually needs shape factor for diamond for C, dimer for H case DIA: + case DIA3: s[0] = 0.0; s[1] = 0.0; s[2] = 32.0 / 9.0; @@ -202,12 +228,24 @@ void MEAMKokkos::get_shpfcn(const lattice_t latt, double (&s)[3]) co case DIM: s[0] = 1.0; s[1] = 2.0 / 3.0; - // s(3) = 1.d0 - s[2] = 0.40; + // s(4) = 1.d0 // this should be 0.4 unless (1-legendre) is multiplied in the density calc. + s[2] = 0.40; // this is (1-legendre) where legendre = 0.6 in dynamo is accounted. + break; + case LIN: // linear, theta being 180 + s[0] = 0.0; + s[1] = 8.0 / 3.0; // 4*(co**4 + si**4 - 1.0/3.0) in zig become 4*(1-1/3) + s[2] = 0.0; + break; + case ZIG: //zig-zag + case TRI: //trimer e.g. H2O + s[0] = 4.0*pow(cthe,2); + s[1] = 4.0*(pow(cthe,4) + pow(sthe,4) - 1.0/3.0); + s[2] = 4.0*(pow(cthe,2) * (3*pow(sthe,4) + pow(cthe,4))); + s[2] = s[2] - 0.6*s[0]; //legend in dyn, 0.6 is default value. break; default: s[0] = 0.0; - // call error('Lattice not defined in get_shpfcn.') + // call error('Lattice not defined in get_shpfcn.') } } @@ -225,102 +263,26 @@ int MEAMKokkos::get_Zij(const lattice_t latt) const return 8; case HCP: return 12; - case B1: - return 6; case DIA: + case DIA3: return 4; case DIM: return 1; + case B1: + return 6; case C11: return 10; case L12: return 12; case B2: return 8; - // call error('Lattice not defined in get_Zij.') + case CH4: // DYNAMO currently implemented this way while it needs two Z values, 4 and 1 + return 4; + case LIN: + case ZIG: + case TRI: + return 2; + // call error('Lattice not defined in get_Zij.') } return 0; } - -//----------------------------------------------------------------------------- -// Number of second neighbors for the reference structure -// a = distance ratio R1/R2 -// S = second neighbor screening function -// -template -KOKKOS_INLINE_FUNCTION -int MEAMKokkos::get_Zij2(const lattice_t latt, const double cmin, const double cmax, double& a, double& S) const -{ - - double C, x, sijk; - int Zij2 = 0, numscr = 0; - - switch (latt) { - - case FCC: - Zij2 = 6; - a = sqrt(2.0); - numscr = 4; - break; - - case BCC: - Zij2 = 6; - a = 2.0 / sqrt(3.0); - numscr = 4; - break; - - case HCP: - Zij2 = 6; - a = sqrt(2.0); - numscr = 4; - break; - - case B1: - Zij2 = 12; - a = sqrt(2.0); - numscr = 2; - break; - - case DIA: - Zij2 = 12; - a = sqrt(8.0 / 3.0); - numscr = 1; - if (cmin < 0.500001) { - // call error('can not do 2NN MEAM for dia') - } - break; - - case DIM: - // this really shouldn't be allowed; make sure screening is zero - a = 1.0; - S = 0.0; - return 0; - - case L12: - Zij2 = 6; - a = sqrt(2.0); - numscr = 4; - break; - - case B2: - Zij2 = 6; - a = 2.0 / sqrt(3.0); - numscr = 4; - break; - case C11: - // unsupported lattice flag C11 in get_Zij - break; - default: - // unknown lattic flag in get Zij - // call error('Lattice not defined in get_Zij.') - break; - } - - // Compute screening for each first neighbor - C = 4.0 / (a * a) - 1.0; - x = (C - cmin) / (cmax - cmin); - sijk = fcut(x); - // There are numscr first neighbors screening the second neighbors - S = MathSpecialKokkos::powint(sijk, numscr); - return Zij2; -} diff --git a/src/KOKKOS/meam_impl_kokkos.h b/src/KOKKOS/meam_impl_kokkos.h index 0d2ec99445..0a787ba2eb 100644 --- a/src/KOKKOS/meam_impl_kokkos.h +++ b/src/KOKKOS/meam_impl_kokkos.h @@ -12,11 +12,12 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Sebastian Hütter (OvGU) + Contributing author: Naga Vydyanathan (NVIDIA), Stan Moore (SNL) ------------------------------------------------------------------------- */ #include "memory_kokkos.h" #include "meam_kokkos.h" + using namespace LAMMPS_NS; /* ---------------------------------------------------------------------- */ @@ -24,21 +25,15 @@ using namespace LAMMPS_NS; template MEAMKokkos::MEAMKokkos(Memory *mem) : MEAM(mem) { + d_errorflag = typename AT::t_int_scalar("meam:errorflag"); } template MEAMKokkos::~MEAMKokkos() { + if (copymode) return; + MemoryKokkos *memoryKK = (MemoryKokkos *)memory; - - memoryKK->destroy_kokkos(k_phirar6,phirar6); - memoryKK->destroy_kokkos(k_phirar5,phirar5); - memoryKK->destroy_kokkos(k_phirar4,phirar4); - memoryKK->destroy_kokkos(k_phirar3,phirar3); - memoryKK->destroy_kokkos(k_phirar2,phirar2); - memoryKK->destroy_kokkos(k_phirar1,phirar1); - memoryKK->destroy_kokkos(k_phirar,phirar); - memoryKK->destroy_kokkos(k_phir,phir); memoryKK->destroy_kokkos(k_rho,rho); memoryKK->destroy_kokkos(k_rho0,rho0); @@ -63,10 +58,10 @@ MEAMKokkos::~MEAMKokkos() memoryKK->destroy_kokkos(k_dscrfcn,dscrfcn); memoryKK->destroy_kokkos(k_fcpair,fcpair); } + #include "meam_setup_done_kokkos.h" #include "meam_funcs_kokkos.h" #include "meam_dens_init_kokkos.h" #include "meam_dens_final_kokkos.h" #include "meam_force_kokkos.h" -// diff --git a/src/KOKKOS/meam_kokkos.h b/src/KOKKOS/meam_kokkos.h index 0231f4b767..fb34f45b5c 100644 --- a/src/KOKKOS/meam_kokkos.h +++ b/src/KOKKOS/meam_kokkos.h @@ -14,68 +14,77 @@ namespace LAMMPS_NS { struct TagMEAMDensFinal{}; template struct TagMEAMDensInit{}; -struct TagMEAMInitialize{}; +struct TagMEAMZero{}; template -struct TagMEAMforce{}; +struct TagMEAMForce{}; template class MEAMKokkos : public MEAM { -public: + public: typedef ArrayTypes AT; typedef EV_FLOAT value_type; MEAMKokkos(Memory* mem); - ~MEAMKokkos(); + ~MEAMKokkos() override; KOKKOS_INLINE_FUNCTION void operator()(TagMEAMDensFinal, const int&, EV_FLOAT&) const; template KOKKOS_INLINE_FUNCTION - void operator()(TagMEAMDensInit, const int&, EV_FLOAT&) const; + void operator()(TagMEAMDensInit, const int&) const; KOKKOS_INLINE_FUNCTION - void operator()(TagMEAMInitialize, const int&) const; + void operator()(TagMEAMZero, const int&) const; template KOKKOS_INLINE_FUNCTION - void operator()(TagMEAMforce, const int&, EV_FLOAT&) const; -protected: -//Parameters to meam_dens_init - is there a better way to do this? - int ntype; - typename AT::t_int_1d_randomread type; - typename AT::t_int_1d_randomread d_offset; - typename AT::t_int_1d_randomread fmap; - typename AT::t_x_array_randomread x; - typename AT::t_int_1d_randomread d_numneigh_half; - typename AT::t_int_1d_randomread d_numneigh_full; + void operator()(TagMEAMForce, const int&, EV_FLOAT&) const; + private: + + // parameters to meam_dens_init + + int ntype, nlocal; + typename AT::t_int_1d type; + typename AT::t_int_1d d_offset; + typename AT::t_int_1d d_map; + typename AT::t_int_2d d_scale; + typename AT::t_x_array x; + typename AT::t_int_1d d_numneigh_half; + typename AT::t_int_1d d_numneigh_full; typename AT::t_neighbors_2d d_neighbors_half; typename AT::t_neighbors_2d d_neighbors_full; - typename AT::t_int_1d_randomread d_ilist_half; + typename AT::t_int_1d d_ilist_half; typename AT::t_f_array f; typename ArrayTypes::t_virial_array d_vatom; -//Parameters to meam_dens_final - is there a better way to do this? - int eflag_either, eflag_global, eflag_atom, vflag_atom; - double *eng_vdwl; + + // parameters to meam_dens_final + + typename AT::t_int_scalar d_errorflag; + int eflag_either, eflag_global, eflag_atom, vflag_either, vflag_global, vflag_atom; typename ArrayTypes::t_efloat_1d d_eatom; -public: - void meam_dens_setup(int, int, int); - void meam_setup_done(void); - void meam_dens_init(int , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread, typename AT::t_x_array_randomread, typename AT::t_int_1d_randomread, - typename AT::t_int_1d_randomread , int* , typename AT::t_int_1d_randomread, typename AT::t_neighbors_2d,typename AT::t_neighbors_2d,typename AT::t_int_1d_randomread, int ); - void meam_dens_final(int , int , int , int , double* , - typename ArrayTypes::t_efloat_1d , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread , int& ); - void meam_force(int , int , int , int , int , double* , - typename ArrayTypes::t_efloat_1d , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread , typename AT::t_x_array_randomread , typename AT::t_int_1d_randomread , - typename AT::t_int_1d_randomread , typename AT::t_f_array , typename ArrayTypes::t_virial_array ,typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread, typename AT::t_neighbors_2d, typename AT::t_neighbors_2d, int); + public: + void meam_dens_setup(int, int, int) override; + void meam_setup_done(double*) override; + void meam_dens_init(int, int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array, + typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_int_1d, + typename AT::t_neighbors_2d, typename AT::t_neighbors_2d, typename AT::t_int_1d, + int, int); + void meam_dens_final(int, int, int, int, typename ArrayTypes::t_efloat_1d, + int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_int_2d, int&, EV_FLOAT&); + void meam_force(int, int, int, int, int, typename ArrayTypes::t_efloat_1d, + int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array, typename AT::t_int_1d, + typename AT::t_int_1d, typename AT::t_f_array, typename ArrayTypes::t_virial_array, + typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_neighbors_2d, typename AT::t_neighbors_2d, + int, int, EV_FLOAT&); template KOKKOS_INLINE_FUNCTION - void getscreen(int , int, typename AT::t_x_array_randomread , typename AT::t_int_1d_randomread, - typename AT::t_int_1d_randomread, int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread ) const; + void getscreen(int, int, typename AT::t_x_array, typename AT::t_int_1d, + typename AT::t_int_1d, int, typename AT::t_int_1d, typename AT::t_int_1d) const; template KOKKOS_INLINE_FUNCTION - void calc_rho1(int , int , typename AT::t_int_1d_randomread , typename AT::t_int_1d_randomread , typename AT::t_x_array_randomread , typename AT::t_int_1d_randomread, int ) const; + void calc_rho1(int, int, typename AT::t_int_1d, typename AT::t_int_1d, typename AT::t_x_array, typename AT::t_int_1d, int) const; KOKKOS_INLINE_FUNCTION double fcut(const double xi) const; KOKKOS_INLINE_FUNCTION @@ -89,18 +98,19 @@ public: KOKKOS_INLINE_FUNCTION double dG_gam(const double, const int, double&) const; KOKKOS_INLINE_FUNCTION - double zbl(const double, const int, const int) const; + double zbl(const double, const int, const int) const; + KOKKOS_INLINE_FUNCTION + double embedding(const double, const double, const double, double&) const; KOKKOS_INLINE_FUNCTION double erose(const double, const double, const double, const double, const double, const double, const int) const; KOKKOS_INLINE_FUNCTION - void get_shpfcn(const lattice_t latt, double (&s)[3]) const; + void get_shpfcn(const lattice_t latt, const double sthe, const double cthe, + double (&s)[3]) const; KOKKOS_INLINE_FUNCTION - int get_Zij(const lattice_t ) const; - KOKKOS_INLINE_FUNCTION - int get_Zij2(const lattice_t, const double, const double, double&, double&) const; -public: + int get_Zij(const lattice_t) const; + public: DAT::tdual_ffloat_1d k_rho, k_rho0, k_rho1, k_rho2, k_rho3, k_frhop; - typename ArrayTypes::t_ffloat_1d d_rho, d_rho0,d_rho1, d_rho2, d_rho3, d_frhop; + typename ArrayTypes::t_ffloat_1d d_rho, d_rho0, d_rho1, d_rho2, d_rho3, d_frhop; HAT::t_ffloat_1d h_rho, h_rho0, h_rho1, h_rho2, h_rho3, h_frhop; DAT::tdual_ffloat_1d k_gamma, k_dgamma1, k_dgamma2, k_dgamma3, k_arho2b; typename ArrayTypes::t_ffloat_1d d_gamma, d_dgamma1, d_dgamma2, d_dgamma3, d_arho2b; @@ -108,20 +118,54 @@ public: DAT::tdual_ffloat_2d k_arho1, k_arho2, k_arho3, k_arho3b, k_t_ave, k_tsq_ave; typename ArrayTypes::t_ffloat_2d d_arho1, d_arho2, d_arho3, d_arho3b, d_t_ave, d_tsq_ave; HAT::t_ffloat_2d h_arho1, h_arho2, h_arho3, h_arho3b, h_t_ave, h_tsq_ave; - DAT::tdual_ffloat_2d k_phir, k_phirar, k_phirar1, k_phirar2, k_phirar3, k_phirar4, k_phirar5, k_phirar6; typename ArrayTypes::t_ffloat_2d d_phir, d_phirar, d_phirar1, d_phirar2, d_phirar3, d_phirar4, d_phirar5, d_phirar6; - HAT::t_ffloat_2d h_phir, h_phirar, h_phirar1, h_phirar2, h_phirar3, h_phirar4, h_phirar5, h_phirar6; DAT::tdual_ffloat_1d k_scrfcn, k_dscrfcn, k_fcpair; typename ArrayTypes::t_ffloat_1d d_scrfcn, d_dscrfcn, d_fcpair; HAT::t_ffloat_1d h_scrfcn, h_dscrfcn, h_fcpair; - + protected: + int need_dup; + using KKDeviceType = typename KKDevice::value; + + template + using DupScatterView = KKScatterView; + + template + using NonDupScatterView = KKScatterView; + + DupScatterView dup_rho0; + NonDupScatterView ndup_rho0; + DupScatterView dup_arho2b; + NonDupScatterView ndup_arho2b; + DupScatterView dup_arho1; + NonDupScatterView ndup_arho1; + DupScatterView dup_arho2; + NonDupScatterView ndup_arho2; + DupScatterView dup_arho3; + NonDupScatterView ndup_arho3; + DupScatterView dup_arho3b; + NonDupScatterView ndup_arho3b; + DupScatterView dup_t_ave; + NonDupScatterView ndup_t_ave; + DupScatterView dup_tsq_ave; + NonDupScatterView ndup_tsq_ave; + DupScatterView dup_f; + NonDupScatterView ndup_f; + DupScatterView dup_eatom; + NonDupScatterView ndup_eatom; + DupScatterView dup_vatom; + NonDupScatterView ndup_vatom; }; + KOKKOS_INLINE_FUNCTION static bool iszero_kk(const double f) { return fabs(f) < 1e-20; } +KOKKOS_INLINE_FUNCTION +static bool isone_kk(const double f) { + return fabs(f-1.0) < 1e-20; +} KOKKOS_INLINE_FUNCTION static double fdiv_zero_kk(const double n, const double d) { @@ -134,9 +178,5 @@ static double fdiv_zero_kk(const double n, const double d) { } #include "meam_impl_kokkos.h" -//#include "meam_setup_done_kokkos.h" -//#include "meam_funcs_kokkos.h" -//#include "meam_dens_init_kokkos.h" -//#include "meam_dens_final_kokkos.h" -//#include "meam_force_kokkos.h" + #endif diff --git a/src/KOKKOS/meam_setup_done_kokkos.h b/src/KOKKOS/meam_setup_done_kokkos.h index 72852bf2cb..7d5de12427 100644 --- a/src/KOKKOS/meam_setup_done_kokkos.h +++ b/src/KOKKOS/meam_setup_done_kokkos.h @@ -1,71 +1,46 @@ #include "meam_kokkos.h" template -void MEAMKokkos::meam_setup_done(void) +void MEAMKokkos::meam_setup_done(double* cutmax) { - MemoryKokkos *memoryKK = (MemoryKokkos *)memory; + MEAM::meam_setup_done(cutmax); - memoryKK->destroy_kokkos(k_phirar6,phirar6); - memoryKK->destroy_kokkos(k_phirar5,phirar5); - memoryKK->destroy_kokkos(k_phirar4,phirar4); - memoryKK->destroy_kokkos(k_phirar3,phirar3); - memoryKK->destroy_kokkos(k_phirar2,phirar2); - memoryKK->destroy_kokkos(k_phirar1,phirar1); - memoryKK->destroy_kokkos(k_phirar,phirar); - memoryKK->destroy_kokkos(k_phir,phir); + MemKK::realloc_kokkos(d_phir, "pair:phir", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar, "pair:phirar", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar1, "pair:phirar1", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar2, "pair:phirar2", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar3, "pair:phirar3", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar4, "pair:phirar4", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar5, "pair:phirar5", (neltypes * (neltypes + 1)) / 2, nr); + MemKK::realloc_kokkos(d_phirar6, "pair:phirar6", (neltypes * (neltypes + 1)) / 2, nr); - memoryKK->create_kokkos(k_phir, phir, (neltypes * (neltypes + 1)) / 2, nr, "pair:phir"); - memoryKK->create_kokkos(k_phirar, phirar, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar"); - memoryKK->create_kokkos(k_phirar1, phirar1, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar1"); - memoryKK->create_kokkos(k_phirar2, phirar2, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar2"); - memoryKK->create_kokkos(k_phirar3, phirar3, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar3"); - memoryKK->create_kokkos(k_phirar4, phirar4, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar4"); - memoryKK->create_kokkos(k_phirar5, phirar5, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar5"); - memoryKK->create_kokkos(k_phirar6, phirar6, (neltypes * (neltypes + 1)) / 2, nr, "pair:phirar6"); - - h_phir = k_phir.h_view; - h_phirar = k_phirar.h_view; - h_phirar1 = k_phirar1.h_view; - h_phirar2 = k_phirar2.h_view; - h_phirar3 = k_phirar3.h_view; - h_phirar4 = k_phirar4.h_view; - h_phirar5 = k_phirar5.h_view; - h_phirar6 = k_phirar6.h_view; + auto h_phir = Kokkos::create_mirror_view(d_phir); + auto h_phirar = Kokkos::create_mirror_view(d_phirar); + auto h_phirar1 = Kokkos::create_mirror_view(d_phirar1); + auto h_phirar2 = Kokkos::create_mirror_view(d_phirar2); + auto h_phirar3 = Kokkos::create_mirror_view(d_phirar3); + auto h_phirar4 = Kokkos::create_mirror_view(d_phirar4); + auto h_phirar5 = Kokkos::create_mirror_view(d_phirar5); + auto h_phirar6 = Kokkos::create_mirror_view(d_phirar6); for (int i = 0; i <(neltypes * (neltypes + 1)) / 2; i++) - for(int j = 0; j < nr; j++) - { - h_phir(i,j) = phir[i][j]; - h_phirar(i,j) = phirar[i][j]; - h_phirar1(i,j) = phirar1[i][j]; - h_phirar2(i,j) = phirar2[i][j]; - h_phirar3(i,j) = phirar3[i][j]; - h_phirar4(i,j) = phirar4[i][j]; - h_phirar5(i,j) = phirar5[i][j]; - h_phirar6(i,j) = phirar6[i][j]; + for(int j = 0; j < nr; j++) { + h_phir(i,j) = phir[i][j]; + h_phirar(i,j) = phirar[i][j]; + h_phirar1(i,j) = phirar1[i][j]; + h_phirar2(i,j) = phirar2[i][j]; + h_phirar3(i,j) = phirar3[i][j]; + h_phirar4(i,j) = phirar4[i][j]; + h_phirar5(i,j) = phirar5[i][j]; + h_phirar6(i,j) = phirar6[i][j]; } - k_phir.template modify(); - k_phir.template sync(); - d_phir = k_phir.template view(); - k_phirar.template modify(); - k_phirar.template sync(); - d_phirar = k_phirar.template view(); - k_phirar1.template modify(); - k_phirar1.template sync(); - d_phirar1 = k_phirar1.template view(); - k_phirar2.template modify(); - k_phirar2.template sync(); - d_phirar2 = k_phirar2.template view(); - k_phirar3.template modify(); - k_phirar3.template sync(); - d_phirar3 = k_phirar3.template view(); - k_phirar4.template modify(); - k_phirar4.template sync(); - d_phirar4 = k_phirar4.template view(); - k_phirar5.template modify(); - k_phirar5.template sync(); - d_phirar5 = k_phirar5.template view(); - k_phirar6.template modify(); - k_phirar6.template sync(); - d_phirar6 = k_phirar6.template view(); + + Kokkos::deep_copy(d_phir,h_phir); + Kokkos::deep_copy(d_phirar,h_phirar); + Kokkos::deep_copy(d_phirar1,h_phirar1); + Kokkos::deep_copy(d_phirar2,h_phirar2); + Kokkos::deep_copy(d_phirar3,h_phirar3); + Kokkos::deep_copy(d_phirar4,h_phirar4); + Kokkos::deep_copy(d_phirar5,h_phirar5); + Kokkos::deep_copy(d_phirar6,h_phirar6); } diff --git a/src/KOKKOS/pair_meam_kokkos.cpp b/src/KOKKOS/pair_meam_kokkos.cpp index 38386d0a7a..33343f4af5 100644 --- a/src/KOKKOS/pair_meam_kokkos.cpp +++ b/src/KOKKOS/pair_meam_kokkos.cpp @@ -1,6 +1,6 @@ /* ---------------------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories + https://www.lammps.org/, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract @@ -12,86 +12,67 @@ ------------------------------------------------------------------------- */ /* ---------------------------------------------------------------------- - Contributing author: Greg Wagner (SNL) + Contributing authors: Naga Vydyanathan (NVIDIA), Stan Moore (SNL) ------------------------------------------------------------------------- */ -#include -#include -#include -#include -//KK* -#include "meam_kokkos.h" -#include "kokkos.h" -#include "pair_kokkos.h" -//#include "pair_meamc.h" - #include "pair_meam_kokkos.h" +#include "meam_kokkos.h" + #include "atom_kokkos.h" -//*KK -#include "force.h" -#include "comm.h" -//KK* -//#include "memory.h" -#include "memory_kokkos.h" -//*KK -#include "neighbor.h" -//KK* -//#include "neigh_list.h" -#include "neigh_list_kokkos.h" -//*KK -#include "neigh_request.h" -#include "error.h" -//*KK #include "atom_masks.h" -//*KK +#include "comm.h" +#include "error.h" +#include "force.h" +#include "kokkos.h" +#include "memory_kokkos.h" +#include "neigh_list_kokkos.h" +#include "neigh_request.h" +#include "neighbor.h" + +#include using namespace LAMMPS_NS; -#if 0 -static const int nkeywords = 21; -static const char *keywords[] = { - "Ec","alpha","rho0","delta","lattce", - "attrac","repuls","nn2","Cmin","Cmax","rc","delr", - "augt1","gsmooth_factor","re","ialloy", - "mixture_ref_t","erose_form","zbl", - "emb_lin_neg","bkgd_dyn"}; -#endif - /* ---------------------------------------------------------------------- */ + template PairMEAMKokkos::PairMEAMKokkos(LAMMPS *lmp) : PairMEAM(lmp) { respa_enable = 0; + kokkosable = 1; + reverse_comm_device = 1; atomKK = (AtomKokkos *) atom; execution_space = ExecutionSpaceFromDevice::space; datamask_read = X_MASK | F_MASK | TYPE_MASK | ENERGY_MASK | VIRIAL_MASK; datamask_modify = F_MASK | ENERGY_MASK | VIRIAL_MASK; - meam_inst_kk = new MEAMKokkos(memory); + delete meam_inst; + meam_inst_kk = new MEAMKokkos(memory); meam_inst = meam_inst_kk; } + /* ---------------------------------------------------------------------- */ template PairMEAMKokkos::~PairMEAMKokkos() { - if (!copymode) { - memoryKK->destroy_kokkos(k_eatom,eatom); - memoryKK->destroy_kokkos(k_vatom,vatom); - delete meam_inst_kk; - } + if (copymode) return; + + memoryKK->destroy_kokkos(k_eatom,eatom); + memoryKK->destroy_kokkos(k_vatom,vatom); + delete meam_inst_kk; + meam_inst = nullptr; } /* ---------------------------------------------------------------------- */ -/* ---------------------------------------------------------------------- */ template void PairMEAMKokkos::compute(int eflag_in, int vflag_in) { eflag = eflag_in; vflag = vflag_in; - + if (neighflag == FULL) no_virial_fdotr_compute = 1; ev_init(eflag,vflag,0); @@ -105,36 +86,32 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) } if (vflag_atom) { memoryKK->destroy_kokkos(k_vatom,vatom); - memoryKK->create_kokkos(k_vatom,vatom,maxvatom,6,"pair:vatom"); + memoryKK->create_kokkos(k_vatom,vatom,maxvatom,"pair:vatom"); d_vatom = k_vatom.view(); } - atomKK->sync(execution_space,datamask_read); - if (eflag || vflag) atomKK->modified(execution_space,datamask_modify); - else atomKK->modified(execution_space,F_MASK); - - // neighbor list info - NeighListKokkos* k_halflist = static_cast*>(listhalf); int inum_half = listhalf->inum; - int* numneigh_half = listhalf->numneigh; - int* ilist_half = listhalf->ilist; - + NeighListKokkos* k_halflist = static_cast*>(listhalf); d_ilist_half = k_halflist->d_ilist; d_numneigh_half = k_halflist->d_numneigh; d_neighbors_half = k_halflist->d_neighbors; + NeighListKokkos* k_fulllist = static_cast*>(listfull); d_numneigh_full = k_fulllist->d_numneigh; d_neighbors_full = k_fulllist->d_neighbors; + EV_FLOAT ev; + copymode = 1; + meam_inst_kk->copymode = 1; // strip neighbor lists of any special bond flags before using with MEAM // necessary before doing neigh_f2c and neigh_c2f conversions each step - if (neighbor->ago == 0) { - Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_half),*this); - } + + if (neighbor->ago == 0) + Kokkos::parallel_for(Kokkos::RangePolicy(0,inum_half),*this); // check size of scrfcn based on half neighbor list @@ -142,152 +119,137 @@ void PairMEAMKokkos::compute(int eflag_in, int vflag_in) nall = nlocal + atom->nghost; int n = 0; - //for (ii = 0; ii < inum_half; ii++) n += numneigh_half[ilist_half[ii]]; - Kokkos::parallel_reduce(Kokkos::RangePolicy(0,inum_half), *this, n); + Kokkos::parallel_reduce(Kokkos::RangePolicy(0,inum_half),*this,n); meam_inst_kk->meam_dens_setup(atom->nmax, nall, n); - //double **x = atom->x; x = atomKK->k_x.view(); - - //double **f = atom->f; f = atomKK->k_f.view(); - - //int *type = atom->type; type = atomKK->k_type.view(); + atomKK->sync(execution_space,datamask_read); + int ntype = atom->ntypes; // 3 stages of MEAM calculation // loop over my atoms followed by communication - int offset = 0; int errorflag = 0; -#if 0 - for (ii = 0; ii < inum_half; ii++) { - i = ilist_half[ii]; - meam_inst->meam_dens_init(i,ntype,type,map,x, - numneigh_half[i],firstneigh_half[i], - numneigh_full[i],firstneigh_full[i], - offset); - offset += numneigh_half[i]; - } -#endif - // To do: create the cumulative offset array in host and device - k_offset = DAT::tdual_int_1d("pair:offset",inum_half+1); - h_offset = k_offset.h_view; - d_offset = k_offset.template view(); - ArrayTypes::t_int_1d h_ilist; - ArrayTypes::t_int_1d h_numneigh; - h_ilist = Kokkos::create_mirror_view(k_halflist->d_ilist); - h_numneigh = Kokkos::create_mirror_view(k_halflist->d_numneigh); - Kokkos::deep_copy(h_ilist,k_halflist->d_ilist); - Kokkos::deep_copy(h_numneigh,k_halflist->d_numneigh); + d_offset = typename AT::t_int_1d("pair:offset",inum_half+1); + { + // local variables for lambda capture - h_offset[0] = 0; - for (int ii = 0; ii < inum_half; ii++) { - int i = h_ilist[ii]; - h_offset[ii+1] = h_offset[ii] + h_numneigh[i]; + auto l_ilist_half = d_ilist_half; + auto l_numneigh_half = d_numneigh_half; + auto l_offset = d_offset; + + Kokkos::parallel_scan(inum_half, LAMMPS_LAMBDA(int ii, int &m_fill, bool final) { + int i = l_ilist_half[ii]; + m_fill += l_numneigh_half[i]; + if (final) + l_offset[ii+1] = m_fill; + }); } - k_offset.template modify(); - k_offset.template sync(); - meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,&offset,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag); + + int need_dup = lmp->kokkos->need_dup(); + + meam_inst_kk->meam_dens_init(inum_half,ntype,type,d_map,x,d_numneigh_half,d_numneigh_full,d_ilist_half,d_neighbors_half, d_neighbors_full, d_offset, neighflag, need_dup); + meam_inst_kk->k_rho0.template modify(); - meam_inst_kk->k_rho0.template sync(); - meam_inst_kk->k_arho2b.template modify(); - meam_inst_kk->k_arho2b.template sync(); - meam_inst_kk->k_arho1.template modify(); - meam_inst_kk->k_arho1.template sync(); - meam_inst_kk->k_arho2.template modify(); - meam_inst_kk->k_arho2.template sync(); - meam_inst_kk->k_arho3.template modify(); - meam_inst_kk->k_arho3.template sync(); - meam_inst_kk->k_arho3b.template modify(); - meam_inst_kk->k_arho3b.template sync(); - meam_inst_kk->k_t_ave.template modify(); - meam_inst_kk->k_t_ave.template sync(); - meam_inst_kk->k_tsq_ave.template modify(); - meam_inst_kk->k_tsq_ave.template sync(); comm->reverse_comm(this); - meam_inst_kk->k_rho0.template modify(); meam_inst_kk->k_rho0.template sync(); - - meam_inst_kk->k_arho2b.template modify(); meam_inst_kk->k_arho2b.template sync(); - - meam_inst_kk->k_arho1.template modify(); meam_inst_kk->k_arho1.template sync(); - - meam_inst_kk->k_arho2.template modify(); meam_inst_kk->k_arho2.template sync(); - - meam_inst_kk->k_arho3.template modify(); meam_inst_kk->k_arho3.template sync(); - - meam_inst_kk->k_arho3b.template modify(); meam_inst_kk->k_arho3b.template sync(); - - meam_inst_kk->k_t_ave.template modify(); meam_inst_kk->k_t_ave.template sync(); - - meam_inst_kk->k_tsq_ave.template modify(); meam_inst_kk->k_tsq_ave.template sync(); - meam_inst_kk->meam_dens_final(nlocal,eflag_either,eflag_global,eflag_atom, - &eng_vdwl,d_eatom,ntype,type,d_map,errorflag); - if (errorflag) { - char str[128]; - sprintf(str,"MEAM library error %d",errorflag); - error->one(FLERR,str); - } + d_eatom,ntype,type,d_map,d_scale,errorflag,ev); + + if (errorflag) + error->one(FLERR,"MEAM library error {}",errorflag); + + meam_inst_kk->k_rho0.template modify(); + meam_inst_kk->k_rho1.template modify(); + meam_inst_kk->k_rho2.template modify(); + meam_inst_kk->k_rho3.template modify(); + meam_inst_kk->k_frhop.template modify(); + meam_inst_kk->k_gamma.template modify(); + meam_inst_kk->k_dgamma1.template modify(); + meam_inst_kk->k_dgamma2.template modify(); + meam_inst_kk->k_dgamma3.template modify(); + meam_inst_kk->k_arho2b.template modify(); + meam_inst_kk->k_arho1.template modify(); + meam_inst_kk->k_arho2.template modify(); + meam_inst_kk->k_arho3.template modify(); + meam_inst_kk->k_arho3b.template modify(); + meam_inst_kk->k_t_ave.template modify(); + meam_inst_kk->k_tsq_ave.template modify(); comm->forward_comm(this); - offset = 0; + meam_inst_kk->k_rho0.template sync(); + meam_inst_kk->k_rho1.template sync(); + meam_inst_kk->k_rho2.template sync(); + meam_inst_kk->k_rho3.template sync(); + meam_inst_kk->k_frhop.template sync(); + meam_inst_kk->k_gamma.template sync(); + meam_inst_kk->k_dgamma1.template sync(); + meam_inst_kk->k_dgamma2.template sync(); + meam_inst_kk->k_dgamma3.template sync(); + meam_inst_kk->k_arho2b.template sync(); + meam_inst_kk->k_arho1.template sync(); + meam_inst_kk->k_arho2.template sync(); + meam_inst_kk->k_arho3.template sync(); + meam_inst_kk->k_arho3b.template sync(); + meam_inst_kk->k_t_ave.template sync(); + meam_inst_kk->k_tsq_ave.template sync(); - // vptr is first value in vatom if it will be used by meam_force() - // else vatom may not exist, so pass dummy ptr + meam_inst_kk->meam_force(inum_half,eflag_global,eflag_atom,vflag_global, + vflag_atom,d_eatom,ntype,type,d_map,x, + d_numneigh_half, d_numneigh_full,f,d_vatom, + d_ilist_half, d_offset, d_neighbors_half, d_neighbors_full, + neighflag, need_dup, ev); -#if 0 // To do: is this correct? vflag_atom is used to access vatom - typename ArrayTypes::t_virial_array vptr; - if (vflag_atom) vptr = d_vatom; - else vptr = NULL; - for (ii = 0; ii < inum_half; ii++) { - i = ilist_half[ii]; - meam_inst->meam_force(i,eflag_either,eflag_global,eflag_atom, - vflag_atom,&eng_vdwl,eatom,ntype,type,map,x, - numneigh_half[i],firstneigh_half[i], - numneigh_full[i],firstneigh_full[i], - offset,f,vptr); - offset += numneigh_half[i]; + if (eflag_global) eng_vdwl += ev.evdwl; + if (vflag_global) { + virial[0] += ev.v[0]; + virial[1] += ev.v[1]; + virial[2] += ev.v[2]; + virial[3] += ev.v[3]; + virial[4] += ev.v[4]; + virial[5] += ev.v[5]; } -#endif - meam_inst_kk->meam_force(inum_half, eflag_either,eflag_global,eflag_atom, - vflag_atom,&eng_vdwl,d_eatom,ntype,type,d_map,x, - d_numneigh_half, d_numneigh_full,f,d_vatom,d_ilist_half, d_offset, d_neighbors_half, d_neighbors_full, neighflag); if (vflag_fdotr) pair_virial_fdotr_compute(this); if (eflag_atom) { k_eatom.template modify(); - k_eatom.template sync(); + k_eatom.sync_host(); } if (vflag_atom) { k_vatom.template modify(); - k_vatom.template sync(); + k_vatom.sync_host(); } + if (eflag || vflag) atomKK->modified(execution_space,datamask_modify); + else atomKK->modified(execution_space,F_MASK); + + copymode = 0; + meam_inst_kk->copymode = 0; } /* ---------------------------------------------------------------------- @@ -298,24 +260,22 @@ void PairMEAMKokkos::coeff(int narg, char **arg) { PairMEAM::coeff(narg,arg); - //sync map + // sync map and scale int n = atom->ntypes; - k_map = DAT::tdual_int_1d("pair:map",n+1); - HAT::t_int_1d h_map = k_map.h_view; + MemKK::realloc_kokkos(d_map,"pair:map",n+1); + MemKK::realloc_kokkos(d_scale,"pair:scale",n+1,n+1); + auto h_map = Kokkos::create_mirror_view(d_map); + auto h_scale = Kokkos::create_mirror_view(d_scale); - for (int i = 1; i <= n; i++) + for (int i = 1; i <= n; i++) { h_map[i] = map[i]; + for (int j = 1; j <= n; j++) + h_scale(i,j) = scale[i][j]; + } - k_map.template modify(); - k_map.template sync(); - - d_map = k_map.template view(); - - // To do: need to synchronize phirar variables - - meam_inst_kk->meam_setup_done(); - + Kokkos::deep_copy(d_map,h_map); + Kokkos::deep_copy(d_scale,h_scale); } /* ---------------------------------------------------------------------- @@ -324,24 +284,27 @@ void PairMEAMKokkos::coeff(int narg, char **arg) template void PairMEAMKokkos::init_style() { - PairMEAM::init_style(); + // adjust neighbor list request for KOKKOS + neighflag = lmp->kokkos->neighflag; - auto request = neighbor->find_request(this); - - // MEAM needs both a full and half neighbor list? Not sure how to get that. - if (!(neighflag == FULL || neighflag == HALF || neighflag == HALFTHREAD)) - error->all(FLERR, "Cannot use chosen neighbor list style with pair meam/kk"); - + auto request = neighbor->find_request(this,1); request->set_kokkos_host(std::is_same::value && !std::is_same::value); request->set_kokkos_device(std::is_same::value); - if (neighflag == FULL) request->enable_full(); + request = neighbor->find_request(this,2); + request->set_kokkos_host(std::is_same::value && + !std::is_same::value); + request->set_kokkos_device(std::is_same::value); + + if (neighflag == FULL) + error->all(FLERR,"Must use half neighbor list style with pair meam/kk"); } /* ---------------------------------------------------------------------- */ + template int PairMEAMKokkos::pack_forward_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf, int pbc_flag, int *pbc) @@ -350,9 +313,11 @@ int PairMEAMKokkos::pack_forward_comm_kokkos(int n, DAT::tdual_int_2 iswap = iswap_in; v_buf = buf.view(); Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); - return n; + return n*38; } +/* ---------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMPackForwardComm, const int &i) const { @@ -361,35 +326,36 @@ void PairMEAMKokkos::operator()(TagPairMEAMPackForwardComm, const in v_buf[m++] = meam_inst_kk->d_rho0[j]; v_buf[m++] = meam_inst_kk->d_rho1[j]; v_buf[m++] = meam_inst_kk->d_rho2[j]; - v_buf[m++] = meam_inst_kk->d_rho3[j]; - v_buf[m++] = meam_inst_kk->d_frhop[j]; - v_buf[m++] = meam_inst_kk->d_gamma[j]; - v_buf[m++] = meam_inst_kk->d_dgamma1[j]; - v_buf[m++] = meam_inst_kk->d_dgamma2[j]; - v_buf[m++] = meam_inst_kk->d_dgamma3[j]; - v_buf[m++] = meam_inst_kk->d_arho2b[j]; - v_buf[m++] = meam_inst_kk->d_arho1(j,0); - v_buf[m++] = meam_inst_kk->d_arho1(j,1); - v_buf[m++] = meam_inst_kk->d_arho1(j,2); - v_buf[m++] = meam_inst_kk->d_arho2(j,0); - v_buf[m++] = meam_inst_kk->d_arho2(j,1); - v_buf[m++] = meam_inst_kk->d_arho2(j,2); - v_buf[m++] = meam_inst_kk->d_arho2(j,3); - v_buf[m++] = meam_inst_kk->d_arho2(j,4); - v_buf[m++] = meam_inst_kk->d_arho2(j,5); - for (int k = 0; k < 10; k++) v_buf[m++] = meam_inst_kk->d_arho3(j,k); - v_buf[m++] = meam_inst_kk->d_arho3b(j,0); - v_buf[m++] = meam_inst_kk->d_arho3b(j,1); - v_buf[m++] = meam_inst_kk->d_arho3b(j,2); - v_buf[m++] = meam_inst_kk->d_t_ave(j,0); - v_buf[m++] = meam_inst_kk->d_t_ave(j,1); - v_buf[m++] = meam_inst_kk->d_t_ave(j,2); - v_buf[m++] = meam_inst_kk->d_tsq_ave(j,0); - v_buf[m++] = meam_inst_kk->d_tsq_ave(j,1); - v_buf[m++] = meam_inst_kk->d_tsq_ave(j,2); + v_buf[m++] = meam_inst_kk->d_rho3[j]; + v_buf[m++] = meam_inst_kk->d_frhop[j]; + v_buf[m++] = meam_inst_kk->d_gamma[j]; + v_buf[m++] = meam_inst_kk->d_dgamma1[j]; + v_buf[m++] = meam_inst_kk->d_dgamma2[j]; + v_buf[m++] = meam_inst_kk->d_dgamma3[j]; + v_buf[m++] = meam_inst_kk->d_arho2b[j]; + v_buf[m++] = meam_inst_kk->d_arho1(j,0); + v_buf[m++] = meam_inst_kk->d_arho1(j,1); + v_buf[m++] = meam_inst_kk->d_arho1(j,2); + v_buf[m++] = meam_inst_kk->d_arho2(j,0); + v_buf[m++] = meam_inst_kk->d_arho2(j,1); + v_buf[m++] = meam_inst_kk->d_arho2(j,2); + v_buf[m++] = meam_inst_kk->d_arho2(j,3); + v_buf[m++] = meam_inst_kk->d_arho2(j,4); + v_buf[m++] = meam_inst_kk->d_arho2(j,5); + for (int k = 0; k < 10; k++) v_buf[m++] = meam_inst_kk->d_arho3(j,k); + v_buf[m++] = meam_inst_kk->d_arho3b(j,0); + v_buf[m++] = meam_inst_kk->d_arho3b(j,1); + v_buf[m++] = meam_inst_kk->d_arho3b(j,2); + v_buf[m++] = meam_inst_kk->d_t_ave(j,0); + v_buf[m++] = meam_inst_kk->d_t_ave(j,1); + v_buf[m++] = meam_inst_kk->d_t_ave(j,2); + v_buf[m++] = meam_inst_kk->d_tsq_ave(j,0); + v_buf[m++] = meam_inst_kk->d_tsq_ave(j,1); + v_buf[m++] = meam_inst_kk->d_tsq_ave(j,2); } /* ---------------------------------------------------------------------- */ + template void PairMEAMKokkos::unpack_forward_comm_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf) { @@ -398,6 +364,8 @@ void PairMEAMKokkos::unpack_forward_comm_kokkos(int n, int first_in, Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); } +/* ---------------------------------------------------------------------- */ + template KOKKOS_INLINE_FUNCTION void PairMEAMKokkos::operator()(TagPairMEAMUnpackForwardComm, const int &i) const{ @@ -432,18 +400,34 @@ void PairMEAMKokkos::operator()(TagPairMEAMUnpackForwardComm, const meam_inst_kk->d_tsq_ave(i+first,0) = v_buf[m++]; meam_inst_kk->d_tsq_ave(i+first,1) = v_buf[m++]; meam_inst_kk->d_tsq_ave(i+first,2) = v_buf[m++]; - } + } + +/* ---------------------------------------------------------------------- */ template int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, int pbc_flag, int *pbc) { - int i,j,k,m; + meam_inst_kk->k_rho0.sync_host(); + meam_inst_kk->k_rho1.sync_host(); + meam_inst_kk->k_rho2.sync_host(); + meam_inst_kk->k_rho3.sync_host(); + meam_inst_kk->k_frhop.sync_host(); + meam_inst_kk->k_gamma.sync_host(); + meam_inst_kk->k_dgamma1.sync_host(); + meam_inst_kk->k_dgamma2.sync_host(); + meam_inst_kk->k_dgamma3.sync_host(); + meam_inst_kk->k_arho2b.sync_host(); + meam_inst_kk->k_arho1.sync_host(); + meam_inst_kk->k_arho2.sync_host(); + meam_inst_kk->k_arho3.sync_host(); + meam_inst_kk->k_arho3b.sync_host(); + meam_inst_kk->k_t_ave.sync_host(); + meam_inst_kk->k_tsq_ave.sync_host(); - m = 0; - - for (i = 0; i < n; i++) { - j = list[i]; + int m = 0; + for (int i = 0; i < n; i++) { + const int j = list[i]; buf[m++] = meam_inst_kk->h_rho0[j]; buf[m++] = meam_inst_kk->h_rho1[j]; buf[m++] = meam_inst_kk->h_rho2[j]; @@ -463,7 +447,7 @@ int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, buf[m++] = meam_inst_kk->h_arho2(j,3); buf[m++] = meam_inst_kk->h_arho2(j,4); buf[m++] = meam_inst_kk->h_arho2(j,5); - for (k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(j,k); + for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(j,k); buf[m++] = meam_inst_kk->h_arho3b(j,0); buf[m++] = meam_inst_kk->h_arho3b(j,1); buf[m++] = meam_inst_kk->h_arho3b(j,2); @@ -478,14 +462,31 @@ int PairMEAMKokkos::pack_forward_comm(int n, int *list, double *buf, return m; } +/* ---------------------------------------------------------------------- */ + template void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *buf) { - int i,k,m,last; + meam_inst_kk->k_rho0.sync_host(); + meam_inst_kk->k_rho1.sync_host(); + meam_inst_kk->k_rho2.sync_host(); + meam_inst_kk->k_rho3.sync_host(); + meam_inst_kk->k_frhop.sync_host(); + meam_inst_kk->k_gamma.sync_host(); + meam_inst_kk->k_dgamma1.sync_host(); + meam_inst_kk->k_dgamma2.sync_host(); + meam_inst_kk->k_dgamma3.sync_host(); + meam_inst_kk->k_arho2b.sync_host(); + meam_inst_kk->k_arho1.sync_host(); + meam_inst_kk->k_arho2.sync_host(); + meam_inst_kk->k_arho3.sync_host(); + meam_inst_kk->k_arho3b.sync_host(); + meam_inst_kk->k_t_ave.sync_host(); + meam_inst_kk->k_tsq_ave.sync_host(); - m = 0; - last = first + n; - for (i = first; i < last; i++) { + int m = 0; + const int last = first + n; + for (int i = first; i < last; i++) { meam_inst_kk->h_rho0[i] = buf[m++]; meam_inst_kk->h_rho1[i] = buf[m++]; meam_inst_kk->h_rho2[i] = buf[m++]; @@ -505,7 +506,7 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->h_arho2(i,3) = buf[m++]; meam_inst_kk->h_arho2(i,4) = buf[m++]; meam_inst_kk->h_arho2(i,5) = buf[m++]; - for (k = 0; k < 10; k++) meam_inst_kk->h_arho3(i,k) = buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3(i,k) = buf[m++]; meam_inst_kk->h_arho3b(i,0) = buf[m++]; meam_inst_kk->h_arho3b(i,1) = buf[m++]; meam_inst_kk->h_arho3b(i,2) = buf[m++]; @@ -516,17 +517,83 @@ void PairMEAMKokkos::unpack_forward_comm(int n, int first, double *b meam_inst_kk->h_tsq_ave(i,1) = buf[m++]; meam_inst_kk->h_tsq_ave(i,2) = buf[m++]; } + + meam_inst_kk->k_rho0.modify_host(); + meam_inst_kk->k_rho1.modify_host(); + meam_inst_kk->k_rho2.modify_host(); + meam_inst_kk->k_rho3.modify_host(); + meam_inst_kk->k_frhop.modify_host(); + meam_inst_kk->k_gamma.modify_host(); + meam_inst_kk->k_dgamma1.modify_host(); + meam_inst_kk->k_dgamma2.modify_host(); + meam_inst_kk->k_dgamma3.modify_host(); + meam_inst_kk->k_arho2b.modify_host(); + meam_inst_kk->k_arho1.modify_host(); + meam_inst_kk->k_arho2.modify_host(); + meam_inst_kk->k_arho3.modify_host(); + meam_inst_kk->k_arho3b.modify_host(); + meam_inst_kk->k_t_ave.modify_host(); + meam_inst_kk->k_tsq_ave.modify_host(); } /* ---------------------------------------------------------------------- */ + +template +int PairMEAMKokkos::pack_reverse_comm_kokkos(int n, int first_in, DAT::tdual_xfloat_1d &buf) +{ + first = first_in; + v_buf = buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); + return n*30; +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairMEAMKokkos::operator()(TagPairMEAMPackReverseComm, const int &i) const { + int m = i*30; + + v_buf[m++] = meam_inst_kk->d_rho0[i+first]; + v_buf[m++] = meam_inst_kk->d_arho2b[i+first]; + v_buf[m++] = meam_inst_kk->d_arho1(i+first,0); + v_buf[m++] = meam_inst_kk->d_arho1(i+first,1); + v_buf[m++] = meam_inst_kk->d_arho1(i+first,2); + v_buf[m++] = meam_inst_kk->d_arho2(i+first,0); + v_buf[m++] = meam_inst_kk->d_arho2(i+first,1); + v_buf[m++] = meam_inst_kk->d_arho2(i+first,2); + v_buf[m++] = meam_inst_kk->d_arho2(i+first,3); + v_buf[m++] = meam_inst_kk->d_arho2(i+first,4); + v_buf[m++] = meam_inst_kk->d_arho2(i+first,5); + for (int k = 0; k < 10; k++) v_buf[m++] = meam_inst_kk->d_arho3(i+first,k); + v_buf[m++] = meam_inst_kk->d_arho3b(i+first,0); + v_buf[m++] = meam_inst_kk->d_arho3b(i+first,1); + v_buf[m++] = meam_inst_kk->d_arho3b(i+first,2); + v_buf[m++] = meam_inst_kk->d_t_ave(i+first,0); + v_buf[m++] = meam_inst_kk->d_t_ave(i+first,1); + v_buf[m++] = meam_inst_kk->d_t_ave(i+first,2); + v_buf[m++] = meam_inst_kk->d_tsq_ave(i+first,0); + v_buf[m++] = meam_inst_kk->d_tsq_ave(i+first,1); + v_buf[m++] = meam_inst_kk->d_tsq_ave(i+first,2); +} + +/* ---------------------------------------------------------------------- */ + template int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) { - int i,k,m,last; + meam_inst_kk->k_rho0.sync_host(); + meam_inst_kk->k_arho2b.sync_host(); + meam_inst_kk->k_arho1.sync_host(); + meam_inst_kk->k_arho2.sync_host(); + meam_inst_kk->k_arho3.sync_host(); + meam_inst_kk->k_arho3b.sync_host(); + meam_inst_kk->k_t_ave.sync_host(); + meam_inst_kk->k_tsq_ave.sync_host(); - m = 0; - last = first + n; - for (i = first; i < last; i++) { + int m = 0; + const int last = first + n; + for (int i = first; i < last; i++) { buf[m++] = meam_inst_kk->h_rho0[i]; buf[m++] = meam_inst_kk->h_arho2b[i]; buf[m++] = meam_inst_kk->h_arho1(i,0); @@ -538,7 +605,7 @@ int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) buf[m++] = meam_inst_kk->h_arho2(i,3); buf[m++] = meam_inst_kk->h_arho2(i,4); buf[m++] = meam_inst_kk->h_arho2(i,5); - for (k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(i,k); + for (int k = 0; k < 10; k++) buf[m++] = meam_inst_kk->h_arho3(i,k); buf[m++] = meam_inst_kk->h_arho3b(i,0); buf[m++] = meam_inst_kk->h_arho3b(i,1); buf[m++] = meam_inst_kk->h_arho3b(i,2); @@ -554,14 +621,64 @@ int PairMEAMKokkos::pack_reverse_comm(int n, int first, double *buf) } /* ---------------------------------------------------------------------- */ + +template +void PairMEAMKokkos::unpack_reverse_comm_kokkos(int n, DAT::tdual_int_2d k_sendlist, int iswap_in, DAT::tdual_xfloat_1d &buf) +{ + d_sendlist = k_sendlist.view(); + iswap = iswap_in; + v_buf = buf.view(); + Kokkos::parallel_for(Kokkos::RangePolicy(0,n),*this); +} + +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairMEAMKokkos::operator()(TagPairMEAMUnpackReverseComm, const int &i) const { + int j = d_sendlist(iswap, i); + int m = i*30; + + meam_inst_kk->d_rho0[j] += v_buf[m++]; + meam_inst_kk->d_arho2b[j] += v_buf[m++]; + meam_inst_kk->d_arho1(j,0) += v_buf[m++]; + meam_inst_kk->d_arho1(j,1) += v_buf[m++]; + meam_inst_kk->d_arho1(j,2) += v_buf[m++]; + meam_inst_kk->d_arho2(j,0) += v_buf[m++]; + meam_inst_kk->d_arho2(j,1) += v_buf[m++]; + meam_inst_kk->d_arho2(j,2) += v_buf[m++]; + meam_inst_kk->d_arho2(j,3) += v_buf[m++]; + meam_inst_kk->d_arho2(j,4) += v_buf[m++]; + meam_inst_kk->d_arho2(j,5) += v_buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->d_arho3(j,k) += v_buf[m++]; + meam_inst_kk->d_arho3b(j,0) += v_buf[m++]; + meam_inst_kk->d_arho3b(j,1) += v_buf[m++]; + meam_inst_kk->d_arho3b(j,2) += v_buf[m++]; + meam_inst_kk->d_t_ave(j,0) += v_buf[m++]; + meam_inst_kk->d_t_ave(j,1) += v_buf[m++]; + meam_inst_kk->d_t_ave(j,2) += v_buf[m++]; + meam_inst_kk->d_tsq_ave(j,0) += v_buf[m++]; + meam_inst_kk->d_tsq_ave(j,1) += v_buf[m++]; + meam_inst_kk->d_tsq_ave(j,2) += v_buf[m++]; +} + +/* ---------------------------------------------------------------------- */ + template void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *buf) { - int i,j,k,m; + meam_inst_kk->k_rho0.sync_host(); + meam_inst_kk->k_arho2b.sync_host(); + meam_inst_kk->k_arho1.sync_host(); + meam_inst_kk->k_arho2.sync_host(); + meam_inst_kk->k_arho3.sync_host(); + meam_inst_kk->k_arho3b.sync_host(); + meam_inst_kk->k_t_ave.sync_host(); + meam_inst_kk->k_tsq_ave.sync_host(); - m = 0; - for (i = 0; i < n; i++) { - j = list[i]; + int m = 0; + for (int i = 0; i < n; i++) { + const int j = list[i]; meam_inst_kk->h_rho0[j] += buf[m++]; meam_inst_kk->h_arho2b[j] += buf[m++]; meam_inst_kk->h_arho1(j,0) += buf[m++]; @@ -573,7 +690,7 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->h_arho2(j,3) += buf[m++]; meam_inst_kk->h_arho2(j,4) += buf[m++]; meam_inst_kk->h_arho2(j,5) += buf[m++]; - for (k = 0; k < 10; k++) meam_inst_kk->h_arho3(j,k) += buf[m++]; + for (int k = 0; k < 10; k++) meam_inst_kk->h_arho3(j,k) += buf[m++]; meam_inst_kk->h_arho3b(j,0) += buf[m++]; meam_inst_kk->h_arho3b(j,1) += buf[m++]; meam_inst_kk->h_arho3b(j,2) += buf[m++]; @@ -584,46 +701,48 @@ void PairMEAMKokkos::unpack_reverse_comm(int n, int *list, double *b meam_inst_kk->h_tsq_ave(j,1) += buf[m++]; meam_inst_kk->h_tsq_ave(j,2) += buf[m++]; } + + meam_inst_kk->k_rho0.modify_host(); + meam_inst_kk->k_arho2b.modify_host(); + meam_inst_kk->k_arho1.modify_host(); + meam_inst_kk->k_arho2.modify_host(); + meam_inst_kk->k_arho3.modify_host(); + meam_inst_kk->k_arho3b.modify_host(); + meam_inst_kk->k_t_ave.modify_host(); + meam_inst_kk->k_tsq_ave.modify_host(); } /* ---------------------------------------------------------------------- - memory usage of local atom-based arrays + strip special bond flags from neighbor list entries + are not used with MEAM + need to do here so Fortran lib doesn't see them + done once per reneighbor so that neigh_f2c and neigh_c2f don't see them ------------------------------------------------------------------------- */ -template -double PairMEAMKokkos::memory_usage() -{ - double bytes = 11 * meam_inst_kk->nmax * sizeof(double); - bytes += (3 + 6 + 10 + 3 + 3 + 3) * meam_inst_kk->nmax * sizeof(double); - bytes += 3 * meam_inst_kk->maxneigh * sizeof(double); - return bytes; -} -template -KOKKOS_INLINE_FUNCTION -void PairMEAMKokkos::operator()(TagPairMEAMKernelNeighStrip, const int &ii) const { -/* ---------------------------------------------------------------------- - * strip special bond flags from neighbor list entries - * are not used with MEAM - * need to do here so Fortran lib doesn't see them - * done once per reneighbor so that neigh_f2c and neigh_c2f don't see them - * ------------------------------------------------------------------------- */ - const int i = d_ilist_half[ii]; - const int jnum_half = d_numneigh_half[i]; - const int jnum_full = d_numneigh_full[i]; - for (int jj = 0; jj < jnum_half; jj++) { - d_neighbors_half(i,jj) &= NEIGHMASK; - } - for (int jj = 0; jj < jnum_full; jj++) { - d_neighbors_full(i,jj) &= NEIGHMASK; - } -} template KOKKOS_INLINE_FUNCTION -void PairMEAMKokkos::operator()(TagPairMEAMKernelA, const int ii, int &n) const { - const int i = d_ilist_half[ii]; - n += d_numneigh_half[i]; +void PairMEAMKokkos::operator()(TagPairMEAMNeighStrip, const int &ii) const { + + const int i = d_ilist_half[ii]; + const int jnum_half = d_numneigh_half[i]; + const int jnum_full = d_numneigh_full[i]; + for (int jj = 0; jj < jnum_half; jj++) + d_neighbors_half(i,jj) &= NEIGHMASK; + for (int jj = 0; jj < jnum_full; jj++) + d_neighbors_full(i,jj) &= NEIGHMASK; } +/* ---------------------------------------------------------------------- */ + +template +KOKKOS_INLINE_FUNCTION +void PairMEAMKokkos::operator()(TagPairMEAMOffsets, const int ii, int &n) const { + const int i = d_ilist_half[ii]; + n += d_numneigh_half[i]; +} + +/* ---------------------------------------------------------------------- */ + namespace LAMMPS_NS { template class PairMEAMKokkos; #ifdef KOKKOS_ENABLE_CUDA diff --git a/src/KOKKOS/pair_meam_kokkos.h b/src/KOKKOS/pair_meam_kokkos.h index 114f4a1665..c18c9151f2 100644 --- a/src/KOKKOS/pair_meam_kokkos.h +++ b/src/KOKKOS/pair_meam_kokkos.h @@ -1,6 +1,6 @@ /* -*- c++ -*- ---------------------------------------------------------- LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator - http://lammps.sandia.gov, Sandia National Laboratories + https://www.lammps.org/, Sandia National Laboratories Steve Plimpton, sjplimp@sandia.gov Copyright (2003) Sandia Corporation. Under the terms of Contract @@ -12,31 +12,33 @@ ------------------------------------------------------------------------- */ #ifdef PAIR_CLASS - +// clang-format off PairStyle(meam/c/kk,PairMEAMKokkos) PairStyle(meam/c/kk/device,PairMEAMKokkos) PairStyle(meam/c/kk/host,PairMEAMKokkos) PairStyle(meam/kk,PairMEAMKokkos) PairStyle(meam/kk/device,PairMEAMKokkos) PairStyle(meam/kk/host,PairMEAMKokkos) - +// clang-format on #else -#ifndef LMP_PAIR_MEAMC_KOKKOS_H -#define LMP_PAIR_MEAMC_KOKKOS_H +// clang-format off +#ifndef LMP_PAIR_MEAM_KOKKOS_H +#define LMP_PAIR_MEAM_KOKKOS_H #include "kokkos_base.h" #include "pair_kokkos.h" #include "pair_meam.h" -#include "neigh_list_kokkos.h" #include "meam_kokkos.h" - namespace LAMMPS_NS { -struct TagPairMEAMKernelNeighStrip{}; -struct TagPairMEAMKernelA{}; + +struct TagPairMEAMNeighStrip{}; +struct TagPairMEAMOffsets{}; struct TagPairMEAMPackForwardComm{}; struct TagPairMEAMUnpackForwardComm{}; +struct TagPairMEAMPackReverseComm{}; +struct TagPairMEAMUnpackReverseComm{}; template class MEAMKokkos; @@ -48,13 +50,13 @@ class PairMEAMKokkos : public PairMEAM, public KokkosBase { enum {COUL_FLAG=0}; typedef DeviceType device_type; typedef ArrayTypes AT; - //typedef EV_FLOAT value_type; + typedef int value_type; PairMEAMKokkos(class LAMMPS *); - virtual ~PairMEAMKokkos(); - void compute(int, int); - void coeff(int, char **); - void init_style(); + ~PairMEAMKokkos() override; + void compute(int, int) override; + void coeff(int, char **) override; + void init_style() override; KOKKOS_INLINE_FUNCTION void operator()(TagPairMEAMPackForwardComm, const int&) const; @@ -63,104 +65,59 @@ class PairMEAMKokkos : public PairMEAM, public KokkosBase { void operator()(TagPairMEAMUnpackForwardComm, const int&) const; KOKKOS_INLINE_FUNCTION - void operator()(TagPairMEAMKernelNeighStrip, const int&) const; + void operator()(TagPairMEAMPackReverseComm, const int&) const; KOKKOS_INLINE_FUNCTION - void operator()(TagPairMEAMKernelA, const int, int&) const; + void operator()(TagPairMEAMUnpackReverseComm, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPairMEAMNeighStrip, const int&) const; + + KOKKOS_INLINE_FUNCTION + void operator()(TagPairMEAMOffsets, const int, int&) const; int pack_forward_comm_kokkos(int, DAT::tdual_int_2d, int, DAT::tdual_xfloat_1d&, - int, int *); - void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&); - int pack_forward_comm(int, int *, double *, int, int *); - void unpack_forward_comm(int, int, double *); - int pack_reverse_comm(int, int, double *); - void unpack_reverse_comm(int, int *, double *); - double memory_usage(); + int, int *) override; + int pack_forward_comm(int, int *, double *, int, int *) override; + void unpack_forward_comm_kokkos(int, int, DAT::tdual_xfloat_1d&) override; + void unpack_forward_comm(int, int, double *) override; + int pack_reverse_comm_kokkos(int, int, DAT::tdual_xfloat_1d&) override; + int pack_reverse_comm(int, int, double *) override; + void unpack_reverse_comm_kokkos(int, DAT::tdual_int_2d, + int, DAT::tdual_xfloat_1d&) override; + void unpack_reverse_comm(int, int *, double *) override; protected: class MEAMKokkos *meam_inst_kk; - typename AT::t_x_array_randomread x; + typename AT::t_x_array x; typename AT::t_f_array f; - typename AT::t_int_1d_randomread type; + typename AT::t_int_1d type; DAT::tdual_efloat_1d k_eatom; DAT::tdual_virial_array k_vatom; - typename ArrayTypes::t_efloat_1d d_eatom; - typename ArrayTypes::t_virial_array d_vatom; + typename AT::t_efloat_1d d_eatom; + typename AT::t_virial_array d_vatom; - DAT::tdual_int_1d k_offset; - HAT::t_int_1d h_offset; - typename AT::t_int_1d_randomread d_offset; + typename AT::t_int_1d d_offset; DAT::tdual_int_1d k_map; - typename AT::t_int_1d_randomread d_map; - typename AT::t_int_1d_randomread d_ilist_half; - typename AT::t_int_1d_randomread d_numneigh_half; + typename AT::t_int_1d d_map; + typename AT::t_int_2d d_scale; + typename AT::t_int_1d d_ilist_half; + typename AT::t_int_1d d_numneigh_half; typename AT::t_neighbors_2d d_neighbors_half; - typename AT::t_int_1d_randomread d_numneigh_full; + typename AT::t_int_1d d_numneigh_full; typename AT::t_neighbors_2d d_neighbors_full; typename AT::t_int_2d d_sendlist; typename AT::t_xfloat_1d_um v_buf; - + + int iswap,first; int neighflag,nlocal,nall,eflag,vflag; - int iswap; - int first; friend void pair_virial_fdotr_compute(PairMEAMKokkos*); - }; } - #endif #endif -/* ERROR/WARNING messages: - -E: MEAM library error %d - -A call to the MEAM Fortran library returned an error. - -E: Illegal ... command - -Self-explanatory. Check the input script syntax and compare to the -documentation for the command. You can use -echo screen as a -command-line option when running LAMMPS to see the offending line. - -E: Incorrect args for pair coefficients - -Self-explanatory. Check the input script or data file. - -E: Pair style MEAM requires newton pair on - -See the newton command. This is a restriction to use the MEAM -potential. - -E: Cannot open MEAM potential file %s - -The specified MEAM potential file cannot be opened. Check that the -path and name are correct. - -E: Incorrect format in MEAM potential file - -Incorrect number of words per line in the potential file. - -E: Unrecognized lattice type in MEAM file 1 - -The lattice type in an entry of the MEAM library file is not -valid. - -E: Did not find all elements in MEAM library file - -The requested elements were not found in the MEAM file. - -E: Keyword %s in MEAM parameter file not recognized - -Self-explanatory. - -E: Unrecognized lattice type in MEAM file 2 - -The lattice type in an entry of the MEAM parameter file is not -valid. - -*/ diff --git a/src/MEAM/meam.h b/src/MEAM/meam.h index 8b94ac0084..8a2f43e354 100644 --- a/src/MEAM/meam.h +++ b/src/MEAM/meam.h @@ -27,7 +27,9 @@ typedef enum { FCC, BCC, HCP, DIM, DIA, DIA3, B1, C11, L12, B2, CH4, LIN, ZIG, T class MEAM { public: MEAM(Memory *mem); - ~MEAM(); + virtual ~MEAM(); + + int copymode; protected: Memory *memory; @@ -285,8 +287,8 @@ class MEAM { double *rozero, int *ibar); void meam_setup_param(int which, double value, int nindex, int *index /*index(3)*/, int *errorflag); - void meam_setup_done(double *cutmax); - void meam_dens_setup(int atom_nmax, int nall, int n_neigh); + virtual void meam_setup_done(double *cutmax); + virtual void meam_dens_setup(int atom_nmax, int nall, int n_neigh); void meam_dens_init(int i, int ntype, int *type, int *fmap, double **x, int numneigh, int *firstneigh, int numneigh_full, int *firstneigh_full, int fnoffset); void meam_dens_final(int nlocal, int eflag_either, int eflag_global, int eflag_atom, diff --git a/src/MEAM/meam_impl.cpp b/src/MEAM/meam_impl.cpp index 499a2b1520..d0d81cee88 100644 --- a/src/MEAM/meam_impl.cpp +++ b/src/MEAM/meam_impl.cpp @@ -36,6 +36,7 @@ MEAM::MEAM(Memory* mem) maxneigh = 0; scrfcn = dscrfcn = fcpair = nullptr; + copymode = 0; neltypes = 0; for (int i = 0; i < maxelt; i++) { @@ -53,6 +54,8 @@ MEAM::MEAM(Memory* mem) MEAM::~MEAM() { + if (copymode) return; + memory->destroy(this->phirar6); memory->destroy(this->phirar5); memory->destroy(this->phirar4); diff --git a/src/MEAM/pair_meam.cpp b/src/MEAM/pair_meam.cpp index ab027dcb89..6cbe5c69b9 100644 --- a/src/MEAM/pair_meam.cpp +++ b/src/MEAM/pair_meam.cpp @@ -75,7 +75,8 @@ PairMEAM::~PairMEAM() { if (copymode) return; - delete meam_inst; + if (meam_inst) + delete meam_inst; if (allocated) { memory->destroy(setflag); diff --git a/src/MEAM/pair_meam.h b/src/MEAM/pair_meam.h index ffe8045ac9..304e7eb41f 100644 --- a/src/MEAM/pair_meam.h +++ b/src/MEAM/pair_meam.h @@ -52,7 +52,7 @@ class PairMEAM : public Pair { double **scale; // scaling factor for adapt - virtual void allocate(); + void allocate(); void read_files(const std::string &, const std::string &); void read_global_meam_file(const std::string &); void read_user_meam_file(const std::string &); diff --git a/src/pair.cpp b/src/pair.cpp index 44f23745d9..5ce4dc2d32 100644 --- a/src/pair.cpp +++ b/src/pair.cpp @@ -126,6 +126,7 @@ Pair::Pair(LAMMPS *lmp) : Pointers(lmp) datamask_modify = ALL_MASK; kokkosable = 0; + reverse_comm_device = 0; copymode = 0; } diff --git a/src/pair.h b/src/pair.h index 4763a225d2..5bad9e14a3 100644 --- a/src/pair.h +++ b/src/pair.h @@ -123,6 +123,7 @@ class Pair : protected Pointers { ExecutionSpace execution_space; unsigned int datamask_read, datamask_modify; int kokkosable; // 1 if Kokkos pair + int reverse_comm_device; // 1 if reverse comm on Device Pair(class LAMMPS *); ~Pair() override;