diff --git a/doc/src/Eqs/fix_mvv_dpd.jpg b/doc/src/Eqs/fix_mvv_dpd.jpg
new file mode 100644
index 0000000000..f62ae28bc4
Binary files /dev/null and b/doc/src/Eqs/fix_mvv_dpd.jpg differ
diff --git a/doc/src/Eqs/fix_mvv_dpd.tex b/doc/src/Eqs/fix_mvv_dpd.tex
new file mode 100644
index 0000000000..4652d54b77
--- /dev/null
+++ b/doc/src/Eqs/fix_mvv_dpd.tex
@@ -0,0 +1,21 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ v(t+\frac{\Delta t}{2}) = v(t) + \frac{\Delta t}{2}\cdot a(t),
+$$
+
+$$
+ r(t+\Delta t) = r(t) + \Delta t\cdot v(t+\frac{\Delta t}{2}),
+$$
+
+$$
+ a(t+\Delta t) = \frac{1}{m}\cdot F\left[ r(t+\Delta t), v(t) +\lambda \cdot \Delta t\cdot a(t)\right],
+$$
+
+$$
+ v(t+\Delta t) = v(t+\frac{\Delta t}{2}) + \frac{\Delta t}{2}\cdot a(t+\Delta t),
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_edpd_force.jpg b/doc/src/Eqs/pair_edpd_force.jpg
new file mode 100644
index 0000000000..fabb6f772e
Binary files /dev/null and b/doc/src/Eqs/pair_edpd_force.jpg differ
diff --git a/doc/src/Eqs/pair_edpd_force.tex b/doc/src/Eqs/pair_edpd_force.tex
new file mode 100644
index 0000000000..f6a0ca0d3c
--- /dev/null
+++ b/doc/src/Eqs/pair_edpd_force.tex
@@ -0,0 +1,33 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ \mathbf{F}_{ij}^{C} = \alpha_{ij}{\omega_{C}}(r_{ij})\mathbf{e}_{ij},
+$$
+
+$$
+ \mathbf{F}_{ij}^{D} = -\gamma {\omega_{D}}(r_{ij})(\mathbf{e}_{ij} \cdot \mathbf{v}_{ij})\mathbf{e}_{ij},
+$$
+
+$$
+ \mathbf{F}_{ij}^{R} = \sigma {\omega_{R}}(r_{ij}){\xi_{ij}}\Delta t^{-1/2} \mathbf{e}_{ij},
+$$
+
+$$
+ \omega_{C}(r) = 1 - r/r_c,
+$$
+
+$$
+ \alpha_{ij} = A\cdot k_B(T_i + T_j)/2,
+$$
+
+$$
+ \omega_{D}(r) = \omega^2_{R}(r) = (1-r/r_c)^s,
+$$
+
+$$
+ \sigma_{ij}^2 = 4\gamma k_B T_i T_j/(T_i + T_j),
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_edpd_gov.jpg b/doc/src/Eqs/pair_edpd_gov.jpg
new file mode 100644
index 0000000000..10b303a218
Binary files /dev/null and b/doc/src/Eqs/pair_edpd_gov.jpg differ
diff --git a/doc/src/Eqs/pair_edpd_gov.tex b/doc/src/Eqs/pair_edpd_gov.tex
new file mode 100644
index 0000000000..782cdec99e
--- /dev/null
+++ b/doc/src/Eqs/pair_edpd_gov.tex
@@ -0,0 +1,15 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ \frac{\mathrm{d}^2 \mathbf{r}_i}{\mathrm{d} t^2}=
+ \frac{\mathrm{d} \mathbf{v}_i}{\mathrm{d} t}
+ =\mathbf{F}_{i}=\sum_{i\neq j}(\mathbf{F}_{ij}^{C}+\mathbf{F}_{ij}^{D}+\mathbf{F}_{ij}^{R}),
+$$
+
+$$
+ C_v\frac{\mathrm{d} T_i}{\mathrm{d} t}= q_{i} = \sum_{i\neq j}(q_{ij}^{C}+q_{ij}^{V}+q_{ij}^{R}),
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_edpd_heat.jpg b/doc/src/Eqs/pair_edpd_heat.jpg
new file mode 100644
index 0000000000..b9256a1d13
Binary files /dev/null and b/doc/src/Eqs/pair_edpd_heat.jpg differ
diff --git a/doc/src/Eqs/pair_edpd_heat.tex b/doc/src/Eqs/pair_edpd_heat.tex
new file mode 100644
index 0000000000..241a1bad64
--- /dev/null
+++ b/doc/src/Eqs/pair_edpd_heat.tex
@@ -0,0 +1,29 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ q_i^C = \sum_{j \ne i} k_{ij} \omega_{CT}(r_{ij}) \left( \frac{1}{T_i} - \frac{1}{T_j} \right),
+$$
+
+$$
+ q_i^V = \frac{1}{2 C_v}\sum_{j \ne i}{ \left\{ \omega_D(r_{ij})\left[\gamma_{ij} \left( \mathbf{e}_{ij} \cdot \mathbf{v}_{ij} \right)^2 - \frac{\left( \sigma _{ij} \right)^2}{m}\right] - \sigma _{ij} \omega_R(r_{ij})\left( \mathbf{e}_{ij} \cdot \mathbf{v}_{ij} \right){\xi_{ij}} \right\} },
+$$
+
+$$
+ q_i^R = \sum_{j \ne i} \beta _{ij} \omega_{RT}(r_{ij}) \Delta t^{-1/2} \xi_{ij}^e,
+$$
+
+$$
+ \omega_{CT}(r)=\omega_{RT}^2(r)=\left(1-r/r_{ct}\right)^{s_T},
+$$
+
+$$
+ k_{ij}=C_v^2\kappa(T_i + T_j)^2/4k_B,
+$$
+
+$$
+ \beta_{ij}^2=2k_Bk_{ij},
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_edpd_kappa.jpg b/doc/src/Eqs/pair_edpd_kappa.jpg
new file mode 100644
index 0000000000..158974d374
Binary files /dev/null and b/doc/src/Eqs/pair_edpd_kappa.jpg differ
diff --git a/doc/src/Eqs/pair_edpd_kappa.tex b/doc/src/Eqs/pair_edpd_kappa.tex
new file mode 100644
index 0000000000..ac5ca9f740
--- /dev/null
+++ b/doc/src/Eqs/pair_edpd_kappa.tex
@@ -0,0 +1,9 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ \kappa = \frac{315k_B\upsilon }{2\pi \rho C_v r_{ct}^5}\frac{1}{Pr},
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_mdpd_force.jpg b/doc/src/Eqs/pair_mdpd_force.jpg
new file mode 100644
index 0000000000..9b0e573b79
Binary files /dev/null and b/doc/src/Eqs/pair_mdpd_force.jpg differ
diff --git a/doc/src/Eqs/pair_mdpd_force.tex b/doc/src/Eqs/pair_mdpd_force.tex
new file mode 100644
index 0000000000..b5c8d9be4a
--- /dev/null
+++ b/doc/src/Eqs/pair_mdpd_force.tex
@@ -0,0 +1,17 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ \mathbf{F}_{ij}^C = Aw_c(r_{ij})\mathbf{e}_{ij} + B(\rho_i+\rho_j)w_d(r_{ij})\mathbf{e}_{ij},
+$$
+
+$$
+ \mathbf{F}_{ij}^{D} = -\gamma {\omega_{D}}(r_{ij})(\mathbf{e}_{ij} \cdot \mathbf{v}_{ij})\mathbf{e}_{ij},
+$$
+
+$$
+ \mathbf{F}_{ij}^{R} = \sigma {\omega_{R}}(r_{ij}){\xi_{ij}}\Delta t^{-1/2} \mathbf{e}_{ij},
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_tdpd_flux.jpg b/doc/src/Eqs/pair_tdpd_flux.jpg
new file mode 100644
index 0000000000..9da788ae9b
Binary files /dev/null and b/doc/src/Eqs/pair_tdpd_flux.jpg differ
diff --git a/doc/src/Eqs/pair_tdpd_flux.tex b/doc/src/Eqs/pair_tdpd_flux.tex
new file mode 100644
index 0000000000..f753f16acf
--- /dev/null
+++ b/doc/src/Eqs/pair_tdpd_flux.tex
@@ -0,0 +1,21 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ Q_{ij}^D = -\kappa_{ij} w_{DC}(r_{ij}) \left( C_i - C_j \right),
+$$
+
+$$
+ Q_{ij}^R = \epsilon_{ij}\left( C_i + C_j \right) w_{RC}(r_{ij}) \xi_{ij},
+$$
+
+$$
+ w_{DC}(r_{ij})=w^2_{RC}(r_{ij}) = (1 - r/r_{cc})^{\rm power\_{cc}},
+$$
+
+$$
+ \epsilon_{ij}^2 = m_s^2\kappa_{ij}\rho,
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_tdpd_force.jpg b/doc/src/Eqs/pair_tdpd_force.jpg
new file mode 100644
index 0000000000..f6feb35397
Binary files /dev/null and b/doc/src/Eqs/pair_tdpd_force.jpg differ
diff --git a/doc/src/Eqs/pair_tdpd_force.tex b/doc/src/Eqs/pair_tdpd_force.tex
new file mode 100644
index 0000000000..49edff9700
--- /dev/null
+++ b/doc/src/Eqs/pair_tdpd_force.tex
@@ -0,0 +1,29 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ \mathbf{F}_{ij}^{C} = A{\omega_{C}}(r_{ij})\mathbf{e}_{ij},
+$$
+
+$$
+ \mathbf{F}_{ij}^{D} = -\gamma {\omega_{D}}(r_{ij})(\mathbf{e}_{ij} \cdot \mathbf{v}_{ij})\mathbf{e}_{ij},
+$$
+
+$$
+ \mathbf{F}_{ij}^{R} = \sigma {\omega_{R}}(r_{ij}){\xi_{ij}}\Delta t^{-1/2} \mathbf{e}_{ij},
+$$
+
+$$
+ \omega_{C}(r) = 1 - r/r_c,
+$$
+
+$$
+ \omega_{D}(r) = \omega^2_{R}(r) = (1-r/r_c)^{\rm power\_f},
+$$
+
+$$
+ \sigma^2 = 2\gamma k_B T,
+$$
+
+\end{document}
diff --git a/doc/src/Eqs/pair_tdpd_gov.jpg b/doc/src/Eqs/pair_tdpd_gov.jpg
new file mode 100644
index 0000000000..0ed793a132
Binary files /dev/null and b/doc/src/Eqs/pair_tdpd_gov.jpg differ
diff --git a/doc/src/Eqs/pair_tdpd_gov.tex b/doc/src/Eqs/pair_tdpd_gov.tex
new file mode 100644
index 0000000000..e4b5f5790a
--- /dev/null
+++ b/doc/src/Eqs/pair_tdpd_gov.tex
@@ -0,0 +1,13 @@
+\documentclass[12pt]{article}
+
+\begin{document}
+
+$$
+ \frac{\mathrm{d}^2 \mathbf{r}_i}{\mathrm{d} t^2} = \frac{\mathrm{d} \mathbf{v}_i}{\mathrm{d} t}=\mathbf{F}_{i}=\sum_{i\neq j}(\mathbf{F}_{ij}^{C}+\mathbf{F}_{ij}^{D}+\mathbf{F}_{ij}^{R}),
+$$
+
+$$
+ \frac{\mathrm{d} C_{i}}{\mathrm{d} t}= Q_{i} = \sum_{i\neq j}(Q_{ij}^{D}+Q_{ij}^{R}) + Q_{i}^{S},
+$$
+
+\end{document}
diff --git a/doc/src/JPG/examples_edpd.jpg b/doc/src/JPG/examples_edpd.jpg
new file mode 100644
index 0000000000..4d0cde6370
Binary files /dev/null and b/doc/src/JPG/examples_edpd.jpg differ
diff --git a/doc/src/JPG/examples_mdpd.gif b/doc/src/JPG/examples_mdpd.gif
new file mode 100644
index 0000000000..29ace1a0b2
Binary files /dev/null and b/doc/src/JPG/examples_mdpd.gif differ
diff --git a/doc/src/JPG/examples_mdpd_first.jpg b/doc/src/JPG/examples_mdpd_first.jpg
new file mode 100644
index 0000000000..958912e662
Binary files /dev/null and b/doc/src/JPG/examples_mdpd_first.jpg differ
diff --git a/doc/src/JPG/examples_mdpd_last.jpg b/doc/src/JPG/examples_mdpd_last.jpg
new file mode 100644
index 0000000000..1bf8b9ac82
Binary files /dev/null and b/doc/src/JPG/examples_mdpd_last.jpg differ
diff --git a/doc/src/JPG/examples_tdpd.jpg b/doc/src/JPG/examples_tdpd.jpg
new file mode 100644
index 0000000000..c00e83e003
Binary files /dev/null and b/doc/src/JPG/examples_tdpd.jpg differ
diff --git a/doc/src/Section_commands.txt b/doc/src/Section_commands.txt
index 571c6c4920..48f11b3c63 100644
--- a/doc/src/Section_commands.txt
+++ b/doc/src/Section_commands.txt
@@ -685,6 +685,7 @@ package"_Section_start.html#start_3.
"drude"_fix_drude.html,
"drude/transform/direct"_fix_drude_transform.html,
"drude/transform/reverse"_fix_drude_transform.html,
+"edpd/source"_fix_dpd_source.html,
"eos/cv"_fix_eos_cv.html,
"eos/table"_fix_eos_table.html,
"eos/table/rx"_fix_eos_table_rx.html,
@@ -704,6 +705,9 @@ package"_Section_start.html#start_3.
"meso"_fix_meso.html,
"manifoldforce"_fix_manifoldforce.html,
"meso/stationary"_fix_meso_stationary.html,
+"mvv/dpd"_fix_mvv_dpd.html,
+"mvv/edpd"_fix_mvv_dpd.html,
+"mvv/tdpd"_fix_mvv_dpd.html,
"nve/dot"_fix_nve_dot.html,
"nve/dotc/langevin"_fix_nve_dotc_langevin.html,
"nve/manifold/rattle"_fix_nve_manifold_rattle.html,
@@ -732,6 +736,7 @@ package"_Section_start.html#start_3.
"smd/move/triangulated/surface"_fix_smd_move_triangulated_surface.html,
"smd/setvel"_fix_smd_setvel.html,
"smd/wall/surface"_fix_smd_wall_surface.html,
+"tdpd/source"_fix_dpd_source.html,
"temp/rescale/eff"_fix_temp_rescale_eff.html,
"ti/spring"_fix_ti_spring.html,
"ttm/mod"_fix_ttm.html,
@@ -775,6 +780,7 @@ KOKKOS, o = USER-OMP, t = OPT.
"erotate/sphere"_compute_erotate_sphere.html,
"erotate/sphere/atom"_compute_erotate_sphere_atom.html,
"event/displace"_compute_event_displace.html,
+"fragment/atom"_compute_cluster_atom.html,
"global/atom"_compute_global_atom.html,
"group/group"_compute_group_group.html,
"gyration"_compute_gyration.html,
@@ -836,6 +842,7 @@ package"_Section_start.html#start_3.
"cnp/atom"_compute_cnp_atom.html,
"dpd"_compute_dpd.html,
"dpd/atom"_compute_dpd_atom.html,
+"edpd/temp/atom"_compute_edpd_temp_atom.html,
"fep"_compute_fep.html,
"force/tally"_compute_tally.html,
"heat/flux/tally"_compute_tally.html,
@@ -868,6 +875,7 @@ package"_Section_start.html#start_3.
"smd/ulsph/stress"_compute_smd_ulsph_stress.html,
"smd/vol"_compute_smd_vol.html,
"stress/tally"_compute_tally.html,
+"tdpd/cc/atom"_compute_tdpd_cc_atom.html,
"temp/drude"_compute_temp_drude.html,
"temp/eff"_compute_temp_eff.html,
"temp/deform/eff"_compute_temp_deform_eff.html,
@@ -1024,6 +1032,7 @@ package"_Section_start.html#start_3.
"eam/cd (o)"_pair_eam.html,
"edip (o)"_pair_edip.html,
"edip/multi"_pair_edip.html,
+"edpd"_pair_meso.html,
"eff/cut"_pair_eff.html,
"exp6/rx"_pair_exp6_rx.html,
"gauss/cut"_pair_gauss.html,
@@ -1041,6 +1050,8 @@ package"_Section_start.html#start_3.
"lj/sdk (gko)"_pair_sdk.html,
"lj/sdk/coul/long (go)"_pair_sdk.html,
"lj/sdk/coul/msm (o)"_pair_sdk.html,
+"mdpd"_pair_meso.html,
+"mdpd/rhosum"_pair_meso.html,
"meam/c"_pair_meam.html,
"meam/spline (o)"_pair_meam_spline.html,
"meam/sw/spline"_pair_meam_sw_spline.html,
@@ -1074,6 +1085,7 @@ package"_Section_start.html#start_3.
"sph/taitwater/morris"_pair_sph_taitwater_morris.html,
"srp"_pair_srp.html,
"table/rx"_pair_table_rx.html,
+"tdpd"_pair_meso.html,
"tersoff/table (o)"_pair_tersoff.html,
"thole"_pair_thole.html,
"tip4p/long/soft (o)"_pair_lj_soft.html :tb(c=4,ea=c)
diff --git a/doc/src/Section_packages.txt b/doc/src/Section_packages.txt
index 16864bcdc4..6122dfac78 100644
--- a/doc/src/Section_packages.txt
+++ b/doc/src/Section_packages.txt
@@ -112,7 +112,7 @@ Package, Description, Doc page, Example, Library
"REPLICA"_#REPLICA, multi-replica methods, "Section 6.6.5"_Section_howto.html#howto_5, tad, -
"RIGID"_#RIGID, rigid bodies and constraints, "fix rigid"_fix_rigid.html, rigid, -
"SHOCK"_#SHOCK, shock loading methods, "fix msst"_fix_msst.html, -, -
-"SNAP"_#SNAP, quantum-fitted potential, "pair snap"_pair_snap.html, snap, -
+"SNAP"_#SNAP, quantum-fitted potential, "pair_style snap"_pair_snap.html, snap, -
"SRD"_#SRD, stochastic rotation dynamics, "fix srd"_fix_srd.html, srd, -
"VORONOI"_#VORONOI, Voronoi tesselation, "compute voronoi/atom"_compute_voronoi_atom.html, -, ext :tb(ea=c,ca1=l)
@@ -134,6 +134,7 @@ Package, Description, Doc page, Example, Library
"USER-LB"_#USER-LB, Lattice Boltzmann fluid,"fix lb/fluid"_fix_lb_fluid.html, USER/lb, -
"USER-MANIFOLD"_#USER-MANIFOLD, motion on 2d surfaces,"fix manifoldforce"_fix_manifoldforce.html, USER/manifold, -
"USER-MEAMC"_#USER-MEAMC, modified EAM potential (C++), "pair_style meam/c"_pair_meam.html, meam, -
+"USER-MESO"_#USER-MESO, mesoscale DPD models, "pair_style edpd"_pair_meso.html, USER/meso, -
"USER-MGPT"_#USER-MGPT, fast MGPT multi-ion potentials, "pair_style mgpt"_pair_mgpt.html, USER/mgpt, -
"USER-MISC"_#USER-MISC, single-file contributions, USER-MISC/README, USER/misc, -
"USER-MOLFILE"_#USER-MOLFILE, "VMD"_vmd_home molfile plug-ins,"dump molfile"_dump_molfile.html, -, ext
@@ -1342,7 +1343,7 @@ make machine :pre
[Supporting info:]
src/SNAP: filenames -> commands
-"pair snap"_pair_snap.html
+"pair_style snap"_pair_snap.html
"compute sna/atom"_compute_sna_atom.html
"compute snad/atom"_compute_sna_atom.html
"compute snav/atom"_compute_sna_atom.html
@@ -1556,7 +1557,7 @@ make machine :pre
src/USER-AWPMD: filenames -> commands
src/USER-AWPMD/README
-"pair awpmd/cut"_pair_awpmd.html
+"pair_style awpmd/cut"_pair_awpmd.html
examples/USER/awpmd :ul
:line
@@ -1745,12 +1746,12 @@ src/USER-DPD: filenames -> commands
"fix eos/table/rx"_fix_eos_table_rx.html
"fix shardlow"_fix_shardlow.html
"fix rx"_fix_rx.html
-"pair table/rx"_pair_table_rx.html
-"pair dpd/fdt"_pair_dpd_fdt.html
-"pair dpd/fdt/energy"_pair_dpd_fdt.html
-"pair exp6/rx"_pair_exp6_rx.html
-"pair multi/lucy"_pair_multi_lucy.html
-"pair multi/lucy/rx"_pair_multi_lucy_rx.html
+"pair_style table/rx"_pair_table_rx.html
+"pair_style dpd/fdt"_pair_dpd_fdt.html
+"pair_style dpd/fdt/energy"_pair_dpd_fdt.html
+"pair_style exp6/rx"_pair_exp6_rx.html
+"pair_style multi/lucy"_pair_multi_lucy.html
+"pair_style multi/lucy/rx"_pair_multi_lucy_rx.html
examples/USER/dpd :ul
:line
@@ -1785,8 +1786,8 @@ src/USER-DRUDE/README
"fix drude"_fix_drude.html
"fix drude/transform/*"_fix_drude_transform.html
"compute temp/drude"_compute_temp_drude.html
-"pair thole"_pair_thole.html
-"pair lj/cut/thole/long"_pair_thole.html
+"pair_style thole"_pair_thole.html
+"pair_style lj/cut/thole/long"_pair_thole.html
examples/USER/drude
tools/drude :ul
@@ -1824,8 +1825,8 @@ src/USER-EFF/README
"fix npt/eff"_fix_nh_eff.html
"fix langevin/eff"_fix_langevin_eff.html
"compute temp/eff"_compute_temp_eff.html
-"pair eff/cut"_pair_eff.html
-"pair eff/inline"_pair_eff.html
+"pair_style eff/cut"_pair_eff.html
+"pair_style eff/inline"_pair_eff.html
examples/USER/eff
tools/eff/README
tools/eff
@@ -2155,11 +2156,47 @@ make machine :pre
src/USER-MEAMC: filenames -> commands
src/USER-MEAMC/README
-"pair meam/c"_pair_meam.html
+"pair_style meam/c"_pair_meam.html
examples/meam :ul
:line
+USER-MESO package :link(USER-MESO),h4
+
+[Contents:]
+
+Several extensions of the dissipative particle dynamics (DPD)
+method. Specifically, energy-conserving DPD (eDPD) that can model
+non-isothermal processes, many-body DPD (mDPD) for simulating
+vapor-liquid coexistence, and transport DPD (tDPD) for modeling
+advection-diffusion-reaction systems. The equations of motion of these
+DPD extensions are integrated through a modified velocity-Verlet (MVV)
+algorithm.
+
+[Author:] Zhen Li (Division of Applied Mathematics, Brown University)
+
+[Install or un-install:]
+
+make yes-user-meso
+make machine :pre
+
+make no-user-meso
+make machine :pre
+
+[Supporting info:]
+
+src/USER-MESO: filenames -> commands
+src/USER-MESO/README
+"atom_style edpd"_atom_style.html
+"pair_style edpd"_pair_meso.html
+"pair_style mdpd"_pair_meso.html
+"pair_style tdpd"_pair_meso.html
+"fix mvv/dpd"_fix_mvv_dpd.html
+examples/USER/meso
+http://lammps.sandia.gov/movies.html#mesodpd :ul
+
+:line
+
USER-MOLFILE package :link(USER-MOLFILE),h4
[Contents:]
diff --git a/doc/src/atom_style.txt b/doc/src/atom_style.txt
index 077636dfd1..49d9dde791 100644
--- a/doc/src/atom_style.txt
+++ b/doc/src/atom_style.txt
@@ -13,17 +13,19 @@ atom_style command :h3
atom_style style args :pre
style = {angle} or {atomic} or {body} or {bond} or {charge} or {dipole} or \
- {dpd} or {electron} or {ellipsoid} or {full} or {line} or {meso} or \
- {molecular} or {peri} or {smd} or {sphere} or {tri} or \
- {template} or {hybrid} :ulb,l
+ {dpd} or {edpd} or {mdpd} or {tdpd} or {electron} or {ellipsoid} or \
+ {full} or {line} or {meso} or {molecular} or {peri} or {smd} or \
+ {sphere} or {tri} or {template} or {hybrid} :ulb,l
args = none for any style except the following
- {body} args = bstyle bstyle-args
- bstyle = style of body particles
- bstyle-args = additional arguments specific to the bstyle
- see the "body"_body.html doc page for details
- {template} args = template-ID
- template-ID = ID of molecule template specified in a separate "molecule"_molecule.html command
- {hybrid} args = list of one or more sub-styles, each with their args :pre
+ {body} args = bstyle bstyle-args
+ bstyle = style of body particles
+ bstyle-args = additional arguments specific to the bstyle
+ see the "body"_body.html doc page for details
+ {tdpd} arg = Nspecies
+ Nspecies = # of chemical species
+ {template} arg = template-ID
+ template-ID = ID of molecule template specified in a separate "molecule"_molecule.html command
+ {hybrid} args = list of one or more sub-styles, each with their args :pre
accelerated styles (with same args) = {angle/kk} or {atomic/kk} or {bond/kk} or {charge/kk} or {full/kk} or {molecular/kk} :l
:ule
@@ -36,7 +38,8 @@ atom_style full
atom_style body nparticle 2 10
atom_style hybrid charge bond
atom_style hybrid charge body nparticle 2 5
-atom_style template myMols :pre
+atom_style template myMols
+atom_style tdpd 2 :pre
[Description:]
@@ -74,6 +77,9 @@ quantities.
{charge} | charge | atomic system with charges |
{dipole} | charge and dipole moment | system with dipolar particles |
{dpd} | internal temperature and internal energies | DPD particles |
+{edpd} | temperature and heat capacity | eDPD particles |
+{mdpd} | density | mDPD particles |
+{tdpd} | chemical concentration | tDPD particles |
{electron} | charge and spin and eradius | electronic force field |
{ellipsoid} | shape, quaternion, angular momentum | aspherical particles |
{full} | molecular + charge | bio-molecules |
@@ -145,6 +151,19 @@ properties with internal temperature (dpdTheta), internal conductive
energy (uCond), internal mechanical energy (uMech), and internal
chemical energy (uChem).
+The {edpd} style is for energy-conserving dissipative particle
+dynamics (eDPD) particles which store a temperature (edpd_temp), and
+heat capacity (edpd_cv).
+
+The {mdpd} style is for many-body dissipative particle dynamics (mDPD)
+particles which store a density (rho) for considering
+density-dependent many-body interactions.
+
+The {tdpd} style is for transport dissipative particle dynamics (tDPD)
+particles which store a set of chemical concentrations. An integer
+"cc_species" is required to specify the number of chemical species
+involved in a tDPD system.
+
The {meso} style is for smoothed particle hydrodynamics (SPH)
particles which store a density (rho), energy (e), and heat capacity
(cv).
@@ -284,6 +303,11 @@ force fields"_pair_eff.html.
The {dpd} style is part of the USER-DPD package for dissipative
particle dynamics (DPD).
+The {edpd}, {mdpd}, and {tdpd} styles are part of the USER-MESO package
+for energy-conserving dissipative particle dynamics (eDPD), many-body
+dissipative particle dynamics (mDPD), and transport dissipative particle
+dynamics (tDPD), respectively.
+
The {meso} style is part of the USER-SPH package for smoothed particle
hydrodynamics (SPH). See "this PDF
guide"_USER/sph/SPH_LAMMPS_userguide.pdf to using SPH in LAMMPS.
diff --git a/doc/src/compute_cluster_atom.txt b/doc/src/compute_cluster_atom.txt
index 147d06c2a8..0aa38ae590 100644
--- a/doc/src/compute_cluster_atom.txt
+++ b/doc/src/compute_cluster_atom.txt
@@ -7,37 +7,62 @@
:line
compute cluster/atom command :h3
+compute fragment/atom command :h3
+compute aggregate/atom command :h3
[Syntax:]
-compute ID group-ID cluster/atom cutoff :pre
+compute ID group-ID cluster/atom cutoff
+compute ID group-ID fragment/atom
+compute ID group-ID aggregate/atom cutoff :pre
ID, group-ID are documented in "compute"_compute.html command
-cluster/atom = style name of this compute command
+{cluster/atom} or {fragment/atom} or {aggregate/atom} = style name of this compute command
cutoff = distance within which to label atoms as part of same cluster (distance units) :ul
[Examples:]
-compute 1 all cluster/atom 1.0 :pre
+compute 1 all cluster/atom 3.5
+compute 1 all fragment/atom
+compute 1 all aggregate/atom 3.5 :pre
[Description:]
-Define a computation that assigns each atom a cluster ID.
+Define a computation that assigns each atom a cluster, fragment,
+or aggregate ID.
A cluster is defined as a set of atoms, each of which is within the
cutoff distance from one or more other atoms in the cluster. If an
atom has no neighbors within the cutoff distance, then it is a 1-atom
-cluster. The ID of every atom in the cluster will be the smallest
-atom ID of any atom in the cluster.
+cluster.
+
+A fragment is similarly defined as a set of atoms, each of
+which has an explicit bond (i.e. defined via a "data file"_read_data.html,
+the "create_bonds"_create_bonds.html command, or through fixes like
+"fix bond/create"_fix_bond_create.html, "fix bond/swap"_fix_bond_swap.html,
+or "fix bond/break"_fix_bond_break.html). The cluster ID or fragment ID
+of every atom in the cluster will be set to the smallest atom ID of any atom
+in the cluster or fragment, respectively.
+
+An aggregate is defined by combining the rules for clusters and
+fragments, i.e. a set of atoms, where each atom is within the cutoff
+distance from one or more atoms within a fragment that is part of
+the same cluster. This measure can be used to track molecular assemblies
+like micelles.
Only atoms in the compute group are clustered and assigned cluster
-IDs. Atoms not in the compute group are assigned a cluster ID = 0.
+IDs. Atoms not in the compute group are assigned a cluster ID = 0.
+For fragments, only bonds where [both] atoms of the bond are included
+in the compute group are assigned to fragments, so that only fragments
+are detected where [all] atoms are in the compute group. Thus atoms
+may be included in the compute group, yet still have a fragment ID of 0.
-The neighbor list needed to compute this quantity is constructed each
-time the calculation is performed (i.e. each time a snapshot of atoms
-is dumped). Thus it can be inefficient to compute/dump this quantity
-too frequently or to have multiple compute/dump commands, each of a
-{cluster/atom} style.
+For computes {cluster/atom} and {aggregate/atom} the neighbor list needed
+to compute this quantity is constructed each time the calculation is
+performed (i.e. each time a snapshot of atoms is dumped). Thus it can be
+inefficient to compute/dump this quantity too frequently or to have
+multiple compute/dump commands, each of a {cluster/atom} or
+{aggregate/atom} style.
NOTE: If you have a bonded system, then the settings of
"special_bonds"_special_bonds.html command can remove pairwise
diff --git a/doc/src/compute_edpd_temp_atom.txt b/doc/src/compute_edpd_temp_atom.txt
new file mode 100644
index 0000000000..5b8c8ebd67
--- /dev/null
+++ b/doc/src/compute_edpd_temp_atom.txt
@@ -0,0 +1,62 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+compute edpd/temp/atom command :h3
+
+[Syntax:]
+
+compute ID group-ID edpd/temp/atom :pre
+
+ID, group-ID are documented in "compute"_compute.html command
+edpd/temp/atom = style name of this compute command :ul
+
+[Examples:]
+
+compute 1 all edpd/temp/atom :pre
+
+[Description:]
+
+Define a computation that calculates the per-atom temperature
+for each eDPD particle in a group.
+
+The temperature is a local temperature derived from the internal energy
+of each eDPD particle based on the local equilibrium hypothesis.
+For more details please see "(Espanol1997)"_#Espanol1997 and
+"(Li2014)"_#Li2014a.
+
+[Output info:]
+
+This compute calculates a per-atom vector, which can be accessed by
+any command that uses per-atom values from a compute as input. See
+"Section 6.15"_Section_howto.html#howto_15 for an overview of
+LAMMPS output options.
+
+The per-atom vector values will be in temperature "units"_units.html.
+
+[Restrictions:]
+
+This compute is part of the USER-MESO package. It is only enabled if
+LAMMPS was built with that package. See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+[Related commands:]
+
+"pair_style edpd"_pair_meso.html
+
+[Default:] none
+
+:line
+
+:link(Espanol1997)
+[(Espanol1997)] Espanol, Europhys Lett, 40(6): 631-636 (1997). DOI:
+10.1209/epl/i1997-00515-8
+
+:link(Li2014a)
+[(Li2014)] Li, Tang, Lei, Caswell, Karniadakis, J Comput Phys, 265:
+113-127 (2014). DOI: 10.1016/j.jcp.2014.02.003.
+
diff --git a/doc/src/compute_tdpd_cc_atom.txt b/doc/src/compute_tdpd_cc_atom.txt
new file mode 100644
index 0000000000..a6a12dc52c
--- /dev/null
+++ b/doc/src/compute_tdpd_cc_atom.txt
@@ -0,0 +1,60 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+compute tdpd/cc/atom command :h3
+
+[Syntax:]
+
+compute ID group-ID tdpd/cc/atom index :pre
+
+ID, group-ID are documented in "compute"_compute.html command
+tdpd/cc/atom = style name of this compute command
+index = index of chemical species (1 to Nspecies) :ul
+
+[Examples:]
+
+compute 1 all tdpd/cc/atom 2 :pre
+
+[Description:]
+
+Define a computation that calculates the per-atom chemical
+concentration of a specified species for each tDPD particle in a
+group.
+
+The chemical concentration of each species is defined as the number of
+molecules carried by a tDPD particle for dilute solution. For more
+details see "(Li2015)"_#Li2015a.
+
+[Output info:]
+
+This compute calculates a per-atom vector, which can be accessed by
+any command that uses per-atom values from a compute as input. See
+"Section 6.15"_Section_howto.html#howto_15 for an overview of
+LAMMPS output options.
+
+The per-atom vector values will be in the units of chemical species
+per unit mass.
+
+[Restrictions:]
+
+This compute is part of the USER-MESO package. It is only enabled if
+LAMMPS was built with that package. See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+[Related commands:]
+
+"pair_style tdpd"_pair_meso.html
+
+[Default:] none
+
+:line
+
+:link(Li2015a)
+[(Li2015)] Li, Yazdani, Tartakovsky, Karniadakis, J Chem Phys, 143:
+014101 (2015). DOI: 10.1063/1.4923254
+
diff --git a/doc/src/computes.txt b/doc/src/computes.txt
index 5a6ca66c46..c443bfaba2 100644
--- a/doc/src/computes.txt
+++ b/doc/src/computes.txt
@@ -30,6 +30,7 @@ Computes :h1
compute_displace_atom
compute_dpd
compute_dpd_atom
+ compute_edpd_temp_atom
compute_erotate_asphere
compute_erotate_rigid
compute_erotate_sphere
@@ -95,6 +96,7 @@ Computes :h1
compute_sna_atom
compute_stress_atom
compute_tally
+ compute_tdpd_cc_atom
compute_temp
compute_temp_asphere
compute_temp_body
diff --git a/doc/src/fix_dpd_source.txt b/doc/src/fix_dpd_source.txt
new file mode 100644
index 0000000000..b6decc657c
--- /dev/null
+++ b/doc/src/fix_dpd_source.txt
@@ -0,0 +1,101 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+fix edpd/source command :h3
+fix tdpd/source command :h3
+
+[Syntax:]
+
+fix ID group-ID edpd/source keyword values ...
+fix ID group-ID tdpd/source index keyword values ... :pre
+
+ID, group-ID are documented in "fix"_fix.html command :ulb,l
+edpd/source or tdpd/source = style name of this fix command :l
+index (only specified for tdpd/source) = index of chemical species (1 to Nspecies) :l
+keyword = {sphere} or {cuboid} :l
+ {sphere} values = cx,cy,cz,radius,source
+ cx,cy,cz = x,y,z center of spherical domain (distance units)
+ radius = radius of a spherical domain (distance units)
+ source = heat source or concentration source (flux units, see below)
+ {cuboid} values = cx,cy,cz,dLx,dLy,dLz,source
+ cx,cy,cz = x,y,z lower left corner of a cuboid domain (distance units)
+ dLx,dLy,dLz = x,y,z side length of a cuboid domain (distance units)
+ source = heat source or concentration source (flux units, see below) :pre
+:ule
+
+[Examples:]
+
+fix 1 all edpd/source sphere 0.0 0.0 0.0 5.0 0.01
+fix 1 all edpd/source cuboid 0.0 0.0 0.0 20.0 10.0 10.0 -0.01
+fix 1 all tdpd/source 1 sphere 5.0 0.0 0.0 5.0 0.01
+fix 1 all tdpd/source 2 cuboid 0.0 0.0 0.0 20.0 10.0 10.0 0.01 :pre
+
+[Description:]
+
+Fix {edpd/source} adds a heat source as an external heat flux to each
+atom in a spherical or cuboid domain, where the {source} is in units
+of energy/time. Fix {tdpd/source} adds an external concentration
+source of the chemical species specified by {index} as an external
+concentration flux for each atom in a spherical or cuboid domain,
+where the {source} is in units of mole/volume/time.
+
+This command can be used to give an additional heat/concentration
+source term to atoms in a simulation, such as for a simulation of
+heat conduction with a source term (see Fig.12 in "(Li2014)"_#Li2014b)
+or diffusion with a source term (see Fig.1 in "(Li2015)"_#Li2015b), as
+an analog of a periodic Poiseuille flow problem.
+
+If the {sphere} keyword is used, the {cx,cy,cz,radius} defines a
+spherical domain to apply the source flux to.
+
+If the {cuboid} keyword is used, the {cx,cy,cz,dLx,dLy,dLz} defines a
+cuboid domain to apply the source flux to.
+
+:line
+
+[Restart, fix_modify, output, run start/stop, minimize info:]
+
+No information about this fix is written to "binary restart
+files"_restart.html. None of the "fix_modify"_fix_modify.html options
+are relevant to this fix. No global or per-atom quantities are stored
+by this fix for access by various "output
+commands"_Section_howto.html#howto_15. No parameter of this fix can
+be used with the {start/stop} keywords of the "run"_run.html command.
+This fix is not invoked during "energy minimization"_minimize.html.
+
+[Restrictions:]
+
+This fix is part of the USER-MESO package. It is only enabled if
+LAMMPS was built with that package. See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+Fix {edpd/source} must be used with the "pair_style
+edpd"_pair_meso.html command. Fix {tdpd/source} must be used with the
+"pair_style tdpd"_pair_meso.html command.
+
+[Related commands:]
+
+"pair_style edpd"_pair_meso.html, "pair_style tdpd"_pair_meso.html,
+"compute edpd/temp/atom"_compute_edpd_temp_atom.html, "compute
+tdpd/cc/atom"_compute_tdpd_cc_atom.html
+
+[Default:] none
+
+:line
+
+:link(Li2014b)
+[(Li2014)] Z. Li, Y.-H. Tang, H. Lei, B. Caswell and G.E. Karniadakis,
+"Energy-conserving dissipative particle dynamics with
+temperature-dependent properties", J. Comput. Phys., 265: 113-127
+(2014). DOI: 10.1016/j.jcp.2014.02.003
+
+:link(Li2015b)
+[(Li2015)] Z. Li, A. Yazdani, A. Tartakovsky and G.E. Karniadakis,
+"Transport dissipative particle dynamics model for mesoscopic
+advection-diffusion-reaction problems", J. Chem. Phys., 143: 014101
+(2015). DOI: 10.1063/1.4923254
diff --git a/doc/src/fix_mvv_dpd.txt b/doc/src/fix_mvv_dpd.txt
new file mode 100644
index 0000000000..3c1c1a7cba
--- /dev/null
+++ b/doc/src/fix_mvv_dpd.txt
@@ -0,0 +1,97 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+fix mvv/dpd command :h3
+fix mvv/edpd command :h3
+fix mvv/tdpd command :h3
+
+[Syntax:]
+
+fix ID group-ID mvv/dpd lambda :pre
+fix ID group-ID mvv/edpd lambda :pre
+fix ID group-ID mvv/tdpd lambda :pre
+
+ID, group-ID are documented in "fix"_fix.html command
+mvv/dpd, mvv/edpd, mvv/tdpd = style name of this fix command
+lambda = (optional) relaxation parameter (unitless) :ul
+
+[Examples:]
+
+fix 1 all mvv/dpd
+fix 1 all mvv/dpd 0.5
+fix 1 all mvv/edpd
+fix 1 all mvv/edpd 0.5
+fix 1 all mvv/tdpd
+fix 1 all mvv/tdpd 0.5 :pre
+
+[Description:]
+
+Perform time integration using the modified velocity-Verlet (MVV)
+algorithm to update position and velocity (fix mvv/dpd), or position,
+velocity and temperature (fix mvv/edpd), or position, velocity and
+concentration (fix mvv/tdpd) for particles in the group each timestep.
+
+The modified velocity-Verlet (MVV) algorithm aims to improve the
+stability of the time integrator by using an extrapolated version of
+the velocity for the force evaluation:
+
+:c,image(Eqs/fix_mvv_dpd.jpg)
+
+where the parameter λ depends on the
+specific choice of DPD parameters, and needs to be tuned on a
+case-by-case basis. Specification of a {lambda} value is optional.
+If specified, the setting must be from 0.0 to 1.0. If not specified,
+a default value of 0.5 is used, which effectively reproduces the
+standard velocity-Verlet (VV) scheme. For more details, see
+"Groot"_#Groot2.
+
+Fix {mvv/dpd} updates the position and velocity of each atom. It can
+be used with the "pair_style mdpd"_pair_meso.html command or other
+pair styles such as "pair dpd"_pair_dpd.html.
+
+Fix {mvv/edpd} updates the per-atom temperature, in addition to
+position and velocity, and must be used with the "pair_style
+edpd"_pair_meso.html command.
+
+Fix {mvv/tdpd} updates the per-atom chemical concentration, in
+addition to position and velocity, and must be used with the
+"pair_style tdpd"_pair_meso.html command.
+
+:line
+
+[Restart, fix_modify, output, run start/stop, minimize info:]
+
+No information about this fix is written to "binary restart
+files"_restart.html. None of the "fix_modify"_fix_modify.html options
+are relevant to this fix. No global or per-atom quantities are stored
+by this fix for access by various "output
+commands"_Section_howto.html#howto_15. No parameter of this fix can
+be used with the {start/stop} keywords of the "run"_run.html command.
+This fix is not invoked during "energy minimization"_minimize.html.
+
+[Restrictions:]
+
+This fix is part of the USER-MESO package. It is only enabled if
+LAMMPS was built with that package. See the "Making
+LAMMPS"_Section_start.html#start_3 section for more info.
+
+[Related commands:]
+
+"pair_style mdpd"_pair_meso.html, "pair_style edpd"_pair_meso.html,
+"pair_style tdpd"_pair_meso.html
+
+[Default:]
+
+The default value for the optional {lambda} parameter is 0.5.
+
+:line
+
+:link(Groot2)
+[(Groot)] Groot and Warren, J Chem Phys, 107: 4423-4435 (1997). DOI:
+10.1063/1.474784
+
diff --git a/doc/src/fix_qeq.txt b/doc/src/fix_qeq.txt
index 22f4766896..194361e990 100644
--- a/doc/src/fix_qeq.txt
+++ b/doc/src/fix_qeq.txt
@@ -90,9 +90,14 @@ file specified by {qfile}. The file has the following format
...
Ntype chi eta gamma zeta qcore :pre
-There is one line per atom type with the following parameters.
+There have to be parameters given for every atom type. Wildcard entries
+are possible using the same syntax as elsewhere in LAMMPS
+(i.e., n*m, n*, *m, *). Later entries will overwrite previous ones.
+Empty lines or any text following the pound sign (#) are ignored.
+Each line starts with the atom type followed by five parameters.
Only a subset of the parameters is used by each QEq style as described
-below, thus the others can be set to 0.0 if desired.
+below, thus the others can be set to 0.0 if desired, but all five
+entries per line are required.
{chi} = electronegativity in energy units
{eta} = self-Coulomb potential in energy units
diff --git a/doc/src/fix_wall_ees.txt b/doc/src/fix_wall_ees.txt
index a8688e8e41..f141a19405 100644
--- a/doc/src/fix_wall_ees.txt
+++ b/doc/src/fix_wall_ees.txt
@@ -50,17 +50,17 @@ fix ees_cube all wall/region/ees myCube 1.0 1.0 2.5 :pre
Fix {wall/ees} bounds the simulation domain on one or more of its
faces with a flat wall that interacts with the ellipsoidal atoms in the
group by generating a force on the atom in a direction perpendicular to
-the wall and a torque parallel with the wall. The energy of
+the wall and a torque parallel with the wall. The energy of
wall-particle interactions E is given by:
:c,image(Eqs/fix_wall_ees.jpg)
Introduced by Babadi and Ejtehadi in "(Babadi)"_#BabadiEjtehadi. Here,
{r} is the distance from the particle to the wall at position {coord},
-and Rc is the {cutoff} distance at which the particle and wall no
-longer interact. Also, sigma_n is the distance between center of
-ellipsoid and the nearest point of its surface to the wall The energy
-of the wall (see the image below).
+and Rc is the {cutoff} distance at which the particle and wall no
+longer interact. Also, sigma_n is the distance between center of
+ellipsoid and the nearest point of its surface to the wall. The energy
+of the wall is:
:c,image(JPG/fix_wall_ees_image.jpg)
@@ -68,21 +68,22 @@ Details of using this command and specifications are the same as
fix/wall command. You can also find an example in USER/ees/ under
examples/ directory.
-The prefactor {epsilon} can be thought of as an
-effective Hamaker constant with energy units for the strength of the
-ellipsoid-wall interaction. More specifically, the {epsilon} pre-factor
-= 8 * pi^2 * rho_wall * rho_ellipsoid * epsilon
-* sigma_a * sigma_b * sigma_c, where epsilon is the LJ parameters for
-the constituent LJ particles and sigma_a, sigma_b, and sigma_c are radii
-of ellipsoidal particles. Rho_wall and rho_ellipsoid are the number
+The prefactor {epsilon} can be thought of as an
+effective Hamaker constant with energy units for the strength of the
+ellipsoid-wall interaction. More specifically, the {epsilon} pre-factor
+= 8 * pi^2 * rho_wall * rho_ellipsoid * epsilon
+* sigma_a * sigma_b * sigma_c, where epsilon is the LJ parameters for
+the constituent LJ particles and sigma_a, sigma_b, and sigma_c are radii
+of ellipsoidal particles. Rho_wall and rho_ellipsoid are the number
density of the constituent particles, in the wall and ellipsoid
respectively, in units of 1/volume.
NOTE: You must insure that r is always bigger than sigma_n for
-all particles in the group, or LAMMPS will generate an error. This
+all particles in the group, or LAMMPS will generate an error. This
means you cannot start your simulation with particles touching the wall
-position {coord} (r = sigma_n) or with particles penetrating the wall (0 =< r < sigma_n) or with particles on the wrong side of the
-wall (r < 0).
+position {coord} (r = sigma_n) or with particles penetrating the wall
+(0 <= r < sigma_n) or with particles on the wrong side of the
+wall (r < 0).
Fix {wall/region/ees} treats the surface of the geometric region defined
@@ -93,7 +94,7 @@ Other details of this command are the same as for the "fix
wall/region"_fix_wall_region.html command. One may also find an example
of using this fix in the examples/USER/misc/ees/ directory.
-[Restrictions:]
+[Restrictions:]
This fix is part of the USER-MISC package. It is only enabled if
LAMMPS was built with that package. See the "Making
diff --git a/doc/src/fixes.txt b/doc/src/fixes.txt
index 3dc5e77e14..7000a66c51 100644
--- a/doc/src/fixes.txt
+++ b/doc/src/fixes.txt
@@ -33,6 +33,7 @@ Fixes :h1
fix_drude
fix_drude_transform
fix_dpd_energy
+ fix_dpd_source
fix_dt_reset
fix_efield
fix_ehex
@@ -71,6 +72,7 @@ Fixes :h1
fix_move
fix_mscg
fix_msst
+ fix_mvv_dpd
fix_neb
fix_nh
fix_nh_eff
diff --git a/doc/src/lammps.book b/doc/src/lammps.book
index 76b6743657..f4d3566a6e 100644
--- a/doc/src/lammps.book
+++ b/doc/src/lammps.book
@@ -21,6 +21,7 @@ Section_python.html
Section_errors.html
Section_history.html
+tutorial_bash_on_windows.html
tutorial_drude.html
tutorial_github.html
tutorial_pylammps.html
@@ -156,6 +157,7 @@ fix_controller.html
fix_deform.html
fix_deposit.html
fix_dpd_energy.html
+fix_dpd_source.html
fix_drag.html
fix_drude.html
fix_drude_transform.html
@@ -197,6 +199,7 @@ fix_momentum.html
fix_move.html
fix_mscg.html
fix_msst.html
+fix_mvv_dpd.html
fix_neb.html
fix_nh.html
fix_nh_eff.html
@@ -315,6 +318,7 @@ compute_dipole_chunk.html
compute_displace_atom.html
compute_dpd.html
compute_dpd_atom.html
+compute_edpd_temp_atom.html
compute_erotate_asphere.html
compute_erotate_rigid.html
compute_erotate_sphere.html
@@ -380,6 +384,7 @@ compute_smd_vol.html
compute_sna_atom.html
compute_stress_atom.html
compute_tally.html
+compute_tdpd_cc_atom.html
compute_temp.html
compute_temp_asphere.html
compute_temp_body.html
@@ -457,6 +462,7 @@ pair_mdf.html
pair_meam.html
pair_meam_spline.html
pair_meam_sw_spline.html
+pair_meso.html
pair_mgpt.html
pair_mie.html
pair_momb.html
@@ -644,4 +650,3 @@ USER/atc/man_unfix_flux.html
USER/atc/man_unfix_nodes.html
USER/atc/man_write_atom_weights.html
USER/atc/man_write_restart.html
-
diff --git a/doc/src/pair_dpd.txt b/doc/src/pair_dpd.txt
index 9dd204ad2d..8d194bb092 100644
--- a/doc/src/pair_dpd.txt
+++ b/doc/src/pair_dpd.txt
@@ -36,7 +36,7 @@ pair_coeff 1 1 1.0 1.0 :pre
[Description:]
Style {dpd} computes a force field for dissipative particle dynamics
-(DPD) following the exposition in "(Groot)"_#Groot.
+(DPD) following the exposition in "(Groot)"_#Groot1.
Style {dpd/tstat} invokes a DPD thermostat on pairwise interactions,
which is equivalent to the non-conservative portion of the DPD force
@@ -196,7 +196,7 @@ langevin"_fix_langevin.html, "pair_style srp"_pair_srp.html
:line
-:link(Groot)
+:link(Groot1)
[(Groot)] Groot and Warren, J Chem Phys, 107, 4423-35 (1997).
:link(Afshar)
diff --git a/doc/src/pair_meso.txt b/doc/src/pair_meso.txt
new file mode 100644
index 0000000000..bcdf717d68
--- /dev/null
+++ b/doc/src/pair_meso.txt
@@ -0,0 +1,277 @@
+"LAMMPS WWW Site"_lws - "LAMMPS Documentation"_ld - "LAMMPS Commands"_lc :c
+
+:link(lws,http://lammps.sandia.gov)
+:link(ld,Manual.html)
+:link(lc,Section_commands.html#comm)
+
+:line
+
+pair_style edpd command :h3
+pair_style mdpd command :h3
+pair_style mdpd/rhosum command :h3
+pair_style tdpd command :h3
+
+[Syntax:]
+
+pair_style style args :pre
+
+style = {edpd} or {mdpd} or {mdpd/rhosum} or {tdpd} :ulb,l
+args = list of arguments for a particular style :l
+ {edpd} args = cutoff seed
+ cutoff = global cutoff for eDPD interactions (distance units)
+ seed = random # seed (integer) (if <= 0, eDPD will use current time as the seed)
+ {mdpd} args = T cutoff seed
+ T = temperature (temperature units)
+ cutoff = global cutoff for mDPD interactions (distance units)
+ seed = random # seed (integer) (if <= 0, mDPD will use current time as the seed)
+ {mdpd/rhosum} args = none
+ {tdpd} args = T cutoff seed
+ T = temperature (temperature units)
+ cutoff = global cutoff for tDPD interactions (distance units)
+ seed = random # seed (integer) (if <= 0, tDPD will use current time as the seed) :pre
+:ule
+
+[Examples:]
+
+pair_style edpd 1.58 9872598
+pair_coeff * * 18.75 4.5 0.41 1.58 1.42E-5 2.0 1.58
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.42E-5 2.0 1.58 power 10.54 -3.66 3.44 -4.10
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.42E-5 2.0 1.58 power 10.54 -3.66 3.44 -4.10 kappa -0.44 -3.21 5.04 0.00 :pre
+
+pair_style hybrid/overlay mdpd/rhosum mdpd 1.0 1.0 65689
+pair_coeff 1 1 mdpd/rhosum 0.75
+pair_coeff 1 1 mdpd -40.0 25.0 18.0 1.0 0.75 :pre
+
+pair_style tdpd 1.0 1.58 935662
+pair_coeff * * 18.75 4.5 0.41 1.58 1.58 1.0 1.0E-5 2.0
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.58 1.0 1.0E-5 2.0 3.0 1.0E-5 2.0 :pre
+
+[Description:]
+
+The {edpd} style computes the pairwise interactions and heat fluxes
+for eDPD particles following the formulations in
+"(Li2014_JCP)"_#Li2014_JCP and "(Li2015_CC)"_#Li2015_CC. The time
+evolution of an eDPD particle is governed by the conservation of
+momentum and energy given by
+
+:c,image(Eqs/pair_edpd_gov.jpg)
+
+where the three components of F_i,
+namely the conservative force F_ij^C, dissipative force F_ij^D, and random force F_ij^R, are expressed as
+
+:c,image(Eqs/pair_edpd_force.jpg)
+
+in which the exponent of the weighting function s can be defined as a temperature-dependent
+variable. The heat fluxes between particles, comprising the
+collisional heat flux q^C, viscous
+heat flux q^V, and random heat flux
+q^R, are given by
+
+:c,image(Eqs/pair_edpd_heat.jpg)
+
+where the mesoscopic heat friction κ is given by
+
+:c,image(Eqs/pair_edpd_kappa.jpg)
+
+with υ being the kinematic
+viscosity. For more details, see Eq.(15) in "(Li2014_JCP)"_#Li2014_JCP.
+
+The following coefficients must be defined in eDPD system for each
+pair of atom types via the "pair_coeff"_pair_coeff.html command as in
+the examples above.
+
+A (force units)
+gamma (force/velocity units)
+power_f (positive real)
+cutoff (distance units)
+kappa (thermal conductivity units)
+power_T (positive real)
+cutoff_T (distance units)
+optional keyword = power or kappa :ul
+
+The keyword {power} or {kappa} is optional. Both "power" and "kappa"
+require 4 parameters c1, c2,
+c3, c4 showing the temperature dependence
+of the exponent
+s(T) =
+power_f*(1+c1*(T-1)+c2*(T-1)^2
++c3*(T-1)^3+c4*(T-1)^4)
+and of the mesoscopic heat friction
+s_T(T) =
+kappa*(1+c1*(T-1)+c2*(T-1)^2
++c3*(T-1)^3+c4*(T-1)^4)
+If the keyword {power} or {kappa} is not specified, the eDPD system
+will use constant power_f and kappa, which are independent of
+temperature changes.
+
+:line
+
+The {mdpd/rhosum} style computes the local particle mass density rho
+for mDPD particles by kernel function interpolation.
+
+The following coefficients must be defined for each pair of atom types
+via the "pair_coeff"_pair_coeff.html command as in the examples above.
+
+cutoff (distance units) :ul
+
+:line
+
+The {mdpd} style computes the many-body interactions between mDPD
+particles following the formulations in
+"(Li2013_POF)"_#Li2013_POF. The dissipative and random forces have
+the same form as in classical DPD, but the conservative force
+depends on the local density. The forces are given by
+
+:c,image(Eqs/pair_mdpd_force.jpg)
+
+where the first term in F^C with a
+negative coefficient A < 0 stands for an attractive force within an
+interaction range r_c, and the second
+term with B > 0 is the density-dependent repulsive force within an
+interaction range r_d.
+
+The following coefficients must be defined for each pair of atom types via the
+"pair_coeff"_pair_coeff.html command as in the examples above.
+
+A (force units)
+B (force units)
+gamma (force/velocity units)
+cutoff_c (distance units)
+cutoff_d (distance units) :ul
+
+:line
+
+The {tdpd} style computes the pairwise interactions and chemical
+concentration fluxes for tDPD particles following the formulations in
+"(Li2015_JCP)"_#Li2015_JCP. The time evolution of a tDPD particle is
+governed by the conservation of momentum and concentration given by
+
+:c,image(Eqs/pair_tdpd_gov.jpg)
+
+where the three components of F_i,
+namely the conservative force F_ij^C, dissipative force F_ij^D, and random force F_ij^R, are expressed as
+
+:c,image(Eqs/pair_tdpd_force.jpg)
+
+The concentration flux between two tDPD particles includes the Fickian
+flux Q_ij^D and random flux
+Q_ij^R, which are given by
+
+:c,image(Eqs/pair_tdpd_flux.jpg)
+
+where the parameters kappa and epsilon determine the strength of the
+Fickian and random fluxes. m_s
+is the mass of a single solute molecule. In general, m_s is much smaller than the mass of
+a tDPD particle m. For more details, see
+"(Li2015_JCP)"_#Li2015_JCP.
+
+The following coefficients must be defined for each pair of atom types via the
+"pair_coeff"_pair_coeff.html command as in the examples above.
+
+A (force units)
+gamma (force/velocity units)
+power_f (positive real)
+cutoff (distance units)
+cutoff_CC (distance units)
+kappa_i (diffusivity units)
+epsilon_i (diffusivity units)
+power_cc_i (positive real) :ul
+
+The last 3 values must be repeated Nspecies times, so that values for
+each of the Nspecies chemical species are specified, as indicated by
+the "_i" suffix. In the first pair_coeff example above for pair_style
+tdpd, Nspecies = 1. In the second example, Nspecies = 2, so 3
+additional coeffs are specified (for species 2).
+
+:line
+
+[Example scripts]
+
+There are example scripts for using all these pair styles in
+examples/USER/meso. The example for an eDPD simulation models heat
+conduction with source terms, analogous to the periodic Poiseuille flow
+problem. The setup follows Fig.12 in "(Li2014_JCP)"_#Li2014_JCP. The
+output of the short eDPD simulation (about 2 minutes on a single core)
+gives temperature and density profiles as
+
+:c,image(JPG/examples_edpd.jpg)
+
+The example for a mDPD simulation models the oscillations of a liquid
+droplet started from a liquid film. The mDPD parameters are adopted
+from "(Li2013_POF)"_#Li2013_POF. The short mDPD run (about 2 minutes
+on a single core) generates a particle trajectory which can
+be visualized as follows.
+
+:c,image(JPG/examples_mdpd_first.jpg,JPG/examples_mdpd.gif)
+:c,image(JPG/examples_mdpd_last.jpg)
+
+The first image is the initial state of the simulation. If you
+click it a GIF movie should play in your browser. The second image
+is the final state of the simulation.
+
+The example for a tDPD simulation computes the effective diffusion
+coefficient of a tDPD system using a method analogous to the periodic
+Poiseuille flow. The tDPD system is specified with two chemical
+species, and the setup follows Fig.1 in
+"(Li2015_JCP)"_#Li2015_JCP. The output of the short tDPD simulation
+(about one and a half minutes on a single core) gives the
+concentration profiles of the two chemical species as
+
+:c,image(JPG/examples_tdpd.jpg)
+
+:line
+
+[Mixing, shift, table, tail correction, restart, rRESPA info]:
+
+The styles {edpd}, {mdpd}, {mdpd/rhosum} and {tdpd} do not support
+mixing. Thus, coefficients for all I,J pairs must be specified explicitly.
+
+The styles {edpd}, {mdpd}, {mdpd/rhosum} and {tdpd} do not support
+the "pair_modify"_pair_modify.html shift, table, and tail options.
+
+The styles {edpd}, {mdpd}, {mdpd/rhosum} and {tdpd} do not write
+information to "binary restart files"_restart.html. Thus, you need
+to re-specify the pair_style and pair_coeff commands in an input script
+that reads a restart file.
+
+[Restrictions:]
+
+The pair styles {edpd}, {mdpd}, {mdpd/rhosum} and {tdpd} are part of
+the USER-MESO package. They are only enabled if LAMMPS was built with
+that package. See the "Making LAMMPS"_Section_start.html#start_3
+section for more info.
+
+[Related commands:]
+
+"pair_coeff"_pair_coeff.html, "fix mvv/dpd"_fix_mvv_dpd.html,
+"fix mvv/edpd"_fix_mvv_dpd.html, "fix mvv/tdpd"_fix_mvv_dpd.html,
+"fix edpd/source"_fix_dpd_source.html, "fix tdpd/source"_fix_dpd_source.html,
+"compute edpd/temp/atom"_compute_edpd_temp_atom.html,
+"compute tdpd/cc/atom"_compute_tdpd_cc_atom.html
+
+[Default:] none
+
+:line
+
+:link(Li2014_JCP)
+[(Li2014_JCP)] Li, Tang, Lei, Caswell, Karniadakis, J Comput Phys,
+265: 113-127 (2014). DOI: 10.1016/j.jcp.2014.02.003.
+
+:link(Li2015_CC)
+[(Li2015_CC)] Li, Tang, Li, Karniadakis, Chem Commun, 51: 11038-11040
+(2015). DOI: 10.1039/C5CC01684C.
+
+:link(Li2013_POF)
+[(Li2013_POF)] Li, Hu, Wang, Ma, Zhou, Phys Fluids, 25: 072103 (2013).
+DOI: 10.1063/1.4812366.
+
+:link(Li2015_JCP)
+[(Li2015_JCP)] Li, Yazdani, Tartakovsky, Karniadakis, J Chem Phys,
+143: 014101 (2015). DOI: 10.1063/1.4923254.
diff --git a/doc/src/pair_snap.txt b/doc/src/pair_snap.txt
index fa90dc34e9..593765aa63 100644
--- a/doc/src/pair_snap.txt
+++ b/doc/src/pair_snap.txt
@@ -10,8 +10,7 @@ pair_style snap command :h3
[Syntax:]
-pair_style snap
-:pre
+pair_style snap :pre
[Examples:]
@@ -20,17 +19,16 @@ pair_coeff * * InP.snapcoeff In P InP.snapparam In In P P :pre
[Description:]
-Pair style {snap} computes interactions
-using the spectral neighbor analysis potential (SNAP)
-"(Thompson)"_#Thompson20142. Like the GAP framework of Bartok et al.
-"(Bartok2010)"_#Bartok20102, "(Bartok2013)"_#Bartok2013
-which uses bispectrum components
+Pair style {snap} computes interactions using the spectral
+neighbor analysis potential (SNAP) "(Thompson)"_#Thompson20142.
+Like the GAP framework of Bartok et al. "(Bartok2010)"_#Bartok20102,
+"(Bartok2013)"_#Bartok2013 which uses bispectrum components
to characterize the local neighborhood of each atom
in a very general way. The mathematical definition of the
bispectrum calculation used by SNAP is identical
to that used by "compute sna/atom"_compute_sna_atom.html.
In SNAP, the total energy is decomposed into a sum over
-atom energies. The energy of atom {i } is
+atom energies. The energy of atom {i} is
expressed as a weighted sum over bispectrum components.
:c,image(Eqs/pair_snap.jpg)
diff --git a/doc/src/pairs.txt b/doc/src/pairs.txt
index 2c1b20f4d3..ec21b7a02e 100644
--- a/doc/src/pairs.txt
+++ b/doc/src/pairs.txt
@@ -58,6 +58,7 @@ Pair Styles :h1
pair_meam
pair_meam_spline
pair_meam_sw_spline
+ pair_meso
pair_mgpt
pair_mie
pair_momb
diff --git a/doc/src/read_data.txt b/doc/src/read_data.txt
index a8aca53693..8c9aa02e22 100644
--- a/doc/src/read_data.txt
+++ b/doc/src/read_data.txt
@@ -374,10 +374,9 @@ needed if new bonds (angles, dihedrals, impropers) will be added to
the system when a simulation runs, e.g. by using the "fix
bond/create"_fix_bond_create.html command. Using this header flag
is deprecated; please use the {extra/bond/per/atom} keyword (and
-correspondingly for angles, dihedrals and impropers) in the
-read_data command instead. Either will pre-allocate space in LAMMPS
- data structures for storing the new bonds (angles,
-dihedrals, impropers).
+correspondingly for angles, dihedrals and impropers) in the read_data
+command instead. Either will pre-allocate space in LAMMPS data
+structures for storing the new bonds (angles, dihedrals, impropers).
The "extra special per atom" setting is typically only needed if new
bonds/angles/etc will be added to the system, e.g. by using the "fix
@@ -547,6 +546,9 @@ bond: atom-ID molecule-ID atom-type x y z
charge: atom-ID atom-type q x y z
dipole: atom-ID atom-type q x y z mux muy muz
dpd: atom-ID atom-type theta x y z
+edpd: atom-ID atom-type edpd_temp edpd_cv x y z
+mdpd: atom-ID atom-type x y z
+tdpd: atom-ID atom-type x y z cc1 cc2 ... ccNspecies
electron: atom-ID atom-type q spin eradius x y z
ellipsoid: atom-ID atom-type ellipsoidflag density x y z
full: atom-ID molecule-ID atom-type q x y z
@@ -566,12 +568,15 @@ The per-atom values have these meanings and units, listed alphabetically:
atom-ID = integer ID of atom
atom-type = type of atom (1-Ntype)
bodyflag = 1 for body particles, 0 for point particles
+cc = chemical concentration for tDPD particles for each species (mole/volume units)
contact-radius = ??? (distance units)
cs_re,cs_im = real/imaginary parts of wavepacket coefficients
cv = heat capacity (need units) for SPH particles
density = density of particle (mass/distance^3 or mass/distance^2 or mass/distance units, depending on dimensionality of particle)
diameter = diameter of spherical atom (distance units)
e = energy (need units) for SPH particles
+edpd_temp = temperature for eDPD particles (temperature units)
+edpd_cv = volumetric heat capacity for eDPD particles (energy/temperature/volume units)
ellipsoidflag = 1 for ellipsoidal particles, 0 for point particles
eradius = electron radius (or fixed-core radius)
etag = integer ID of electron that each wavepacket belongs to
diff --git a/doc/src/set.txt b/doc/src/set.txt
index 14460c9741..4757d1c575 100644
--- a/doc/src/set.txt
+++ b/doc/src/set.txt
@@ -24,7 +24,7 @@ keyword = {type} or {type/fraction} or {mol} or {x} or {y} or {z} or \
{bond} or {angle} or {dihedral} or {improper} or \
{meso/e} or {meso/cv} or {meso/rho} or \
{smd/contact/radius} or {smd/mass/density} or {dpd/theta} or \
- {i_name} or {d_name} :l
+ {edpd/temp} or {edpd/cv} or {cc} or {i_name} or {d_name} :l
{type} value = atom type
value can be an atom-style variable (see below)
{type/fraction} values = type fraction seed
@@ -98,6 +98,13 @@ keyword = {type} or {type/fraction} or {mol} or {x} or {y} or {z} or \
{dpd/theta} value = internal temperature of DPD particles (temperature units)
value can be an atom-style variable (see below)
value can be NULL which sets internal temp of each particle to KE temp
+ {edpd/temp} value = temperature of eDPD particles (temperature units)
+ value can be an atom-style variable (see below)
+ {edpd/cv} value = volumetric heat capacity of eDPD particles (energy/temperature/volume units)
+ value can be an atom-style variable (see below)
+ {cc} values = index cc
+ index = index of a chemical species (1 to Nspecies)
+ cc = chemical concentration of tDPD particles for a species (mole/volume units)
{i_name} value = value for custom integer vector with name
{d_name} value = value for custom floating-point vector with name :pre
:ule
@@ -418,6 +425,19 @@ value >= 0.0, the internal temperature is set to that value. If it is
< 0.0, the computation of Tkin is performed and the internal
temperature is set to that value.
+Keywords {edpd/temp} and {edpd/cv} set the temperature and volumetric
+heat capacity of an eDPD particle as defined by the USER-MESO package.
+Currently, only "atom_style edpd"_atom_style.html defines particles
+with these attributes. The values for the temperature and heat
+capacity must be positive.
+
+Keyword {cc} sets the chemical concentration of a tDPD particle for a
+specified species as defined by the USER-MESO package. Currently, only
+"atom_style tdpd"_atom_style.html defines particles with this
+attribute. An integer for "index" selects a chemical species (1 to
+Nspecies) where Nspecies is set by the atom_style command. The value
+for the chemical concentration must be >= 0.0.
+
Keywords {i_name} and {d_name} refer to custom integer and
floating-point properties that have been added to each atom via the
"fix property/atom"_fix_property_atom.html command. When that command
diff --git a/examples/USER/meso/README b/examples/USER/meso/README
new file mode 100644
index 0000000000..4accc7ba01
--- /dev/null
+++ b/examples/USER/meso/README
@@ -0,0 +1,40 @@
+This directory contains input scripts for performing
+simulations with these models:
+
+eDPD - energy-conserving dissipative particle dynamics
+mDPD - many-body dissipative particle dynamics
+tDPD - transport dissipative particle dynamics
+
+1) eDPD: The input script in.edpd is an example simulation of
+measuring the thermal conductivity by heat conduction analog of
+periodic Poiseuille flow. The initial eDPD system is randomly filled
+by many eDPD particles, and a set command "edpd/temp" gives the
+initial temperature and a set command "edpd/cv" gives the heat
+capacity of eDPD particles. A non-contact heat source/sink term is
+applied by a fix command "edpd/source". A compute command
+"edpd/temp/atom" obtains the temperature on each eDPD particle. The
+simulation will generate a file named "temp.profile" showing the
+temperature profile. For details please see online LAMMPS
+documentation and Fig.12 in the paper Z. Li, et al. J Comput Phys,
+2014, 265: 113-127. DOI: 10.1016/j.jcp.2014.02.003
+
+2) mDPD: The input script "in.mdpd" is an example simulation of
+oscillations of a free liquid droplet. The initial configuration is a
+liquid film whose particles are in a fcc lattice created by the
+command "create_atoms". Then the liquid film has a tendency to form a
+spherical droplet under the effect of surface tension. For details
+please see online LAMMPS documentation and the paper Z. Li, et
+al. Phys Fluids, 2013, 25: 072103. DOI: 10.1063/1.4812366
+
+3) tDPD: The input script in.tdpd is an example simulation of
+computing the effective diffusion coefficient of a tDPD system using a
+method analogous to the periodic Poiseuille flow. Command "atom_style
+tdpd 2" specifies the tDPD system with two chemical species. The
+initial tDPD system is randomly filled by many tDPD particles, and a
+set "cc" command gives initial concentration for each chemical
+species. Fix commands "tdpd/source" add source terms and compute
+commands "tdpd/cc/atom" obtain the chemical concentration on each tDPD
+particle. The simulation will generate a file named "cc.profile"
+showing the concentration profiles of the two chemical species. For
+details please see online LAMMPS documentation and Fig.1 in the paper
+Z. Li, et al. J Chem Phys, 2015, 143: 014101. DOI: 10.1063/1.4923254
diff --git a/examples/USER/meso/edpd/in.edpd b/examples/USER/meso/edpd/in.edpd
new file mode 100644
index 0000000000..316099bce3
--- /dev/null
+++ b/examples/USER/meso/edpd/in.edpd
@@ -0,0 +1,54 @@
+########################################################################
+### Heat conduction analog of periodic Poiseuille flow problem ###
+### using energy-conserving DPD (eDPD) simulation ###
+### ###
+### Created : Zhen Li (zhen_li@brown.edu) ###
+### Division of Applied Mathematics, Brown University. ###
+### ###
+### eDPD system setup follows Fig.12 in the publication: ###
+### Z. Li, Y.-H. Tang, H. Lei, B. Caswell and G.E. Karniadakis. ###
+### "Energy-conserving dissipative particle dynamics with ###
+### temperature-dependent properties". J. Comput. Phys., ###
+### 2014, 265: 113-127. DOI: 10.1016/j.jcp.2014.02.003 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.2 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style edpd
+
+region edpd block -10 10 -10 10 -5 5 units box
+create_box 1 edpd
+create_atoms 1 random 16000 276438 NULL
+mass 1 1.0
+set atom * edpd/temp 1.0
+set atom * edpd/cv 1.0E5
+
+pair_style edpd 1.58 9872598
+#pair_coeff 1 1 18.75 4.5 0.41 1.58 1.45E-5 2.0 1.58
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.41E-5 2.0 1.58 &
+ power 10.54 -3.66 3.44 -4.10 &
+ kappa -0.44 -3.21 5.04 0.00
+
+compute mythermo all temp
+thermo 100
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 432982 loop local dist gaussian
+
+fix mvv all mvv/edpd 0.5
+fix upper all edpd/source cuboid 0.0 5.0 0.0 20.0 10.0 10.0 0.01
+fix lower all edpd/source cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 -0.01
+
+timestep 0.01
+run 500
+reset_timestep 0
+
+compute temp all edpd/temp/atom
+compute ccT all chunk/atom bin/1d y 0.0 1.0
+fix stat all ave/chunk 1 500 500 ccT c_temp density/number norm sample file temp.profile
+
+run 500
diff --git a/examples/USER/meso/edpd/log.16Aug17.edpd.g++.1 b/examples/USER/meso/edpd/log.16Aug17.edpd.g++.1
new file mode 100644
index 0000000000..af975f877c
--- /dev/null
+++ b/examples/USER/meso/edpd/log.16Aug17.edpd.g++.1
@@ -0,0 +1,142 @@
+LAMMPS (11 Aug 2017)
+########################################################################
+### Heat conduction analog of periodic Poiseuille flow problem ###
+### using energy-conserving DPD (eDPD) simulation ###
+### ###
+### Created : Zhen Li (zhen_li@brown.edu) ###
+### Division of Applied Mathematics, Brown University. ###
+### ###
+### mDPD system setup follows Fig.12 in the publication: ###
+### Z. Li, Y.-H. Tang, H. Lei, B. Caswell and G.E. Karniadakis. ###
+### "Energy-conserving dissipative particle dynamics with ###
+### temperature-dependent properties". J. Comput. Phys., ###
+### 2014, 265: 113-127. DOI: 10.1016/j.jcp.2014.02.003 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.2 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style edpd
+
+region edpd block -10 10 -10 10 -5 5 units box
+create_box 1 edpd
+Created orthogonal box = (-10 -10 -5) to (10 10 5)
+ 1 by 1 by 1 MPI processor grid
+create_atoms 1 random 16000 276438 NULL
+Created 16000 atoms
+mass 1 1.0
+set atom * edpd/temp 1.0
+ 16000 settings made for edpd/temp
+set atom * edpd/cv 1.0E5
+ 16000 settings made for edpd/cv
+
+pair_style edpd 1.58 9872598
+#pair_coeff 1 1 18.75 4.5 0.41 1.58 1.45E-5 2.0 1.58
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.41E-5 2.0 1.58 power 10.54 -3.66 3.44 -4.10 kappa -0.44 -3.21 5.04 0.00
+
+compute mythermo all temp
+thermo 100
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 432982 loop local dist gaussian
+
+fix mvv all mvv/edpd 0.5
+fix upper all edpd/source cuboid 0.0 5.0 0.0 20.0 10.0 10.0 0.01
+fix lower all edpd/source cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 -0.01
+
+timestep 0.01
+run 500
+Neighbor list info ...
+ update every 1 steps, delay 0 steps, check yes
+ max neighbors/atom: 2000, page size: 100000
+ master list distance cutoff = 1.78
+ ghost atom cutoff = 1.78
+ binsize = 0.89, bins = 23 23 12
+ 1 neighbor lists, perpetual/occasional/extra = 1 0 0
+ (1) pair edpd, perpetual
+ attributes: half, newton on
+ pair build: half/bin/atomonly/newton
+ stencil: half/bin/3d/newton
+ bin: standard
+Per MPI rank memory allocation (min/avg/max) = 11.64 | 11.64 | 11.64 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1 48.948932 0 50.448838 201.73366
+ 100 1.0069712 43.754293 0 45.264656 199.5369
+ 200 0.98667561 43.716052 0 45.195973 196.72854
+ 300 1.0036944 43.706299 0 45.211746 195.35714
+ 400 1.0024228 43.697014 0 45.200554 197.0062
+ 500 0.99968161 43.687445 0 45.186873 193.80596
+Loop time of 80.7995 on 1 procs for 500 steps with 16000 atoms
+
+Performance: 5346.567 tau/day, 6.188 timesteps/s
+99.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 75.106 | 75.106 | 75.106 | 0.0 | 92.95
+Neigh | 4.9836 | 4.9836 | 4.9836 | 0.0 | 6.17
+Comm | 0.31199 | 0.31199 | 0.31199 | 0.0 | 0.39
+Output | 0.00048232 | 0.00048232 | 0.00048232 | 0.0 | 0.00
+Modify | 0.29985 | 0.29985 | 0.29985 | 0.0 | 0.37
+Other | | 0.09751 | | | 0.12
+
+Nlocal: 16000 ave 16000 max 16000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost: 14091 ave 14091 max 14091 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs: 749111 ave 749111 max 749111 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 749111
+Ave neighs/atom = 46.8194
+Neighbor list builds = 181
+Dangerous builds = 0
+reset_timestep 0
+
+compute temp all edpd/temp/atom
+compute ccT all chunk/atom bin/1d y 0.0 1.0
+fix stat all ave/chunk 1 500 500 ccT c_temp density/number norm sample file temp.profile
+
+run 500
+Per MPI rank memory allocation (min/avg/max) = 12.14 | 12.14 | 12.14 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 0.99968161 43.687397 0 45.186825 196.38426
+ 100 1.0041443 43.668196 0 45.174318 195.38066
+ 200 0.99628392 43.666173 0 45.160505 197.84675
+ 300 1.0029116 43.66224 0 45.166513 199.67414
+ 400 0.99922193 43.64406 0 45.142799 196.94404
+ 500 0.99355431 43.623266 0 45.113505 195.94136
+Loop time of 80.7742 on 1 procs for 500 steps with 16000 atoms
+
+Performance: 5348.242 tau/day, 6.190 timesteps/s
+99.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 75.073 | 75.073 | 75.073 | 0.0 | 92.94
+Neigh | 4.8786 | 4.8786 | 4.8786 | 0.0 | 6.04
+Comm | 0.31086 | 0.31086 | 0.31086 | 0.0 | 0.38
+Output | 0.00045919 | 0.00045919 | 0.00045919 | 0.0 | 0.00
+Modify | 0.4139 | 0.4139 | 0.4139 | 0.0 | 0.51
+Other | | 0.09731 | | | 0.12
+
+Nlocal: 16000 ave 16000 max 16000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost: 14091 ave 14091 max 14091 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs: 749667 ave 749667 max 749667 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 749667
+Ave neighs/atom = 46.8542
+Neighbor list builds = 178
+Dangerous builds = 0
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:02:41
diff --git a/examples/USER/meso/edpd/log.16Aug17.edpd.g++.4 b/examples/USER/meso/edpd/log.16Aug17.edpd.g++.4
new file mode 100644
index 0000000000..21206d38c6
--- /dev/null
+++ b/examples/USER/meso/edpd/log.16Aug17.edpd.g++.4
@@ -0,0 +1,142 @@
+LAMMPS (11 Aug 2017)
+########################################################################
+### Heat conduction analog of periodic Poiseuille flow problem ###
+### using energy-conserving DPD (eDPD) simulation ###
+### ###
+### Created : Zhen Li (zhen_li@brown.edu) ###
+### Division of Applied Mathematics, Brown University. ###
+### ###
+### mDPD system setup follows Fig.12 in the publication: ###
+### Z. Li, Y.-H. Tang, H. Lei, B. Caswell and G.E. Karniadakis. ###
+### "Energy-conserving dissipative particle dynamics with ###
+### temperature-dependent properties". J. Comput. Phys., ###
+### 2014, 265: 113-127. DOI: 10.1016/j.jcp.2014.02.003 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.2 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style edpd
+
+region edpd block -10 10 -10 10 -5 5 units box
+create_box 1 edpd
+Created orthogonal box = (-10 -10 -5) to (10 10 5)
+ 2 by 2 by 1 MPI processor grid
+create_atoms 1 random 16000 276438 NULL
+Created 16000 atoms
+mass 1 1.0
+set atom * edpd/temp 1.0
+ 16000 settings made for edpd/temp
+set atom * edpd/cv 1.0E5
+ 16000 settings made for edpd/cv
+
+pair_style edpd 1.58 9872598
+#pair_coeff 1 1 18.75 4.5 0.41 1.58 1.45E-5 2.0 1.58
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.41E-5 2.0 1.58 power 10.54 -3.66 3.44 -4.10 kappa -0.44 -3.21 5.04 0.00
+
+compute mythermo all temp
+thermo 100
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 432982 loop local dist gaussian
+
+fix mvv all mvv/edpd 0.5
+fix upper all edpd/source cuboid 0.0 5.0 0.0 20.0 10.0 10.0 0.01
+fix lower all edpd/source cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 -0.01
+
+timestep 0.01
+run 500
+Neighbor list info ...
+ update every 1 steps, delay 0 steps, check yes
+ max neighbors/atom: 2000, page size: 100000
+ master list distance cutoff = 1.78
+ ghost atom cutoff = 1.78
+ binsize = 0.89, bins = 23 23 12
+ 1 neighbor lists, perpetual/occasional/extra = 1 0 0
+ (1) pair edpd, perpetual
+ attributes: half, newton on
+ pair build: half/bin/atomonly/newton
+ stencil: half/bin/3d/newton
+ bin: standard
+Per MPI rank memory allocation (min/avg/max) = 4.969 | 4.979 | 4.985 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1 48.948932 0 50.448838 199.51547
+ 100 1.0106415 43.744371 0 45.260239 196.39598
+ 200 1.0053215 43.714413 0 45.222301 195.35298
+ 300 0.99886399 43.713356 0 45.211559 196.74821
+ 400 1.0035264 43.699086 0 45.204282 195.47446
+ 500 1.0025285 43.698051 0 45.20175 197.27042
+Loop time of 21.165 on 4 procs for 500 steps with 16000 atoms
+
+Performance: 20411.046 tau/day, 23.624 timesteps/s
+99.9% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 18.713 | 19.101 | 19.41 | 6.0 | 90.25
+Neigh | 1.2687 | 1.2925 | 1.3177 | 1.5 | 6.11
+Comm | 0.33013 | 0.66337 | 1.0747 | 34.3 | 3.13
+Output | 0.00023484 | 0.00028092 | 0.00036526 | 0.0 | 0.00
+Modify | 0.073931 | 0.075277 | 0.076306 | 0.3 | 0.36
+Other | | 0.03227 | | | 0.15
+
+Nlocal: 4000 ave 4067 max 3930 min
+Histogram: 1 1 0 0 0 0 0 0 0 2
+Nghost: 5997.5 ave 6052 max 5943 min
+Histogram: 1 0 1 0 0 0 0 1 0 1
+Neighs: 187388 ave 193157 max 181221 min
+Histogram: 1 1 0 0 0 0 0 0 0 2
+
+Total # of neighbors = 749552
+Ave neighs/atom = 46.847
+Neighbor list builds = 181
+Dangerous builds = 0
+reset_timestep 0
+
+compute temp all edpd/temp/atom
+compute ccT all chunk/atom bin/1d y 0.0 1.0
+fix stat all ave/chunk 1 500 500 ccT c_temp density/number norm sample file temp.profile
+
+run 500
+Per MPI rank memory allocation (min/avg/max) = 5.221 | 5.23 | 5.236 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1.0025285 43.69801 0 45.201708 194.00452
+ 100 0.9885969 43.679927 0 45.16273 196.28442
+ 200 1.0028463 43.663067 0 45.167242 198.25592
+ 300 1.0027516 43.648817 0 45.152851 198.82226
+ 400 0.99695312 43.641469 0 45.136805 197.97499
+ 500 0.98202292 43.627163 0 45.100105 199.16319
+Loop time of 21.576 on 4 procs for 500 steps with 16000 atoms
+
+Performance: 20022.203 tau/day, 23.174 timesteps/s
+99.8% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 18.438 | 19.121 | 19.812 | 14.1 | 88.62
+Neigh | 1.2568 | 1.2885 | 1.325 | 2.5 | 5.97
+Comm | 0.29482 | 1.0219 | 1.7352 | 63.9 | 4.74
+Output | 0.00027728 | 0.00029719 | 0.0003531 | 0.0 | 0.00
+Modify | 0.11153 | 0.11265 | 0.1135 | 0.2 | 0.52
+Other | | 0.03194 | | | 0.15
+
+Nlocal: 4000 ave 4092 max 3899 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost: 5974 ave 6019 max 5915 min
+Histogram: 1 0 0 1 0 0 0 0 0 2
+Neighs: 187414 ave 196149 max 178418 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+
+Total # of neighbors = 749658
+Ave neighs/atom = 46.8536
+Neighbor list builds = 181
+Dangerous builds = 0
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:00:42
diff --git a/examples/USER/meso/edpd/temp.profile.16Aug17.edpd.g++.1 b/examples/USER/meso/edpd/temp.profile.16Aug17.edpd.g++.1
new file mode 100644
index 0000000000..469b550258
--- /dev/null
+++ b/examples/USER/meso/edpd/temp.profile.16Aug17.edpd.g++.1
@@ -0,0 +1,24 @@
+# Chunk-averaged data for fix stat and group density/number
+# Timestep Number-of-chunks Total-count
+# Chunk Coord1 Ncount c_temp density/number
+500 20 16000
+ 1 -9.5 801.636 0.986368 4.00818
+ 2 -8.5 809.788 0.966281 4.04894
+ 3 -7.5 819.754 0.952764 4.09877
+ 4 -6.5 820.364 0.944592 4.10182
+ 5 -5.5 826.146 0.940968 4.13073
+ 6 -4.5 819.52 0.941415 4.0976
+ 7 -3.5 815.182 0.945887 4.07591
+ 8 -2.5 817.168 0.95487 4.08584
+ 9 -1.5 817.282 0.969225 4.08641
+ 10 -0.5 804.204 0.989552 4.02102
+ 11 0.5 793.266 1.01015 3.96633
+ 12 1.5 789.056 1.0308 3.94528
+ 13 2.5 784.344 1.04568 3.92172
+ 14 3.5 780.592 1.05508 3.90296
+ 15 4.5 772.218 1.05968 3.86109
+ 16 5.5 776.968 1.06003 3.88484
+ 17 6.5 780.858 1.05612 3.90429
+ 18 7.5 786.174 1.04752 3.93087
+ 19 8.5 788.922 1.03347 3.94461
+ 20 9.5 796.558 1.01278 3.98279
diff --git a/examples/USER/meso/edpd/temp.profile.16Aug17.edpd.g++.4 b/examples/USER/meso/edpd/temp.profile.16Aug17.edpd.g++.4
new file mode 100644
index 0000000000..1c50a9b6e5
--- /dev/null
+++ b/examples/USER/meso/edpd/temp.profile.16Aug17.edpd.g++.4
@@ -0,0 +1,24 @@
+# Chunk-averaged data for fix stat and group density/number
+# Timestep Number-of-chunks Total-count
+# Chunk Coord1 Ncount c_temp density/number
+500 20 16000
+ 1 -9.5 801.642 0.986089 4.00821
+ 2 -8.5 819.168 0.966072 4.09584
+ 3 -7.5 817.382 0.952718 4.08691
+ 4 -6.5 818 0.944633 4.09
+ 5 -5.5 817.806 0.941105 4.08903
+ 6 -4.5 826.11 0.941499 4.13055
+ 7 -3.5 821.946 0.945922 4.10973
+ 8 -2.5 816.202 0.954889 4.08101
+ 9 -1.5 813.202 0.969281 4.06601
+ 10 -0.5 798.904 0.989463 3.99452
+ 11 0.5 798.056 1.01005 3.99028
+ 12 1.5 793.114 1.03073 3.96557
+ 13 2.5 782.812 1.04569 3.91406
+ 14 3.5 775.69 1.05498 3.87845
+ 15 4.5 778.094 1.05965 3.89047
+ 16 5.5 778.856 1.06002 3.89428
+ 17 6.5 780.51 1.05621 3.90255
+ 18 7.5 780.518 1.04782 3.90259
+ 19 8.5 789.698 1.03348 3.94849
+ 20 9.5 792.29 1.01261 3.96145
diff --git a/examples/USER/meso/mdpd/in.mdpd b/examples/USER/meso/mdpd/in.mdpd
new file mode 100644
index 0000000000..201b4a340e
--- /dev/null
+++ b/examples/USER/meso/mdpd/in.mdpd
@@ -0,0 +1,52 @@
+########################################################################
+#### 3D droplet oscillation using many-body DPD simulation ###
+#### ###
+#### Created : Zhen Li (zhen_li@brown.edu) ###
+#### Division of Applied Mathematics, Brown University. ###
+#### ###
+#### mDPD parameters follow the choice of the publication: ###
+#### Z. Li et al. "Three dimensional flow structures in a moving ###
+#### droplet on substrate: a dissipative particle dynamics study" ###
+#### Physics of Fluids, 2013, 25: 072103. DOI: 10.1063/1.4812366 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.3 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style mdpd
+
+region mdpd block -25 25 -10 10 -10 10 units box
+create_box 1 mdpd
+
+lattice fcc 6
+region film block -20 20 -7.5 7.5 -2.0 2.0 units box
+create_atoms 1 region film
+
+pair_style hybrid/overlay mdpd/rhosum mdpd 1.0 1.0 9872598
+pair_coeff 1 1 mdpd/rhosum 0.75
+pair_coeff 1 1 mdpd -40 25 18.0 1.0 0.75
+mass 1 1.0
+
+compute mythermo all temp
+thermo 100
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 38497 loop local dist gaussian
+
+fix mvv all mvv/dpd
+
+#dump mydump all atom 100 atom.lammpstrj
+
+#dump jpg all image 200 image.*.jpg type type zoom 5 adiam 0.5 &
+# view 90 90 box no 0 size 600 200
+#dump_modify jpg pad 4
+
+#dump avi all movie 200 movie.avi type type zoom 5 adiam 0.5 &
+# view 90 90 box no 0 size 600 200
+#dump_modify avi pad 4
+
+timestep 0.01
+run 4000
diff --git a/examples/USER/meso/mdpd/log.16Aug17.mdpd.g++.1 b/examples/USER/meso/mdpd/log.16Aug17.mdpd.g++.1
new file mode 100644
index 0000000000..c3c14da559
--- /dev/null
+++ b/examples/USER/meso/mdpd/log.16Aug17.mdpd.g++.1
@@ -0,0 +1,147 @@
+LAMMPS (11 Aug 2017)
+########################################################################
+#### 3D droplet oscilation using many-body DPD simulation ###
+#### ###
+#### Created : Zhen Li (zhen_li@brown.edu) ###
+#### Division of Applied Mathematics, Brown University. ###
+#### ###
+#### mDPD parameters follow the choice of the publication: ###
+#### Z. Li et al. "Three dimensional flow structures in a moving ###
+#### droplet on substrate: a dissipative particle dynamics study" ###
+#### Physics of Fluids, 2013, 25: 072103. DOI: 10.1063/1.4812366 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.3 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style mdpd
+
+region mdpd block -25 25 -10 10 -10 10 units box
+create_box 1 mdpd
+Created orthogonal box = (-25 -10 -10) to (25 10 10)
+ 1 by 1 by 1 MPI processor grid
+
+lattice fcc 6
+Lattice spacing in x,y,z = 0.87358 0.87358 0.87358
+region film block -20 20 -7.5 7.5 -2.0 2.0 units box
+create_atoms 1 region film
+Created 14333 atoms
+
+pair_style hybrid/overlay mdpd/rhosum mdpd 1.0 1.0 9872598
+pair_coeff 1 1 mdpd/rhosum 0.75
+pair_coeff 1 1 mdpd -40 25 18.0 1.0 0.75
+mass 1 1.0
+
+compute mythermo all temp
+thermo 100
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 38497 loop local dist gaussian
+
+fix mvv all mvv/dpd
+
+dump mydump all atom 100 atom.lammpstrj
+
+#dump jpg all image 200 image.*.jpg type type zoom 5 adiam 0.5 # view 90 90 box no 0 size 600 200
+#dump_modify jpg pad 4
+
+#dump avi all movie 200 movie.avi type type zoom 5 adiam 0.5 # view 90 90 box no 0 size 600 200
+#dump_modify avi pad 4
+
+timestep 0.01
+run 4000
+Neighbor list info ...
+ update every 1 steps, delay 0 steps, check yes
+ max neighbors/atom: 2000, page size: 100000
+ master list distance cutoff = 1.3
+ ghost atom cutoff = 1.3
+ binsize = 0.65, bins = 77 31 31
+ 2 neighbor lists, perpetual/occasional/extra = 2 0 0
+ (1) pair mdpd/rhosum, perpetual
+ attributes: full, newton on
+ pair build: full/bin/atomonly
+ stencil: full/bin/3d
+ bin: standard
+ (2) pair mdpd, perpetual, half/full from (1)
+ attributes: half, newton on
+ pair build: halffull/newton
+ stencil: none
+ bin: none
+Per MPI rank memory allocation (min/avg/max) = 9.931 | 9.931 | 9.931 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1 -13.346542 0 -11.846647 -6.8495478
+ 100 1.0321029 -7.2846779 0 -5.7366316 -0.77640205
+ 200 1.042287 -6.9534532 0 -5.3901317 -0.27750815
+ 300 1.0583027 -6.8483105 0 -5.2609672 -0.30347708
+ 400 1.0493719 -6.8648608 0 -5.2909127 -0.15312495
+ 500 1.0723786 -6.8341085 0 -5.2256528 0.017227511
+ 600 1.0545695 -6.8152957 0 -5.2335517 -0.024362439
+ 700 1.0507193 -6.8076033 0 -5.2316344 -0.07101536
+ 800 1.0531856 -6.9378568 0 -5.3581886 -0.053943939
+ 900 1.0442995 -6.8501126 0 -5.2837726 -0.13347942
+ 1000 1.0335049 -6.8883554 0 -5.3382062 -0.18420426
+ 1100 1.0287276 -6.8298226 0 -5.2868389 -0.12081558
+ 1200 1.0322527 -6.9462828 0 -5.3980117 -0.18047625
+ 1300 1.0599443 -6.9449975 0 -5.355192 -0.011763589
+ 1400 1.0560932 -6.845479 0 -5.2614498 0.032130055
+ 1500 1.0432786 -6.9035877 0 -5.338779 -0.10268662
+ 1600 1.064183 -6.9116836 0 -5.3155205 -0.060722129
+ 1700 1.0586249 -6.8768278 0 -5.2890013 0.037005566
+ 1800 1.0576064 -7.0060193 0 -5.4197204 -0.036211254
+ 1900 1.0595141 -6.838741 0 -5.2495807 -0.12395681
+ 2000 1.0650509 -6.897976 0 -5.3005111 0.003594807
+ 2100 1.0768273 -6.8874245 0 -5.2722962 0.033283489
+ 2200 1.0511606 -6.9823162 0 -5.4056854 0.015008427
+ 2300 1.0461138 -6.8820601 0 -5.3129988 0.064646933
+ 2400 1.0485369 -6.9437148 0 -5.3710191 -0.16534939
+ 2500 1.0507221 -6.9394786 0 -5.3635054 -0.098289859
+ 2600 1.0518352 -6.8947578 0 -5.3171152 -0.011666785
+ 2700 1.0402369 -6.9273377 0 -5.3670913 0.035267073
+ 2800 1.0426109 -6.912024 0 -5.3482168 0.049597305
+ 2900 1.0358928 -6.9574778 0 -5.4037471 -0.063216561
+ 3000 1.0351023 -6.9844192 0 -5.4318742 -0.10323465
+ 3100 1.0255005 -6.9382486 0 -5.4001052 -0.073954735
+ 3200 1.0150616 -6.9843183 0 -5.4618321 -0.095136405
+ 3300 1.0118112 -6.9522082 0 -5.4345973 -0.12686179
+ 3400 1.0071522 -6.970158 0 -5.4595351 -0.012487475
+ 3500 1.0041758 -6.9773019 0 -5.4711433 -0.098027653
+ 3600 1.0189298 -6.9393039 0 -5.4110158 0.061631719
+ 3700 1.012442 -6.9341423 0 -5.4155852 0.10442772
+ 3800 1.0021246 -6.9594374 0 -5.4563553 -0.081535223
+ 3900 1.0165002 -6.9045321 0 -5.3798882 -0.0088283303
+ 4000 1.0077099 -6.9145511 0 -5.4030918 0.048349691
+Loop time of 135.409 on 1 procs for 4000 steps with 14333 atoms
+
+Performance: 25522.736 tau/day, 29.540 timesteps/s
+99.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 93.074 | 93.074 | 93.074 | 0.0 | 68.74
+Neigh | 40.192 | 40.192 | 40.192 | 0.0 | 29.68
+Comm | 0.19625 | 0.19625 | 0.19625 | 0.0 | 0.14
+Output | 0.41756 | 0.41756 | 0.41756 | 0.0 | 0.31
+Modify | 1.0706 | 1.0706 | 1.0706 | 0.0 | 0.79
+Other | | 0.4581 | | | 0.34
+
+Nlocal: 14333 ave 14333 max 14333 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost: 11 ave 11 max 11 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs: 401803 ave 401803 max 401803 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+FullNghs: 803606 ave 803606 max 803606 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 803606
+Ave neighs/atom = 56.0668
+Neighbor list builds = 1050
+Dangerous builds = 0
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:02:15
diff --git a/examples/USER/meso/mdpd/log.16Aug17.mdpd.g++.4 b/examples/USER/meso/mdpd/log.16Aug17.mdpd.g++.4
new file mode 100644
index 0000000000..ec3d8fbddc
--- /dev/null
+++ b/examples/USER/meso/mdpd/log.16Aug17.mdpd.g++.4
@@ -0,0 +1,147 @@
+LAMMPS (11 Aug 2017)
+########################################################################
+#### 3D droplet oscilation using many-body DPD simulation ###
+#### ###
+#### Created : Zhen Li (zhen_li@brown.edu) ###
+#### Division of Applied Mathematics, Brown University. ###
+#### ###
+#### mDPD parameters follow the choice of the publication: ###
+#### Z. Li et al. "Three dimensional flow structures in a moving ###
+#### droplet on substrate: a dissipative particle dynamics study" ###
+#### Physics of Fluids, 2013, 25: 072103. DOI: 10.1063/1.4812366 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.3 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style mdpd
+
+region mdpd block -25 25 -10 10 -10 10 units box
+create_box 1 mdpd
+Created orthogonal box = (-25 -10 -10) to (25 10 10)
+ 4 by 1 by 1 MPI processor grid
+
+lattice fcc 6
+Lattice spacing in x,y,z = 0.87358 0.87358 0.87358
+region film block -20 20 -7.5 7.5 -2.0 2.0 units box
+create_atoms 1 region film
+Created 14333 atoms
+
+pair_style hybrid/overlay mdpd/rhosum mdpd 1.0 1.0 9872598
+pair_coeff 1 1 mdpd/rhosum 0.75
+pair_coeff 1 1 mdpd -40 25 18.0 1.0 0.75
+mass 1 1.0
+
+compute mythermo all temp
+thermo 100
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 38497 loop local dist gaussian
+
+fix mvv all mvv/dpd
+
+dump mydump all atom 100 atom.lammpstrj
+
+#dump jpg all image 200 image.*.jpg type type zoom 5 adiam 0.5 # view 90 90 box no 0 size 600 200
+#dump_modify jpg pad 4
+
+#dump avi all movie 200 movie.avi type type zoom 5 adiam 0.5 # view 90 90 box no 0 size 600 200
+#dump_modify avi pad 4
+
+timestep 0.01
+run 4000
+Neighbor list info ...
+ update every 1 steps, delay 0 steps, check yes
+ max neighbors/atom: 2000, page size: 100000
+ master list distance cutoff = 1.3
+ ghost atom cutoff = 1.3
+ binsize = 0.65, bins = 77 31 31
+ 2 neighbor lists, perpetual/occasional/extra = 2 0 0
+ (1) pair mdpd/rhosum, perpetual
+ attributes: full, newton on
+ pair build: full/bin/atomonly
+ stencil: full/bin/3d
+ bin: standard
+ (2) pair mdpd, perpetual, half/full from (1)
+ attributes: half, newton on
+ pair build: halffull/newton
+ stencil: none
+ bin: none
+Per MPI rank memory allocation (min/avg/max) = 6.265 | 6.655 | 7.045 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1 -13.346542 0 -11.846647 -6.9757225
+ 100 1.0406108 -7.2500697 0 -5.6892624 -0.80306477
+ 200 1.0535506 -6.9452928 0 -5.3650772 -0.39911584
+ 300 1.0644295 -6.8599907 0 -5.2634577 -0.2997968
+ 400 1.0780123 -6.9471342 0 -5.3302286 -0.06274869
+ 500 1.0672153 -6.8269984 0 -5.2262872 0.021251762
+ 600 1.0634304 -6.8366569 0 -5.2416226 -0.021863333
+ 700 1.0544807 -6.8272074 0 -5.2455967 -0.0064688066
+ 800 1.0556172 -6.8859788 0 -5.3026634 0.023983333
+ 900 1.0436201 -6.9246523 0 -5.3593313 -0.12409618
+ 1000 1.0617016 -6.8632331 0 -5.2707919 -0.1145505
+ 1100 1.0323831 -6.951554 0 -5.4030874 -0.030031884
+ 1200 1.0407785 -6.931048 0 -5.3699892 -0.018362136
+ 1300 1.0380953 -6.8785296 0 -5.3214953 -0.099308737
+ 1400 1.0418898 -6.8998 0 -5.3370743 -0.14199421
+ 1500 1.0487254 -6.9671212 0 -5.3941429 -0.12132644
+ 1600 1.0561042 -6.8948881 0 -5.3108424 -0.09627292
+ 1700 1.0524479 -6.9531441 0 -5.3745823 -0.11959782
+ 1800 1.0541197 -6.9219819 0 -5.3409126 0.032964029
+ 1900 1.0531221 -6.8805815 0 -5.3010085 0.030124685
+ 2000 1.0531819 -6.8612868 0 -5.2816242 -0.076876781
+ 2100 1.0757791 -6.919875 0 -5.3063189 -0.04060439
+ 2200 1.069423 -6.9005754 0 -5.2965527 0.015347467
+ 2300 1.0403109 -6.9015402 0 -5.3411827 0.0034687897
+ 2400 1.0547448 -6.9325539 0 -5.3505471 -0.021202325
+ 2500 1.0404195 -6.8494675 0 -5.2889472 0.086947847
+ 2600 1.0499828 -6.9861392 0 -5.4112749 -0.018079308
+ 2700 1.0294278 -6.8525151 0 -5.3084811 0.16911472
+ 2800 1.0220652 -6.8993978 0 -5.366407 0.064820531
+ 2900 1.0347904 -6.9322703 0 -5.3801929 -0.11384964
+ 3000 1.0391372 -6.9519088 0 -5.3933117 0.003050577
+ 3100 1.0335828 -7.0090074 0 -5.4587413 -0.17366664
+ 3200 1.0211896 -6.9421289 0 -5.4104513 0.025299853
+ 3300 1.0019232 -6.9426488 0 -5.4398688 -0.098334724
+ 3400 1.0203541 -6.9310981 0 -5.4006737 -0.0015544982
+ 3500 1.0076794 -6.9519932 0 -5.4405796 -0.056956902
+ 3600 1.0086525 -6.9620979 0 -5.4492247 0.020014884
+ 3700 1.0046112 -7.0011625 0 -5.4943508 -0.083936527
+ 3800 1.0096867 -6.9470382 0 -5.4326138 -0.089521759
+ 3900 1.0074482 -6.9959414 0 -5.4848745 -0.11873698
+ 4000 1.01222 -6.9535694 0 -5.4353454 0.042191466
+Loop time of 63.0327 on 4 procs for 4000 steps with 14333 atoms
+
+Performance: 54828.695 tau/day, 63.459 timesteps/s
+98.8% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 16.591 | 29.795 | 42.814 | 236.6 | 47.27
+Neigh | 2.0347 | 10.239 | 18.555 | 255.6 | 16.24
+Comm | 0.70099 | 6.0601 | 11.386 | 207.4 | 9.61
+Output | 0.20713 | 0.40902 | 0.61087 | 31.5 | 0.65
+Modify | 0.058089 | 0.27033 | 0.4851 | 40.7 | 0.43
+Other | | 16.26 | | | 25.79
+
+Nlocal: 3583.25 ave 7207 max 0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Nghost: 1055.75 ave 2131 max 0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+Neighs: 100549 ave 202192 max 0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+FullNghs: 201098 ave 404372 max 0 min
+Histogram: 2 0 0 0 0 0 0 0 0 2
+
+Total # of neighbors = 804390
+Ave neighs/atom = 56.1215
+Neighbor list builds = 1049
+Dangerous builds = 0
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:01:03
diff --git a/examples/USER/meso/tdpd/cc.profile.16Aug17.tdpd.g++.1 b/examples/USER/meso/tdpd/cc.profile.16Aug17.tdpd.g++.1
new file mode 100644
index 0000000000..a872600504
--- /dev/null
+++ b/examples/USER/meso/tdpd/cc.profile.16Aug17.tdpd.g++.1
@@ -0,0 +1,24 @@
+# Chunk-averaged data for fix stat and group c_cc2
+# Timestep Number-of-chunks Total-count
+# Chunk Coord1 Ncount c_cc1 c_cc2
+100 20 16000
+ 1 -9.5 797.17 0.986661 1.0077
+ 2 -8.5 802.61 0.967974 1.02003
+ 3 -7.5 795.46 0.957045 1.02873
+ 4 -6.5 806.46 0.951271 1.03428
+ 5 -5.5 802.34 0.94898 1.03692
+ 6 -4.5 799.84 0.949378 1.03673
+ 7 -3.5 798.4 0.952505 1.03374
+ 8 -2.5 800.36 0.959322 1.02778
+ 9 -1.5 797.65 0.971516 1.01867
+ 10 -0.5 808.88 0.990644 1.00626
+ 11 0.5 786.29 1.00924 0.993828
+ 12 1.5 807.16 1.02831 0.981436
+ 13 2.5 797.54 1.04071 0.972184
+ 14 3.5 799.67 1.04749 0.966258
+ 15 4.5 799.61 1.05063 0.963256
+ 16 5.5 806.11 1.05105 0.963052
+ 17 6.5 803.67 1.04877 0.965688
+ 18 7.5 797.39 1.04305 0.971187
+ 19 8.5 801.85 1.03208 0.97993
+ 20 9.5 791.54 1.01351 0.992209
diff --git a/examples/USER/meso/tdpd/cc.profile.16Aug17.tdpd.g++.4 b/examples/USER/meso/tdpd/cc.profile.16Aug17.tdpd.g++.4
new file mode 100644
index 0000000000..de34ef26c3
--- /dev/null
+++ b/examples/USER/meso/tdpd/cc.profile.16Aug17.tdpd.g++.4
@@ -0,0 +1,24 @@
+# Chunk-averaged data for fix stat and group c_cc2
+# Timestep Number-of-chunks Total-count
+# Chunk Coord1 Ncount c_cc1 c_cc2
+100 20 16000
+ 1 -9.5 806.92 0.986675 1.00766
+ 2 -8.5 798.01 0.96792 1.02003
+ 3 -7.5 805.43 0.956909 1.02883
+ 4 -6.5 800.54 0.951207 1.03432
+ 5 -5.5 794.14 0.948967 1.03691
+ 6 -4.5 799.75 0.949379 1.03672
+ 7 -3.5 799.65 0.952492 1.03374
+ 8 -2.5 799.94 0.959331 1.02778
+ 9 -1.5 800.96 0.971664 1.01861
+ 10 -0.5 803.97 0.99074 1.00622
+ 11 0.5 800.66 1.00949 0.993673
+ 12 1.5 779.22 1.02824 0.981461
+ 13 2.5 809.13 1.04056 0.972274
+ 14 3.5 805.23 1.04747 0.966272
+ 15 4.5 795.95 1.05061 0.96327
+ 16 5.5 796.4 1.05105 0.963035
+ 17 6.5 806.1 1.04883 0.965621
+ 18 7.5 806.41 1.04305 0.971224
+ 19 8.5 792.2 1.03211 0.979955
+ 20 9.5 799.39 1.01362 0.992156
diff --git a/examples/USER/meso/tdpd/in.tdpd b/examples/USER/meso/tdpd/in.tdpd
new file mode 100644
index 0000000000..748a4f5077
--- /dev/null
+++ b/examples/USER/meso/tdpd/in.tdpd
@@ -0,0 +1,54 @@
+########################################################################
+### Pure diffusion with a reaction source term analog of a periodic ###
+### Poiseuille flow problem using transport DPD (tDPD) simulation ###
+### ###
+### Created : Zhen Li (zhen_li@brown.edu) ###
+### Division of Applied Mathematics, Brown University. ###
+### ###
+### tDPD system setup follows Fig.1 in the publication: ###
+### Z. Li, A. Yazdani, A. Tartakovsky and G.E. Karniadakis. ###
+### "Transport dissipative particle dynamics model for mesoscopic ###
+### advection-diffusion-reaction problems". J. Chem. Phys., ###
+### 2015, 143: 014101. DOI: 10.1063/1.4923254 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.2 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style tdpd 2
+
+region tdpd block -10 10 -10 10 -5 5 units box
+create_box 1 tdpd
+create_atoms 1 random 16000 276438 NULL
+mass 1 1.0
+set atom * cc 1 1.0
+set atom * cc 2 1.0
+
+pair_style tdpd 1.0 1.58 9872598
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.58 1.0 1.0E-5 2.0 3.0 1.0E-5 2.0
+
+compute mythermo all temp
+thermo 50
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 432982 loop local dist gaussian
+
+fix mvv all mvv/tdpd 0.5
+fix upper1 all tdpd/source 1 cuboid 0.0 5.0 0.0 20.0 10.0 10.0 0.01
+fix lower1 all tdpd/source 1 cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 -0.01
+fix upper2 all tdpd/source 2 cuboid 0.0 5.0 0.0 20.0 10.0 10.0 -0.01
+fix lower2 all tdpd/source 2 cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 0.01
+
+timestep 0.01
+run 500
+reset_timestep 0
+
+compute cc1 all tdpd/cc/atom 1
+compute cc2 all tdpd/cc/atom 2
+compute bin all chunk/atom bin/1d y 0.0 1.0
+fix stat all ave/chunk 1 100 100 bin c_cc1 c_cc2 norm sample file cc.profile
+
+run 100
diff --git a/examples/USER/meso/tdpd/log.16Aug17.tdpd.g++.1 b/examples/USER/meso/tdpd/log.16Aug17.tdpd.g++.1
new file mode 100644
index 0000000000..21b618148c
--- /dev/null
+++ b/examples/USER/meso/tdpd/log.16Aug17.tdpd.g++.1
@@ -0,0 +1,146 @@
+LAMMPS (11 Aug 2017)
+########################################################################
+### Pure diffusion with a reaction source term analog of a periodic ###
+### Poiseuille flow problem using transport DPD (tDPD) simulation ###
+### ###
+### Created : Zhen Li (zhen_li@brown.edu) ###
+### Division of Applied Mathematics, Brown University. ###
+### ###
+### tDPD system setup follows Fig.1 in the publication: ###
+### Z. Li, A. Yazdani, A. Tartakovsky and G.E. Karniadakis. ###
+### "Transport dissipative particle dynamics model for mesoscopic ###
+### advection-diffusion-reaction problems. J. Chem. Phys., ###
+### 2015, 143: 014101. DOI: 10.1063/1.4923254 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.2 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style tdpd 2
+
+region tdpd block -10 10 -10 10 -5 5 units box
+create_box 1 tdpd
+Created orthogonal box = (-10 -10 -5) to (10 10 5)
+ 1 by 1 by 1 MPI processor grid
+create_atoms 1 random 16000 276438 NULL
+Created 16000 atoms
+mass 1 1.0
+set atom * cc 1 1.0
+ 16000 settings made for cc index 1
+set atom * cc 2 1.0
+ 16000 settings made for cc index 2
+
+pair_style tdpd 1.0 1.58 9872598
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.58 1.0 1.0E-5 2.0 3.0 1.0E-5 2.0
+
+compute mythermo all temp
+thermo 50
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 432982 loop local dist gaussian
+
+fix mvv all mvv/tdpd 0.5
+fix upper1 all tdpd/source 1 cuboid 0.0 5.0 0.0 20.0 10.0 10.0 0.01
+fix lower1 all tdpd/source 1 cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 -0.01
+fix upper2 all tdpd/source 2 cuboid 0.0 5.0 0.0 20.0 10.0 10.0 -0.01
+fix lower2 all tdpd/source 2 cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 0.01
+
+timestep 0.01
+run 500
+Neighbor list info ...
+ update every 1 steps, delay 0 steps, check yes
+ max neighbors/atom: 2000, page size: 100000
+ master list distance cutoff = 1.78
+ ghost atom cutoff = 1.78
+ binsize = 0.89, bins = 23 23 12
+ 1 neighbor lists, perpetual/occasional/extra = 1 0 0
+ (1) pair tdpd, perpetual
+ attributes: half, newton on
+ pair build: half/bin/atomonly/newton
+ stencil: half/bin/3d/newton
+ bin: standard
+Per MPI rank memory allocation (min/avg/max) = 11.3 | 11.3 | 11.3 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1 48.948932 0 50.448838 202.19166
+ 50 0.99837766 43.949877 0 45.447349 195.80936
+ 100 0.99846831 43.756995 0 45.254604 198.22348
+ 150 1.0026903 43.72408 0 45.228021 196.61676
+ 200 1.0063144 43.722388 0 45.231765 194.17954
+ 250 1.0032304 43.721864 0 45.226615 197.85829
+ 300 0.9932656 43.703526 0 45.193331 196.57406
+ 350 1.0002916 43.720498 0 45.220841 193.55346
+ 400 0.99475486 43.722965 0 45.215004 196.81546
+ 450 1.0011803 43.712447 0 45.214124 200.46118
+ 500 1.0009006 43.708984 0 45.210241 197.38953
+Loop time of 96.0326 on 1 procs for 500 steps with 16000 atoms
+
+Performance: 4498.474 tau/day, 5.207 timesteps/s
+99.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 90.083 | 90.083 | 90.083 | 0.0 | 93.80
+Neigh | 5.049 | 5.049 | 5.049 | 0.0 | 5.26
+Comm | 0.34141 | 0.34141 | 0.34141 | 0.0 | 0.36
+Output | 0.00092816 | 0.00092816 | 0.00092816 | 0.0 | 0.00
+Modify | 0.45991 | 0.45991 | 0.45991 | 0.0 | 0.48
+Other | | 0.09865 | | | 0.10
+
+Nlocal: 16000 ave 16000 max 16000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost: 14091 ave 14091 max 14091 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs: 749379 ave 749379 max 749379 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 749379
+Ave neighs/atom = 46.8362
+Neighbor list builds = 183
+Dangerous builds = 0
+reset_timestep 0
+
+compute cc1 all tdpd/cc/atom 1
+compute cc2 all tdpd/cc/atom 2
+compute bin all chunk/atom bin/1d y 0.0 1.0
+fix stat all ave/chunk 1 100 100 bin c_cc1 c_cc2 norm sample file cc.profile
+
+run 100
+Per MPI rank memory allocation (min/avg/max) = 11.8 | 11.8 | 11.8 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1.0009006 43.708984 0 45.210241 199.3205
+ 50 1.0007276 43.704844 0 45.205842 197.77053
+ 100 1.0039032 43.714201 0 45.219961 197.31118
+Loop time of 19.0326 on 1 procs for 100 steps with 16000 atoms
+
+Performance: 4539.577 tau/day, 5.254 timesteps/s
+99.9% CPU use with 1 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 17.842 | 17.842 | 17.842 | 0.0 | 93.74
+Neigh | 0.98674 | 0.98674 | 0.98674 | 0.0 | 5.18
+Comm | 0.066013 | 0.066013 | 0.066013 | 0.0 | 0.35
+Output | 0.00016284 | 0.00016284 | 0.00016284 | 0.0 | 0.00
+Modify | 0.11795 | 0.11795 | 0.11795 | 0.0 | 0.62
+Other | | 0.02012 | | | 0.11
+
+Nlocal: 16000 ave 16000 max 16000 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Nghost: 14126 ave 14126 max 14126 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+Neighs: 748927 ave 748927 max 748927 min
+Histogram: 1 0 0 0 0 0 0 0 0 0
+
+Total # of neighbors = 748927
+Ave neighs/atom = 46.8079
+Neighbor list builds = 37
+Dangerous builds = 0
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:01:55
diff --git a/examples/USER/meso/tdpd/log.16Aug17.tdpd.g++.4 b/examples/USER/meso/tdpd/log.16Aug17.tdpd.g++.4
new file mode 100644
index 0000000000..6cd99168f4
--- /dev/null
+++ b/examples/USER/meso/tdpd/log.16Aug17.tdpd.g++.4
@@ -0,0 +1,146 @@
+LAMMPS (11 Aug 2017)
+########################################################################
+### Pure diffusion with a reaction source term analog of a periodic ###
+### Poiseuille flow problem using transport DPD (tDPD) simulation ###
+### ###
+### Created : Zhen Li (zhen_li@brown.edu) ###
+### Division of Applied Mathematics, Brown University. ###
+### ###
+### tDPD system setup follows Fig.1 in the publication: ###
+### Z. Li, A. Yazdani, A. Tartakovsky and G.E. Karniadakis. ###
+### "Transport dissipative particle dynamics model for mesoscopic ###
+### advection-diffusion-reaction problems. J. Chem. Phys., ###
+### 2015, 143: 014101. DOI: 10.1063/1.4923254 ###
+########################################################################
+units lj
+dimension 3
+boundary p p p
+neighbor 0.2 bin
+neigh_modify every 1 delay 0 check yes
+
+atom_style tdpd 2
+
+region tdpd block -10 10 -10 10 -5 5 units box
+create_box 1 tdpd
+Created orthogonal box = (-10 -10 -5) to (10 10 5)
+ 2 by 2 by 1 MPI processor grid
+create_atoms 1 random 16000 276438 NULL
+Created 16000 atoms
+mass 1 1.0
+set atom * cc 1 1.0
+ 16000 settings made for cc index 1
+set atom * cc 2 1.0
+ 16000 settings made for cc index 2
+
+pair_style tdpd 1.0 1.58 9872598
+pair_coeff 1 1 18.75 4.5 0.41 1.58 1.58 1.0 1.0E-5 2.0 3.0 1.0E-5 2.0
+
+compute mythermo all temp
+thermo 50
+thermo_modify temp mythermo
+thermo_modify flush yes
+
+velocity all create 1.0 432982 loop local dist gaussian
+
+fix mvv all mvv/tdpd 0.5
+fix upper1 all tdpd/source 1 cuboid 0.0 5.0 0.0 20.0 10.0 10.0 0.01
+fix lower1 all tdpd/source 1 cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 -0.01
+fix upper2 all tdpd/source 2 cuboid 0.0 5.0 0.0 20.0 10.0 10.0 -0.01
+fix lower2 all tdpd/source 2 cuboid 0.0 -5.0 0.0 20.0 10.0 10.0 0.01
+
+timestep 0.01
+run 500
+Neighbor list info ...
+ update every 1 steps, delay 0 steps, check yes
+ max neighbors/atom: 2000, page size: 100000
+ master list distance cutoff = 1.78
+ ghost atom cutoff = 1.78
+ binsize = 0.89, bins = 23 23 12
+ 1 neighbor lists, perpetual/occasional/extra = 1 0 0
+ (1) pair tdpd, perpetual
+ attributes: half, newton on
+ pair build: half/bin/atomonly/newton
+ stencil: half/bin/3d/newton
+ bin: standard
+Per MPI rank memory allocation (min/avg/max) = 4.814 | 4.823 | 4.829 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1 48.948932 0 50.448838 199.65978
+ 50 1.0153476 43.948796 0 45.471722 198.3346
+ 100 1.0064284 43.754875 0 45.264424 197.5308
+ 150 0.99609985 43.726751 0 45.220807 197.50623
+ 200 1.0016604 43.720283 0 45.22268 197.81129
+ 250 1.0054979 43.718568 0 45.22672 195.79405
+ 300 0.9997618 43.716617 0 45.216166 197.84788
+ 350 0.99170101 43.72093 0 45.208389 196.07711
+ 400 1.0043692 43.71648 0 45.22294 199.55247
+ 450 1.0086263 43.709988 0 45.222833 198.20516
+ 500 1.0029076 43.717879 0 45.222146 197.26281
+Loop time of 24.5533 on 4 procs for 500 steps with 16000 atoms
+
+Performance: 17594.412 tau/day, 20.364 timesteps/s
+99.9% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 22.236 | 22.418 | 22.736 | 4.0 | 91.30
+Neigh | 1.2759 | 1.2883 | 1.3077 | 1.1 | 5.25
+Comm | 0.35749 | 0.69526 | 0.88462 | 24.1 | 2.83
+Output | 0.00043321 | 0.00050318 | 0.00070691 | 0.0 | 0.00
+Modify | 0.11555 | 0.11648 | 0.11888 | 0.4 | 0.47
+Other | | 0.03473 | | | 0.14
+
+Nlocal: 4000 ave 4012 max 3982 min
+Histogram: 1 0 0 0 0 1 0 0 0 2
+Nghost: 5986.25 ave 6016 max 5956 min
+Histogram: 1 0 0 0 1 0 1 0 0 1
+Neighs: 187309 ave 188264 max 186087 min
+Histogram: 1 0 0 0 1 0 0 1 0 1
+
+Total # of neighbors = 749235
+Ave neighs/atom = 46.8272
+Neighbor list builds = 180
+Dangerous builds = 0
+reset_timestep 0
+
+compute cc1 all tdpd/cc/atom 1
+compute cc2 all tdpd/cc/atom 2
+compute bin all chunk/atom bin/1d y 0.0 1.0
+fix stat all ave/chunk 1 100 100 bin c_cc1 c_cc2 norm sample file cc.profile
+
+run 100
+Per MPI rank memory allocation (min/avg/max) = 5.065 | 5.074 | 5.082 Mbytes
+Step Temp E_pair E_mol TotEng Press
+ 0 1.0029076 43.717879 0 45.222146 198.45789
+ 50 1.0077982 43.713264 0 45.224867 196.56183
+ 100 1.0036823 43.708022 0 45.213451 196.00815
+Loop time of 4.79577 on 4 procs for 100 steps with 16000 atoms
+
+Performance: 18015.870 tau/day, 20.852 timesteps/s
+99.9% CPU use with 4 MPI tasks x no OpenMP threads
+
+MPI task timing breakdown:
+Section | min time | avg time | max time |%varavg| %total
+---------------------------------------------------------------
+Pair | 4.3481 | 4.39 | 4.4398 | 1.7 | 91.54
+Neigh | 0.25477 | 0.25675 | 0.25963 | 0.4 | 5.35
+Comm | 0.059327 | 0.11194 | 0.15608 | 11.0 | 2.33
+Output | 0.00011206 | 0.00011748 | 0.00011992 | 0.0 | 0.00
+Modify | 0.030417 | 0.030622 | 0.030739 | 0.1 | 0.64
+Other | | 0.006301 | | | 0.13
+
+Nlocal: 4000 ave 4010 max 3987 min
+Histogram: 1 0 0 0 0 1 1 0 0 1
+Nghost: 5985.25 ave 6025 max 5959 min
+Histogram: 2 0 0 0 0 1 0 0 0 1
+Neighs: 187304 ave 188092 max 186449 min
+Histogram: 1 0 0 0 0 2 0 0 0 1
+
+Total # of neighbors = 749216
+Ave neighs/atom = 46.826
+Neighbor list builds = 38
+Dangerous builds = 0
+
+Please see the log.cite file for references relevant to this simulation
+
+Total wall time: 0:00:29
diff --git a/lib/gpu/Install.py b/lib/gpu/Install.py
index 6ea2159de5..13d7ad157e 100644
--- a/lib/gpu/Install.py
+++ b/lib/gpu/Install.py
@@ -9,8 +9,8 @@ import sys,os,subprocess
# help message
help = """
-Syntax from src dir: make lib-gpu args="-m machine -h hdir -a arch -p precision -e esuffix -m -o osuffix"
-Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -e esuffix -m -o osuffix
+Syntax from src dir: make lib-gpu args="-m machine -h hdir -a arch -p precision -e esuffix -b -o osuffix"
+Syntax from lib dir: python Install.py -m machine -h hdir -a arch -p precision -e esuffix -b -o osuffix
specify one or more options, order does not matter
diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md
index 3fe9e46111..43d3f17d63 100644
--- a/lib/kokkos/CHANGELOG.md
+++ b/lib/kokkos/CHANGELOG.md
@@ -1,5 +1,22 @@
# Change Log
+## [2.04.00](https://github.com/kokkos/kokkos/tree/2.04.00) (2017-08-16)
+[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.13...2.04.00)
+
+**Implemented enhancements:**
+
+- Added ROCm backend to support AMD GPUs
+- Kokkos::complex\<T\> behaves slightly differently from std::complex\<T\> [\#1011](https://github.com/kokkos/kokkos/issues/1011)
+- Kokkos::Experimental::Crs constructor arguments were in the wrong order [\#992](https://github.com/kokkos/kokkos/issues/992)
+- Work graph construction ease-of-use (one lambda for count and fill) [\#991](https://github.com/kokkos/kokkos/issues/991)
+- when\_all returns pointer of futures (improved interface) [\#990](https://github.com/kokkos/kokkos/issues/990)
+- Allow assignment of LayoutLeft to LayoutRight or vice versa for rank-0 Views [\#594](https://github.com/kokkos/kokkos/issues/594)
+- Changed the meaning of Kokkos\_ENABLE\_CXX11\_DISPATCH\_LAMBDA [\#1035](https://github.com/kokkos/kokkos/issues/1035)
+
+**Fixed bugs:**
+
+- memory pool default constructor does not properly set member variables. [\#1007](https://github.com/kokkos/kokkos/issues/1007)
+
## [2.03.13](https://github.com/kokkos/kokkos/tree/2.03.13) (2017-07-27)
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.03.05...2.03.13)
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
index d2967cf9a3..b8236e8fd1 100644
--- a/lib/kokkos/Makefile.kokkos
+++ b/lib/kokkos/Makefile.kokkos
@@ -4,10 +4,16 @@
KOKKOS_PATH=../../lib/kokkos
CXXFLAGS=$(CCFLAGS)
-# Options: Cuda,OpenMP,Pthreads,Qthreads,Serial
+# Options: Cuda,ROCm,OpenMP,Pthreads,Qthreads,Serial
KOKKOS_DEVICES ?= "OpenMP"
#KOKKOS_DEVICES ?= "Pthreads"
-# Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
+# Options:
+# Intel: KNC,KNL,SNB,HSW,BDW,SKX
+# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61
+# ARM: ARMv80,ARMv81,ARMv8-ThunderX
+# IBM: BGQ,Power7,Power8,Power9
+# AMD-GPUS: Kaveri,Carrizo,Fiji,Vega
+# AMD-CPUS: AMDAVX,Ryzen,Epyc
KOKKOS_ARCH ?= ""
# Options: yes,no
KOKKOS_DEBUG ?= "no"
@@ -43,8 +49,8 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | gr
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
+
# Check for Kokkos Host Execution Spaces one of which must be on.
-KOKKOS_INTERNAL_USE_OPENMPTARGET := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMPTarget | wc -l))
KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(subst OpenMPTarget,,$(KOKKOS_DEVICES)) | grep OpenMP | wc -l))
KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l))
@@ -60,6 +66,8 @@ endif
# Check for other Execution Spaces.
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
+KOKKOS_INTERNAL_USE_ROCM := $(strip $(shell echo $(KOKKOS_DEVICES) | grep ROCm | wc -l))
+KOKKOS_INTERNAL_USE_OPENMPTARGET := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMPTarget | wc -l))
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc)
@@ -87,6 +95,7 @@ ifneq ($(MPICH_CXX),)
endif
KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l))
KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l))
+KOKKOS_INTERNAL_COMPILER_HCC := $(strip $(shell $(CXX) --version 2>&1 | grep HCC | wc -l))
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2)
KOKKOS_INTERNAL_COMPILER_CLANG = 1
@@ -99,6 +108,10 @@ endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_APPLE_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CLANG = 0
endif
+# AMD HCC's --version output matches both the clang and the HCC checks above,
+# so clear the clang flag; otherwise the clang-specific logic below would also
+# run for HCC.
+ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
+ KOKKOS_INTERNAL_COMPILER_CLANG = 0
+endif
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
@@ -183,8 +196,12 @@ else
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
KOKKOS_INTERNAL_CXX11_FLAG := -hstd=c++11
else
- KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
- KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
+ ifeq ($(KOKKOS_INTERNAL_COMPILER_HCC), 1)
+ KOKKOS_INTERNAL_CXX11_FLAG :=
+ else
+ KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
+ KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z
+ endif
endif
endif
endif
@@ -259,6 +276,13 @@ KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_
# AMD based.
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(strip $(shell echo $(KOKKOS_ARCH) | grep Ryzen | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_EPYC := $(strip $(shell echo $(KOKKOS_ARCH) | grep Epyc | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kaveri | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(strip $(shell echo $(KOKKOS_ARCH) | grep Carrizo | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_FIJI := $(strip $(shell echo $(KOKKOS_ARCH) | grep Fiji | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_VEGA := $(strip $(shell echo $(KOKKOS_ARCH) | grep Vega | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(strip $(shell echo $(KOKKOS_ARCH) | grep gfx901 | wc -l))
# Any AVX?
KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc ))
@@ -271,6 +295,7 @@ KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_US
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
+KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER7) | bc ))
# Decide whether we can support transactional memory
KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
@@ -319,8 +344,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
endif
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+ tmp := $(shell echo '\#define KOKKOS_ENABLE_ROCM 1' >> KokkosCore_config.tmp)
+endif
+
ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1)
- tmp := $(shell echo '\#define KOKKOS_ENABLE_OPENMPTARGET 1' >> KokkosCore_config.tmp)
+ tmp := $(shell echo '\#define KOKKOS_ENABLE_OPENMPTARGET 1' >> KokkosCore_config.tmp)
endif
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
@@ -363,6 +392,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
endif
+ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1)
+ tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCBE" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
+endif
+
tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
@@ -561,6 +596,18 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
endif
endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1)
+ tmp := $(shell echo "\#define KOKKOS_ARCH_POWER7 1" >> KokkosCore_config.tmp )
+
+ ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+
+ else
+ # Assume that this is really a GNU compiler or it could be XL on P8.
+ KOKKOS_CXXFLAGS += -mcpu=power7 -mtune=power7
+ KOKKOS_LDFLAGS += -mcpu=power7 -mtune=power7
+ endif
+endif
+
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
@@ -742,7 +789,49 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
endif
endif
+# Figure out the architecture flag for ROCm.
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+ # Lets start with adding architecture defines
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KAVERI), 1)
+ tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 701" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#define KOKKOS_ARCH_KAVERI 1" >> KokkosCore_config.tmp )
+ KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx701
+ endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_CARRIZO), 1)
+ tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 801" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#define KOKKOS_ARCH_CARRIZO 1" >> KokkosCore_config.tmp )
+ KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx801
+ endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_FIJI), 1)
+ tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 803" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#define KOKKOS_ARCH_FIJI 1" >> KokkosCore_config.tmp )
+ KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx803
+ endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA), 1)
+ tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 900" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#define KOKKOS_ARCH_VEGA 1" >> KokkosCore_config.tmp )
+ KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx900
+ endif
+ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_GFX901), 1)
+ tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 901" >> KokkosCore_config.tmp )
+ tmp := $(shell echo "\#define KOKKOS_ARCH_GFX901 1" >> KokkosCore_config.tmp )
+ KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx901
+ endif
+
+
+ KOKKOS_INTERNAL_HCC_PATH := $(shell which $(CXX))
+ ROCM_HCC_PATH ?= $(KOKKOS_INTERNAL_HCC_PATH:/bin/clang++=)
+
+ KOKKOS_CXXFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --cxxflags)
+ KOKKOS_LDFLAGS += $(shell $(ROCM_HCC_PATH)/bin/hcc-config --ldflags) -lhc_am -lm
+ KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_ROCM_ARCH_FLAG)
+
+ KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.cpp)
+ KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.hpp)
+endif
+
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1)
+
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
else
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
index a9341a907c..964ec966d5 100644
--- a/lib/kokkos/Makefile.targets
+++ b/lib/kokkos/Makefile.targets
@@ -42,6 +42,17 @@ Kokkos_Cuda_Locks.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_C
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Locks.cpp
endif
+ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1)
+Kokkos_ROCm_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp
+ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Exec.cpp
+Kokkos_ROCm_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp
+ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Space.cpp
+Kokkos_ROCm_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
+ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Task.cpp
+Kokkos_ROCm_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
+ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/ROCm/Kokkos_ROCm_Impl.cpp
+endif
+
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
diff --git a/lib/kokkos/README b/lib/kokkos/README
index 257a2e5db4..e5ed39ef53 100644
--- a/lib/kokkos/README
+++ b/lib/kokkos/README
@@ -80,6 +80,9 @@ Other compilers working:
X86:
Cygwin 2.1.0 64bit with gcc 4.9.3
+Limited testing of the following compilers on POWER7+ systems:
+ GCC 4.8.5 (on RHEL7.1 POWER7+)
+
Known non-working combinations:
Power8:
Pthreads backend
@@ -171,3 +174,22 @@ Contributions to Kokkos are welcome. In order to do so, please open an issue
where a feature request or bug can be discussed. Then issue a pull request
with your contribution. Pull requests must be issued against the develop branch.
+===========================================================================
+====Citing Kokkos==========================================================
+===========================================================================
+
+If you publish work which mentions Kokkos, please cite the following paper:
+
+@article{CarterEdwards20143202,
+title = "Kokkos: Enabling manycore performance portability through polymorphic memory access patterns ",
+journal = "Journal of Parallel and Distributed Computing ",
+volume = "74",
+number = "12",
+pages = "3202 - 3216",
+year = "2014",
+note = "Domain-Specific Languages and High-Level Frameworks for High-Performance Computing ",
+issn = "0743-7315",
+doi = "https://doi.org/10.1016/j.jpdc.2014.07.003",
+url = "http://www.sciencedirect.com/science/article/pii/S0743731514001257",
+author = "H. Carter Edwards and Christian R. Trott and Daniel Sunderland"
+}
diff --git a/lib/kokkos/config/kokkos-promotion.txt b/lib/kokkos/config/kokkos-promotion.txt
new file mode 100644
index 0000000000..d56298b416
--- /dev/null
+++ b/lib/kokkos/config/kokkos-promotion.txt
@@ -0,0 +1,140 @@
+Summary:
+
+- Step 1: Testing Kokkos itself using test_all_sandia
+
+- Step 2: Testing of Kokkos integrated into Trilinos (config/trilinos-integration/*.sh)
+
+- Step 3: Locally update CHANGELOG, merge into master, edit config/master_history.txt
+
+- Step 4: Locally snapshot new master into corresponding Trilinos branch (develop or temporary), push with checkin-test-sems.sh
+
+- Step 5: Push local Kokkos master to GitHub (need Owner approval)
+
+Steps 1, 2, and 4 include testing that may fail. These failures must be fixed either by pull requests to Kokkos develop, or by creating a new Trilinos branch for parts of Trilinos that must be updated. This is what usually takes the most time.
+
+
+// -------------------------------------------------------------------------------- //
+
+
+Step 1: The following should be repeated on enough machines to cover all
+supported compilers. Those machines are:
+
+ kokkos-dev
+ ??? <- TODO: identify other machines
+
+ 1.1. Clone kokkos develop branch (or just switch to it)
+
+ git clone -b develop git@github.com:kokkos/kokkos.git
+ cd kokkos
+
+ 1.2. Create a testing directory
+
+ mkdir testing
+ cd testing
+
+ 1.3. Run the test_all_sandia script with no options to test all compilers
+
+ nohup ../config/test_all_sandia &
+ tail -f nohup.out # to watch progress
+
+// -------------------------------------------------------------------------------- //
+
+Step 2:
+ 2.1. Build and test Trilinos with 4 different configurations; Run scripts for white and shepard that are provided in kokkos/config/trilinos-integration. These scripts load their own modules/environment, so don't require preparation. You can run all four at the same time, use separate directories for each.
+
+ mkdir serial
+ cd serial
+ nohup KOKKOS_PATH/config/trilinos-integration/shepard_jenkins_run_script_serial_intel &
+
+ 2.2. Compare the compile errors and test failures between updated and pristine versions. There may be compile failures that happen in both, tests that fail in both, and there may be tests that only fail sometimes (thus, rerun tests manually as needed).
+
+// -------------------------------------------------------------------------------- //
+
+Step 3: This step should be run on kokkos-dev
+
+ 3.1. If you don't have a GitHub token already, generate one for yourself (this will give you TOKEN):
+
+ https://github.com/settings/tokens
+
+ 3.2. Get a clean copy of the Kokkos develop branch
+
+ git clone -b develop git@github.com:kokkos/kokkos.git
+ cd kokkos
+
+ 3.3. Generate the initial changelog. Use the most recent tag as OLDTAG (`git tag -l` can show you all tags). The NEWTAG is the new version number, e.g. "2.04.00". RUN THIS OUTSIDE THE KOKKOS SOURCE TREE!
+
+ module load ruby/2.3.1/gcc/5.3.0
+ github_changelog_generator kokkos/kokkos --token TOKEN --no-pull-requests --include-labels 'InDevelop' --enhancement-labels 'enhancement,Feature Request' --future-release 'NEWTAG' --between-tags 'NEWTAG,OLDTAG'
+ cat CHANGELOG.md
+
+ 3.4. Manually clean up and commit the change log. Pushing to develop requires Owner permission.
+ (Copy the new section from the generated CHANGELOG.md to KOKKOS_PATH/CHANGELOG.md)
+ (Make desired changes to CHANGELOG.md to enhance clarity (remove issues not noteworthy))
+ (Commit and push the CHANGELOG.md to develop)
+
+ 3.5. Merge develop into master. DO NOT FAST-FORWARD THE MERGE!!!!
+
+ (From kokkos directory):
+ git checkout master
+ git merge --no-ff origin/develop
+
+ 3.6. Update the tag in kokkos/config/master_history.txt
+
+ Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
+ Tag field widths: #.#.##
+ date description: month:day:year
+ date field widths: ##:##:####
+ master description: SHA1 of previous master commit (use `git log`?)
+ develop description: SHA1 of merged develop branch
+ SHA1 field width: ######## (8 chars)
+
+ # Append to config/master_history.txt:
+
+ tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
+
+ git commit --amend -a
+
+
+ 3.7. Create the new tag:
+
+ git tag -a #.#.##
+
+ (type the following into the tag message (same as for step 4.3))
+ tag: #.#.##
+ date: mm/dd/yyyy
+ master: sha1
+ develop: sha1
+
+ 3.8. DO NOT PUSH YET !!!
+
+
+// -------------------------------------------------------------------------------- //
+
+Step 4: This step can be done on any SEMS machine (e.g. kokkos-dev). Actually, the checkin step requires lots of disk space and RAM. Use ceerws1113 if you have access to it.
+
+ 4.1 Clone the Trilinos corresponding branch (or just switch to it)
+
+ git clone -b develop git@github.com:trilinos/Trilinos.git
+ TRILINOS_PATH=$PWD/Trilinos
+
+ 4.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files. Run the following outside of the Kokkos and Trilinos source trees.
+
+ module load sems-python/2.7.9
+ python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
+
+ 4.3. Run checkin-test to push to trilinos using the CI build modules (gcc/4.9.3)
+
+ cd TRILINOS_PATH
+ mkdir CHECKIN
+ cd CHECKIN
+ nohup ../cmake/std/sems/checkin-test-sems.sh --do-all --push &
+
+ 4.4. If there are failures, fix and backtrack. Otherwise, go to next step
+
+// -------------------------------------------------------------------------------- //
+
+Step 5: Push Kokkos master to GitHub (requires Owner permission).
+
+ cd KOKKOS_PATH
+ git push --follow-tags origin master
+
diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt
index 0447db4b2b..96b05c02e1 100644
--- a/lib/kokkos/config/master_history.txt
+++ b/lib/kokkos/config/master_history.txt
@@ -8,3 +8,4 @@ tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186
tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a
+tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1
diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia
index 005cd20721..e6fcaad261 100755
--- a/lib/kokkos/config/test_all_sandia
+++ b/lib/kokkos/config/test_all_sandia
@@ -167,7 +167,6 @@ if [ "$MACHINE" = "sems" ]; then
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
- "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
diff --git a/lib/kokkos/core/cmake/KokkosCore_config.h.in b/lib/kokkos/core/cmake/KokkosCore_config.h.in
index 621cd54e1c..599c6b0224 100644
--- a/lib/kokkos/core/cmake/KokkosCore_config.h.in
+++ b/lib/kokkos/core/cmake/KokkosCore_config.h.in
@@ -1,15 +1,15 @@
-#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
-#error "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
-#else
-#define KOKKOS_CORE_CONFIG_H
-#endif
-
/* The trivial 'src/build_common.sh' creates a config
* that must stay in sync with this file.
*/
#cmakedefine KOKKOS_FOR_SIERRA
-#ifndef KOKKOS_FOR_SIERRA
+#if !defined(KOKKOS_FOR_SIERRA)
+
+#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)
+#error "Don't include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."
+#else
+#define KOKKOS_CORE_CONFIG_H
+#endif
#cmakedefine KOKKOS_HAVE_CUDA
#cmakedefine KOKKOS_HAVE_OPENMP
@@ -93,12 +93,6 @@
#cmakedefine KOKKOS_ARCH_PASCAL60 1
#cmakedefine KOKKOS_ARCH_PASCAL61 1
-// Don't forbid users from defining this macro on the command line,
-// but still make sure that CMake logic can control its definition.
-#ifndef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
-#cmakedefine KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
-#endif
-
// TODO: These are currently not used in Kokkos. Should they be removed?
#cmakedefine KOKKOS_HAVE_MPI
#cmakedefine KOKKOS_HAVE_CUSPARSE
@@ -107,4 +101,4 @@
#cmakedefine KOKKOS_USING_DEPRECATED_VIEW
#cmakedefine KOKKOS_HAVE_CXX11
-#endif // KOKKOS_FOR_SIERRA
+#endif // !defined(KOKKOS_FOR_SIERRA)
diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt
index 492470d05d..0d5d97a829 100644
--- a/lib/kokkos/core/src/CMakeLists.txt
+++ b/lib/kokkos/core/src/CMakeLists.txt
@@ -9,30 +9,6 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11)
ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA)
-# Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA governs whether Kokkos allows
-# use of lambdas at the outer level of parallel dispatch (that is, as
-# the argument to an outer parallel_for, parallel_reduce, or
-# parallel_scan). This works with non-CUDA execution spaces if C++11
-# is enabled. It does not currently work with public releases of
-# CUDA. If that changes, please change the default here to ON if CUDA
-# and C++11 are ON.
-IF (${PROJECT_NAME}_ENABLE_CXX11)
- IF (${PACKAGE_NAME}_ENABLE_CUDA)
- SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
- ELSE ()
- SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT ON)
- ENDIF ()
-ELSE ()
- SET(Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT OFF)
-ENDIF ()
-
-TRIBITS_ADD_OPTION_AND_DEFINE(
- Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA
- KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
- "Whether Kokkos allows use of lambdas at the outer level of parallel dispatch (that is, as the argument to an outer parallel_for, parallel_reduce, or parallel_scan). This requires C++11. It also does not currently work with public releases of CUDA. As a result, even if C++11 is enabled, this will be OFF by default if CUDA is enabled. If this option is ON, the macro KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA will be defined. For compatibility with Kokkos' Makefile build system, it is also possible to define that macro on the command line."
- ${Kokkos_ENABLE_CXX11_DISPATCH_LAMBDA_DEFAULT}
- )
-
TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h)
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp
index abb263b7cc..43e6386b54 100644
--- a/lib/kokkos/core/src/Kokkos_Array.hpp
+++ b/lib/kokkos/core/src/Kokkos_Array.hpp
@@ -152,10 +152,10 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return pointer(0) ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return const_pointer(0); }
- ~Array() = default ;
- Array() = default ;
- Array( const Array & ) = default ;
- Array & operator = ( const Array & ) = default ;
+ KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
+ KOKKOS_FUNCTION_DEFAULTED Array() = default ;
+ KOKKOS_FUNCTION_DEFAULTED Array( const Array & ) = default ;
+ KOKKOS_FUNCTION_DEFAULTED Array & operator = ( const Array & ) = default ;
// Some supported compilers are not sufficiently C++11 compliant
// for default move constructor and move assignment operator.
@@ -209,7 +209,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
- ~Array() = default ;
+ KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & rhs ) = delete ;
@@ -278,7 +278,7 @@ public:
KOKKOS_INLINE_FUNCTION pointer data() { return m_elem ; }
KOKKOS_INLINE_FUNCTION const_pointer data() const { return m_elem ; }
- ~Array() = default ;
+ KOKKOS_FUNCTION_DEFAULTED ~Array() = default ;
Array() = delete ;
Array( const Array & ) = delete ;
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
index 3c8673c66a..54a97a16c1 100644
--- a/lib/kokkos/core/src/Kokkos_Atomic.hpp
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -80,6 +80,11 @@
// Compiling NVIDIA device code, must use Cuda atomics:
#define KOKKOS_ENABLE_CUDA_ATOMICS
+
+#elif defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU)
+
+#define KOKKOS_ENABLE_ROCM_ATOMICS
+
#endif
#if ! defined( KOKKOS_ENABLE_GNU_ATOMICS ) && \
@@ -154,6 +159,19 @@ const char * atomic_query_version()
} // namespace Kokkos
+#if defined( KOKKOS_ENABLE_ROCM )
+#include
+namespace Kokkos {
+namespace Impl {
+extern KOKKOS_INLINE_FUNCTION
+bool lock_address_rocm_space(void* ptr);
+
+extern KOKKOS_INLINE_FUNCTION
+void unlock_address_rocm_space(void* ptr);
+}
+}
+#endif
+
#ifdef _WIN32
#include "impl/Kokkos_Atomic_Windows.hpp"
#else
diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp
index 1fe964a6d2..26b47a8b74 100644
--- a/lib/kokkos/core/src/Kokkos_Complex.hpp
+++ b/lib/kokkos/core/src/Kokkos_Complex.hpp
@@ -107,6 +107,11 @@ public:
re_ (val), im_ (0.0)
{}
+ // BUG HCC WORKAROUND
+ KOKKOS_INLINE_FUNCTION complex( const RealType& re, const RealType& im):
+ re_ (re), im_ (im)
+ {}
+
//! Constructor that takes the real and imaginary parts.
template
KOKKOS_INLINE_FUNCTION complex (const RealType1& re, const RealType2& im) :
@@ -227,6 +232,16 @@ public:
return re_;
}
+ //! Set the imaginary part of this complex number.
+ KOKKOS_INLINE_FUNCTION void imag (RealType v) {
+ im_ = v;
+ }
+
+ //! Set the real part of this complex number.
+ KOKKOS_INLINE_FUNCTION void real (RealType v) {
+ re_ = v;
+ }
+
KOKKOS_INLINE_FUNCTION
complex& operator += (const complex& src) {
re_ += src.re_;
@@ -299,7 +314,7 @@ public:
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
- const RealType s = ::fabs (y.real ()) + ::fabs (y.imag ());
+ const RealType s = std::fabs (y.real ()) + std::fabs (y.imag ());
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
@@ -537,7 +552,7 @@ operator / (const complex& x, const complex& y) {
// Scale (by the "1-norm" of y) to avoid unwarranted overflow.
// If the real part is +/-Inf and the imaginary part is -/+Inf,
// this won't change the result.
- const RealType s = ::fabs (real (y)) + ::fabs (imag (y));
+ const RealType s = std::fabs (real (y)) + std::fabs (imag (y));
// If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0.
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold,
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
index ddb11d2894..3748c35eb0 100644
--- a/lib/kokkos/core/src/Kokkos_Core.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -74,6 +74,10 @@
#include
#endif
+#if defined( KOKKOS_ENABLE_ROCM )
+#include
+#endif
+
#include
#include
#include
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
index 8c080f7a8f..29001e19ed 100644
--- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -122,6 +122,13 @@ class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
class Cuda; ///< Execution space for Cuda GPU
#endif
+#if defined( KOKKOS_ENABLE_ROCM )
+namespace Experimental {
+class ROCmSpace ; ///< Memory space on ROCm GPU
+class ROCm ; ///< Execution space for ROCm GPU
+}
+#endif
+
template
struct Device;
@@ -140,6 +147,8 @@ namespace Kokkos {
typedef Cuda DefaultExecutionSpace;
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
typedef Experimental::OpenMPTarget DefaultExecutionSpace ;
+#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
+ typedef Experimental::ROCm DefaultExecutionSpace ;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
typedef OpenMP DefaultExecutionSpace;
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -185,6 +194,8 @@ namespace Impl {
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU )
+typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
#else
diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp
index 93b3fa5ca9..f089c16ad2 100644
--- a/lib/kokkos/core/src/Kokkos_Crs.hpp
+++ b/lib/kokkos/core/src/Kokkos_Crs.hpp
@@ -98,18 +98,18 @@ public:
typedef View row_map_type;
typedef View entries_type;
- entries_type entries;
row_map_type row_map;
+ entries_type entries;
//! Construct an empty view.
- Crs () : entries(), row_map() {}
+ Crs() : row_map(), entries() {}
//! Copy constructor (shallow copy).
- Crs (const Crs& rhs) : entries (rhs.entries), row_map (rhs.row_map)
+ Crs(const Crs& rhs) : row_map(rhs.row_map), entries(rhs.entries)
{}
template
- Crs (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
+ Crs(const RowMapType& row_map_, const EntriesType& entries_) : row_map(row_map_), entries(entries_)
{}
/** \brief Assign to a view of the rhs array.
@@ -117,8 +117,8 @@ public:
* then allocated memory is deallocated.
*/
Crs& operator= (const Crs& rhs) {
- entries = rhs.entries;
row_map = rhs.row_map;
+ entries = rhs.entries;
return *this;
}
@@ -151,7 +151,7 @@ void get_crs_transpose_counts(
template< class OutCounts,
class InCrs>
-void get_crs_row_map_from_counts(
+typename OutCounts::value_type get_crs_row_map_from_counts(
OutCounts& out,
InCrs const& in,
std::string const& name = "row_map");
@@ -204,18 +204,20 @@ class CrsRowMapFromCounts {
using execution_space = typename InCounts::execution_space;
using value_type = typename OutRowMap::value_type;
using index_type = typename InCounts::size_type;
+ using last_value_type = Kokkos::View;
private:
- InCounts in;
- OutRowMap out;
+ InCounts m_in;
+ OutRowMap m_out;
+ last_value_type m_last_value;
public:
KOKKOS_INLINE_FUNCTION
void operator()(index_type i, value_type& update, bool final_pass) const {
- update += in(i);
- if (final_pass) {
- out(i + 1) = update;
- if (i == 0) {
- out(0) = 0;
- }
+ if (i < m_in.size()) {
+ update += m_in(i);
+ if (final_pass) m_out(i + 1) = update;
+ } else if (final_pass) {
+ m_out(0) = 0;
+ m_last_value() = update;
}
}
KOKKOS_INLINE_FUNCTION
@@ -226,12 +228,16 @@ class CrsRowMapFromCounts {
}
using self_type = CrsRowMapFromCounts;
CrsRowMapFromCounts(InCounts const& arg_in, OutRowMap const& arg_out):
- in(arg_in),out(arg_out) {
+ m_in(arg_in), m_out(arg_out), m_last_value("last_value") {
+ }
+ value_type execute() {
using policy_type = RangePolicy;
using closure_type = Kokkos::Impl::ParallelScan;
- closure_type closure(*this, policy_type(0, in.size()));
+ closure_type closure(*this, policy_type(0, m_in.size() + 1));
closure.execute();
- execution_space::fence();
+ auto last_value = Kokkos::create_mirror_view(m_last_value);
+ Kokkos::deep_copy(last_value, m_last_value);
+ return last_value();
}
};
@@ -297,13 +303,14 @@ void get_crs_transpose_counts(
template< class OutRowMap,
class InCounts>
-void get_crs_row_map_from_counts(
+typename OutRowMap::value_type get_crs_row_map_from_counts(
OutRowMap& out,
InCounts const& in,
std::string const& name) {
out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1);
Kokkos::Impl::Experimental::
CrsRowMapFromCounts functor(in, out);
+ return functor.execute();
}
template< class DataType,
@@ -328,6 +335,65 @@ void transpose_crs(
FillCrsTransposeEntries entries_functor(in, out);
}
+template< class CrsType,
+ class Functor>
+struct CountAndFill {
+ using data_type = typename CrsType::size_type;
+ using size_type = typename CrsType::size_type;
+ using row_map_type = typename CrsType::row_map_type;
+ using entries_type = typename CrsType::entries_type;
+ using counts_type = row_map_type;
+ CrsType m_crs;
+ Functor m_functor;
+ counts_type m_counts;
+ struct Count {};
+ KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const {
+ m_counts(i) = m_functor(i, nullptr);
+ }
+ struct Fill {};
+ KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const {
+ auto j = m_crs.row_map(i);
+ data_type* fill = &(m_crs.entries(j));
+ m_functor(i, fill);
+ }
+ using self_type = CountAndFill;
+ CountAndFill(CrsType& crs, size_type nrows, Functor const& f):
+ m_crs(crs),
+ m_functor(f)
+ {
+ using execution_space = typename CrsType::execution_space;
+ m_counts = counts_type("counts", nrows);
+ {
+ using count_policy_type = RangePolicy;
+ using count_closure_type =
+ Kokkos::Impl::ParallelFor;
+ const count_closure_type closure(*this, count_policy_type(0, nrows));
+ closure.execute();
+ }
+ auto nentries = Kokkos::Experimental::
+ get_crs_row_map_from_counts(m_crs.row_map, m_counts);
+ m_counts = counts_type();
+ m_crs.entries = entries_type("entries", nentries);
+ {
+ using fill_policy_type = RangePolicy;
+ using fill_closure_type =
+ Kokkos::Impl::ParallelFor;
+ const fill_closure_type closure(*this, fill_policy_type(0, nrows));
+ closure.execute();
+ }
+ crs = m_crs;
+ }
+};
+
+template< class CrsType,
+ class Functor>
+void count_and_fill_crs(
+ CrsType& crs,
+ typename CrsType::size_type nrows,
+ Functor const& f) {
+ Kokkos::Experimental::CountAndFill(crs, nrows, f);
+}
+
}} // namespace Kokkos::Experimental
#endif /* #define KOKKOS_CRS_HPP */
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
index 250ef6630a..7137eaae4b 100644
--- a/lib/kokkos/core/src/Kokkos_Macros.hpp
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -96,6 +96,14 @@
//----------------------------------------------------------------------------
+#if defined(KOKKOS_ENABLE_SERIAL) || defined(KOKKOS_ENABLE_THREADS) || \
+ defined(KOKKOS_ENABLE_OPENMP) || defined(KOKKOS_ENABLE_QTHREADS) || \
+ defined(KOKKOS_ENABLE_ROCM) || defined(KOKKOS_ENABLE_OPENMPTARGET)
+ #define KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND
+#endif
+
+#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
// Compiling with a CUDA compiler.
//
@@ -133,6 +141,9 @@
#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
#define KOKKOS_LAMBDA [=]__device__
+ #if defined( KOKKOS_INTERNAL_ENABLE_NON_CUDA_BACKEND )
+ #undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+ #endif
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
@@ -141,16 +152,13 @@
#endif
#endif
- #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
- #endif
-#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
-
-#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
- // Cuda version 8.0 still needs the functor wrapper
- #if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
+ #if defined( __NVCC__ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
- #endif
-#endif
+ #endif
+ #else // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+ #undef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
+ #endif // !defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
//----------------------------------------------------------------------------
// Language info: C++, CUDA, OPENMP
@@ -161,8 +169,20 @@
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
#define KOKKOS_FUNCTION __device__ __host__
+ #ifdef KOKKOS_COMPILER_CLANG
+ #define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
+ #endif
#endif // #if defined( __CUDA_ARCH__ )
+#if defined( KOKKOS_ENABLE_ROCM ) && defined( __HCC__ )
+
+ #define KOKKOS_FORCEINLINE_FUNCTION __attribute__((amp,cpu)) inline
+ #define KOKKOS_INLINE_FUNCTION __attribute__((amp,cpu)) inline
+ #define KOKKOS_FUNCTION __attribute__((amp,cpu))
+ #define KOKKOS_LAMBDA [=] __attribute__((amp,cpu))
+ #define KOKKOS_FUNCTION_DEFAULTED KOKKOS_FUNCTION
+#endif
+
#if defined( _OPENMP )
// Compiling with OpenMP.
// The value of _OPENMP is an integer value YYYYMM
@@ -179,15 +199,6 @@
// Host code is compiled again with another compiler.
// Device code is compile to 'ptx'.
#define KOKKOS_COMPILER_NVCC __NVCC__
-#else
- #if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
- #if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
- // CUDA (including version 6.5) does not support giving lambdas as
- // arguments to global functions. Thus its not currently possible
- // to dispatch lambdas from the host.
- #define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
- #endif
- #endif
#endif // #if defined( __NVCC__ )
#if !defined( KOKKOS_LAMBDA )
@@ -321,6 +332,10 @@
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
+
+ #if ! defined( KOKKOS_ENABLE_ASM )
+ #define KOKKOS_ENABLE_ASM 1
+ #endif
#endif
//----------------------------------------------------------------------------
@@ -397,6 +412,10 @@
#define KOKKOS_FUNCTION /**/
#endif
+#if !defined( KOKKOS_FUNCTION_DEFAULTED )
+ #define KOKKOS_FUNCTION_DEFAULTED /**/
+#endif
+
//----------------------------------------------------------------------------
// Define empty macro for restrict if necessary:
@@ -424,6 +443,7 @@
// There is zero or one default execution space specified.
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+ ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
@@ -435,6 +455,7 @@
// If default is not specified then chose from enabled execution spaces.
// Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
@@ -442,6 +463,8 @@
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
#elif defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined( KOKKOS_ENABLE_ROCM )
+ #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_ROCM
#elif defined( KOKKOS_ENABLE_OPENMPTARGET )
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMPTARGET
#elif defined( KOKKOS_ENABLE_OPENMP )
@@ -459,6 +482,8 @@
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+#elif defined( __HCC__ ) && defined( __HCC_ACCELERATOR__ ) && defined( KOKKOS_ENABLE_ROCM )
+ #define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_ROCM_GPU
#else
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
#endif
diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
index 1da936067d..4ba5812f9e 100644
--- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
+++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp
@@ -233,12 +233,24 @@ public:
//--------------------------------------------------------------------------
- MemoryPool() = default ;
MemoryPool( MemoryPool && ) = default ;
MemoryPool( const MemoryPool & ) = default ;
MemoryPool & operator = ( MemoryPool && ) = default ;
MemoryPool & operator = ( const MemoryPool & ) = default ;
+ MemoryPool()
+ : m_tracker()
+ , m_sb_state_array(0)
+ , m_sb_state_size(0)
+ , m_sb_size_lg2(0)
+ , m_max_block_size_lg2(0)
+ , m_min_block_size_lg2(0)
+ , m_sb_count(0)
+ , m_hint_offset(0)
+ , m_data_offset(0)
+ , m_unused_padding(0)
+ {}
+
/**\brief Allocate a memory pool from 'memspace'.
*
* The memory pool will have at least 'min_total_alloc_size' bytes
diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
index 9df6d4ba09..c392fc5b9a 100644
--- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
+++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp
@@ -1016,7 +1016,7 @@ parallel_reduce( std::string const & arg_label
//------------------------------
- #if (KOKKOS_ENABLE_PROFILING)
+ #if defined(KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::beginParallelReduce(arg_label, 0, &kpID);
@@ -1042,7 +1042,7 @@ parallel_reduce( std::string const & arg_label
//------------------------------
- #if (KOKKOS_ENABLE_PROFILING)
+ #if defined(KOKKOS_ENABLE_PROFILING)
if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Profiling::endParallelReduce(kpID);
}
diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp
new file mode 100644
index 0000000000..b13b0b01de
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp
@@ -0,0 +1,220 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ROCM_HPP
+#define KOKKOS_ROCM_HPP
+
+#include
+
+#if defined( KOKKOS_ENABLE_ROCM )
+#include
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*--------------------------------------------------------------------------*/
+
+#include
+#include
+#include
+
+#if defined( __HCC_ACCELERATOR__ )
+
+using namespace ::Concurrency::precise_math ;
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class ROCmExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+/// \class ROCm
+/// \brief Kokkos execution space for ROCm GPUs; its memory space is ROCmSpace.
+class ROCm {
+public:
+ //------------------------------------
+ //! \name Type declarations that all Kokkos devices must provide.
+ //@{
+
+ //! Tag this class as a kokkos execution space
+ typedef ROCm execution_space ;
+ typedef ROCmSpace memory_space ;
+ typedef Kokkos::Device device_type;
+
+ typedef LayoutLeft array_layout ;
+ typedef HostSpace::size_type size_type ;
+
+ typedef ScratchMemorySpace< ROCm > scratch_memory_space ;
+
+ ~ROCm() {}
+ ROCm();
+// explicit ROCm( const int instance_id );
+
+ ROCm( ROCm && ) = default ;
+ ROCm( const ROCm & ) = default ;
+ ROCm & operator = ( ROCm && ) = default ;
+ ROCm & operator = ( const ROCm & ) = default ;
+
+
+ //@}
+ //------------------------------------
+ //! \name Functions that all Kokkos devices must implement.
+ //@{
+
+ KOKKOS_INLINE_FUNCTION static int in_parallel() {
+#if defined( __HCC_ACCELERATOR__ )
+ return true;
+#else
+ return false;
+#endif
+ }
+
+ /** \brief Set the device in a "sleep" state. */
+ static bool sleep() ;
+
+ /** \brief Wake the device from the 'sleep' state. (The "noop for OpenMP" remark appears inherited from another backend's header; TODO confirm ROCm behavior.) */
+ static bool wake() ;
+
+ /** \brief Wait until all dispatched functors complete. (The "noop for OpenMP" remark appears inherited from another backend's header; TODO confirm ROCm behavior.) */
+ static void fence() ;
+
+ /// \brief Print configuration information to the given output stream.
+ static void print_configuration( std::ostream & , const bool detail = false );
+
+ /// \brief Free any resources being consumed by the device.
+ static void finalize() ;
+
+ /** \brief Initialize the device.
+ *
+ */
+ struct SelectDevice {
+ int rocm_device_id ;
+ SelectDevice() : rocm_device_id(1) {}
+ explicit SelectDevice( int id ) : rocm_device_id( id+1 ) {}
+ };
+
+ int rocm_device() const { return m_device ; }
+ bool isAPU();
+ bool isAPU(int device);
+
+ static void initialize( const SelectDevice = SelectDevice());
+
+ static int is_initialized();
+
+// static size_type device_arch();
+
+// static size_type detect_device_count();
+
+
+ static int concurrency() ;
+ static const char* name();
+private:
+ int m_device ;
+
+};
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct MemorySpaceAccess
+ < Kokkos::Experimental::ROCmSpace
+ , Kokkos::Experimental::ROCm::scratch_memory_space
+ >
+{
+ enum { assignable = false };
+ enum { accessible = true };
+ enum { deepcopy = false };
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+ < Kokkos::Experimental::ROCm::memory_space
+ , Kokkos::Experimental::ROCm::scratch_memory_space
+ >
+{
+ enum { value = true };
+ KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+ KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+ < Kokkos::HostSpace
+ , Kokkos::Experimental::ROCm::scratch_memory_space
+ >
+{
+ enum { value = false };
+ inline static void verify( void ) { Experimental::ROCmSpace::access_error(); }
+ inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); }
+};
+} // namespace Impl
+} // namespace Kokkos
+
+
+#include
+#include
+
+#endif
+#endif
+
+
diff --git a/lib/kokkos/core/src/Kokkos_ROCmSpace.hpp b/lib/kokkos/core/src/Kokkos_ROCmSpace.hpp
new file mode 100644
index 0000000000..dce6a3cb3f
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_ROCmSpace.hpp
@@ -0,0 +1,622 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ROCMSPACE_HPP
+#define KOKKOS_ROCMSPACE_HPP
+
+#include
+
+#if defined( KOKKOS_ENABLE_ROCM )
+
+#include
+#include
+#include
+
+#include
+
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Experimental {
+/** \brief ROCm on-device memory management */
+
+/// Memory space tag for ROCm device memory. Provides untracked
+/// allocate/deallocate entry points, a name, and the error reporters used
+/// when host code attempts to touch memory of this space.
+class ROCmSpace {
+public:
+
+ //! Tag this class as a kokkos memory space
+ typedef ROCmSpace memory_space ;
+ typedef Kokkos::Experimental::ROCm execution_space ;
+ //! Pairing of this memory space with its execution space
+ typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+ typedef unsigned int size_type ;
+
+ /*--------------------------------*/
+
+ ROCmSpace();
+ ROCmSpace( ROCmSpace && rhs ) = default ;
+ ROCmSpace( const ROCmSpace & rhs ) = default ;
+ ROCmSpace & operator = ( ROCmSpace && rhs ) = default ;
+ ROCmSpace & operator = ( const ROCmSpace & rhs ) = default ;
+ ~ROCmSpace() = default ;
+
+ /**\brief Allocate untracked memory in the rocm space
+  * \param arg_alloc_size requested size (presumably in bytes - confirm in the definition)
+  */
+ void * allocate( const size_t arg_alloc_size ) const ;
+
+ /**\brief Deallocate untracked memory in the rocm space
+  * \param arg_alloc_ptr  pointer previously returned by allocate()
+  * \param arg_alloc_size size that was passed to allocate()
+  */
+ void deallocate( void * const arg_alloc_ptr
+                , const size_t arg_alloc_size ) const ;
+
+ /**\brief Return Name of the MemorySpace */
+ static constexpr const char* name() { return m_name; }
+
+ /*--------------------------------*/
+ /** \brief Error reporting for HostSpace attempt to access ROCmSpace */
+ static void access_error();
+ static void access_error( const void * const );
+
+private:
+
+ int m_device ; ///< Which ROCm device
+
+ static constexpr const char* m_name = "ROCm";
+ // The tracked-allocation record needs access to the private members above.
+ friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void > ;
+};
+
+} // namespace Experimental
+
+namespace Impl {
+
+// Raw allocation helpers for the ROCm backend; defined elsewhere.
+// NOTE(review): the size argument is an `int` here while the memory-space
+// allocate() methods take `size_t` - confirm large requests cannot be truncated.
+void * rocm_device_allocate(int);
+void * rocm_hostpinned_allocate(int);
+void rocm_device_free(void * );
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_arrays_rocm_space();
+
+/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+/// \param deallocate presumably releases the array instead of returning it
+///        when true - TODO confirm against the definition.
+int* atomic_lock_array_rocm_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+/// \param deallocate presumably releases the array instead of returning it
+///        when true - TODO confirm against the definition.
+int* scratch_lock_array_rocm_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for unique identifiers.
+///
+/// Unique identifiers in the range 0-ROCm::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+/// \param deallocate presumably releases the array instead of returning it
+///        when true - TODO confirm against the definition.
+int* threadid_lock_array_rocm_space_ptr(bool deallocate = false);
+}
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+
+namespace Kokkos {
+namespace Experimental {
+/** \brief Host memory that is accessible to ROCm execution space
+ * through ROCm's host-pinned memory allocation.
+ */
+/// Memory space tag for host-pinned memory. Its execution space is the
+/// host's (HostSpace::execution_space), as the typedef below states.
+class ROCmHostPinnedSpace {
+public:
+
+ //! Tag this class as a kokkos memory space
+ /** \brief Memory is in HostSpace so use the HostSpace::execution_space */
+ typedef HostSpace::execution_space execution_space ;
+ typedef ROCmHostPinnedSpace memory_space ;
+ //! Pairing of this memory space with its execution space
+ typedef Kokkos::Device<execution_space,memory_space> device_type;
+ typedef unsigned int size_type ;
+
+ /*--------------------------------*/
+
+ ROCmHostPinnedSpace();
+ ROCmHostPinnedSpace( ROCmHostPinnedSpace && rhs ) = default ;
+ ROCmHostPinnedSpace( const ROCmHostPinnedSpace & rhs ) = default ;
+ ROCmHostPinnedSpace & operator = ( ROCmHostPinnedSpace && rhs ) = default ;
+ ROCmHostPinnedSpace & operator = ( const ROCmHostPinnedSpace & rhs ) = default ;
+ ~ROCmHostPinnedSpace() = default ;
+
+ /**\brief Allocate untracked memory in the space
+  * \param arg_alloc_size requested size (presumably in bytes - confirm in the definition)
+  */
+ void * allocate( const size_t arg_alloc_size ) const ;
+
+ /**\brief Deallocate untracked memory in the space
+  * \param arg_alloc_ptr  pointer previously returned by allocate()
+  * \param arg_alloc_size size that was passed to allocate()
+  */
+ void deallocate( void * const arg_alloc_ptr
+                , const size_t arg_alloc_size ) const ;
+
+ /**\brief Return Name of the MemorySpace */
+ static constexpr const char* name() { return m_name; }
+
+private:
+
+ static constexpr const char* m_name = "ROCmHostPinned";
+
+ /*--------------------------------*/
+};
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+// Sanity check: a ROCmSpace view must be assignable from another ROCmSpace view
+// (relies on the primary MemorySpaceAccess template, declared elsewhere).
+static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace >::assignable , "" );
+
+//----------------------------------------
+
+/// HostSpace with respect to ROCmSpace memory:
+/// not assignable, not accessible, deep copy allowed.
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace > {
+ enum { assignable = false };
+ enum { accessible = false };
+ enum { deepcopy = true };
+};
+
+/// HostSpace with respect to ROCmHostPinnedSpace memory:
+/// assignable, accessible and deep-copyable.
+template<>
+struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
+ // HostSpace::execution_space == ROCmHostPinnedSpace::execution_space
+ enum { assignable = true };
+ enum { accessible = true };
+ enum { deepcopy = true };
+};
+
+//----------------------------------------
+
+/// ROCmSpace with respect to HostSpace memory:
+/// not assignable, not accessible, deep copy allowed.
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace > {
+ enum { assignable = false };
+ enum { accessible = false };
+ enum { deepcopy = true };
+};
+
+/// ROCmSpace with respect to ROCmHostPinnedSpace memory:
+/// accessible and deep-copyable, but not assignable.
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace > {
+ // ROCmSpace::execution_space != ROCmHostPinnedSpace::execution_space
+ enum { assignable = false };
+ enum { accessible = true }; // ROCmSpace::execution_space
+ enum { deepcopy = true };
+};
+
+
+//----------------------------------------
+// ROCmHostPinnedSpace::execution_space == HostSpace::execution_space
+// ROCmHostPinnedSpace accessible to both ROCm and Host
+
+/// ROCmHostPinnedSpace with respect to HostSpace memory:
+/// accessible and deep-copyable, but not assignable.
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::HostSpace > {
+ enum { assignable = false }; // Cannot access from ROCm
+ enum { accessible = true }; // ROCmHostPinnedSpace::execution_space
+ enum { deepcopy = true };
+};
+
+/// ROCmHostPinnedSpace with respect to ROCmSpace memory:
+/// neither assignable nor accessible; deep copy allowed.
+template<>
+struct MemorySpaceAccess< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace > {
+ enum { assignable = false }; // Cannot access from Host
+ enum { accessible = false };
+ enum { deepcopy = true };
+};
+
+};
+//----------------------------------------
+
+} // namespace Kokkos::Impl
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// Starts a copy of `n` units (presumably bytes - confirm in the definition)
+/// from `src` to `dst` and returns an hc::completion_future for it;
+/// callers in this header call wait() on the returned future.
+hc::completion_future DeepCopyAsyncROCm( void * dst , const void * src , size_t n);
+
+// DeepCopy specializations for the ROCm execution space. Only the
+// constructors are declared here; they are defined elsewhere. Each struct has
+// a (dst, src, size) constructor and one that additionally takes a ROCm
+// execution-space instance.
+
+/// ROCmSpace <- ROCmSpace
+template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm>
+{
+ DeepCopy( void * dst , const void * src , size_t );
+ DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+/// ROCmSpace <- HostSpace
+template<> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >
+{
+ DeepCopy( void * dst , const void * src , size_t );
+ DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+/// HostSpace <- ROCmSpace
+template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >
+{
+ DeepCopy( void * dst , const void * src , size_t );
+ DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+/// ROCmSpace <- ROCmSpace deep copy requested from an arbitrary execution space.
+/// The three-argument form forwards to the ROCm specialization; the
+/// execution-space form fences `exec`, starts an asynchronous copy and waits
+/// for its completion future.
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+   fut.wait();
+ }
+};
+
+/// ROCmSpace <- HostSpace deep copy requested from an arbitrary execution space.
+/// The three-argument form forwards to the ROCm specialization; the
+/// execution-space form fences `exec` and then performs the three-argument copy.
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   // Injected class name: constructs a temporary of this specialization,
+   // i.e. runs the three-argument constructor above.
+   DeepCopy (dst,src,n);
+ }
+};
+
+/// HostSpace <- ROCmSpace deep copy requested from an arbitrary execution space.
+/// The three-argument form forwards to the ROCm specialization; the
+/// execution-space form fences `exec` and then performs the three-argument copy.
+template<class ExecutionSpace>
+struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   // Injected class name: constructs a temporary of this specialization,
+   // i.e. runs the three-argument constructor above.
+   DeepCopy (dst,src,n);
+ }
+};
+
+// DeepCopy specializations involving ROCmHostPinnedSpace for the ROCm
+// execution space. Only the constructors are declared here; they are defined
+// elsewhere.
+
+/// ROCmHostPinnedSpace <- ROCmHostPinnedSpace
+template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm>
+{
+ DeepCopy( void * dst , const void * src , size_t );
+ DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+/// ROCmHostPinnedSpace <- HostSpace
+template<> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm >
+{
+ DeepCopy( void * dst , const void * src , size_t );
+ DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+/// HostSpace <- ROCmHostPinnedSpace
+template<> struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >
+{
+ DeepCopy( void * dst , const void * src , size_t );
+ DeepCopy( const Kokkos::Experimental::ROCm & , void * dst , const void * src , size_t );
+};
+
+/// ROCmSpace <- ROCmHostPinnedSpace deep copy requested from an arbitrary
+/// execution space. The three-argument form reuses the
+/// (ROCmSpace <- HostSpace, ROCm) specialization; the execution-space form
+/// fences `exec`, starts an asynchronous copy and waits for it.
+template<class ExecutionSpace>
+struct DeepCopy< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace>
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< Kokkos::Experimental::ROCmSpace , HostSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+   fut.wait();
+ }
+};
+
+/// ROCmHostPinnedSpace <- ROCmSpace deep copy requested from an arbitrary
+/// execution space. The three-argument form reuses the
+/// (HostSpace <- ROCmSpace, ROCm) specialization; the execution-space form
+/// fences `exec`, starts an asynchronous copy and waits for it.
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   hc::completion_future fut = DeepCopyAsyncROCm (dst,src,n);
+   fut.wait();
+ }
+};
+
+
+
+/// ROCmHostPinnedSpace <- ROCmHostPinnedSpace deep copy requested from an
+/// arbitrary execution space. The three-argument form forwards to the ROCm
+/// specialization; the execution-space form fences `exec` and then performs
+/// the three-argument copy.
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   // Injected class name: constructs a temporary of this specialization,
+   // i.e. runs the three-argument constructor above.
+   DeepCopy (dst,src,n);
+ }
+};
+
+/// ROCmHostPinnedSpace <- HostSpace deep copy requested from an arbitrary
+/// execution space. The three-argument form forwards to the ROCm
+/// specialization; the execution-space form fences `exec` and then performs
+/// the three-argument copy.
+template<class ExecutionSpace> struct DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< Kokkos::Experimental::ROCmHostPinnedSpace , HostSpace , Kokkos::Experimental::ROCm>( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   // Injected class name: constructs a temporary of this specialization,
+   // i.e. runs the three-argument constructor above.
+   DeepCopy (dst,src,n);
+ }
+};
+
+/// HostSpace <- ROCmHostPinnedSpace deep copy requested from an arbitrary
+/// execution space. The three-argument form forwards to the ROCm
+/// specialization; the execution-space form fences `exec` and then performs
+/// the three-argument copy.
+template<class ExecutionSpace>
+struct DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , ExecutionSpace >
+{
+ inline
+ DeepCopy( void * dst , const void * src , size_t n )
+ { (void) DeepCopy< HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace , Kokkos::Experimental::ROCm >( dst , src , n ); }
+
+ inline
+ DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n )
+ {
+   exec.fence();
+   // Injected class name: constructs a temporary of this specialization,
+   // i.e. runs the three-argument constructor above.
+   DeepCopy (dst,src,n);
+ }
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Running in ROCmSpace attempting to access HostSpace: error */
+// value == false; both verify() overloads abort with a fixed message.
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::HostSpace >
+{
+ enum { value = false };
+ KOKKOS_INLINE_FUNCTION static void verify( void )
+ { Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
+
+ KOKKOS_INLINE_FUNCTION static void verify( const void * )
+ { Kokkos::abort("ROCm code attempted to access HostSpace memory"); }
+};
+
+/** Running in ROCmSpace accessing ROCmHostPinnedSpace: ok */
+// value == true; the verify() overloads are intentionally empty.
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::ROCmSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
+{
+ enum { value = true };
+ KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+ KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in ROCmSpace attempting to access an unknown space: error */
+// Catch-all for any OtherSpace that is not ROCmSpace itself:
+// value == false and both verify() overloads abort with a fixed message.
+template< class OtherSpace >
+struct VerifyExecutionCanAccessMemorySpace<
+ typename enable_if< ! is_same<Kokkos::Experimental::ROCmSpace,OtherSpace>::value , Kokkos::Experimental::ROCmSpace >::type ,
+ OtherSpace >
+{
+ enum { value = false };
+ KOKKOS_INLINE_FUNCTION static void verify( void )
+ { Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
+
+ KOKKOS_INLINE_FUNCTION static void verify( const void * )
+ { Kokkos::abort("ROCm code attempted to access unknown Space memory"); }
+};
+
+//----------------------------------------------------------------------------
+/** Running in HostSpace attempting to access ROCmSpace */
+// value == false; both verify() overloads forward to ROCmSpace::access_error().
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmSpace >
+{
+ enum { value = false };
+ inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); }
+ inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); }
+};
+
+/** Running in HostSpace accessing ROCmHostPinnedSpace is OK */
+// value == true; the verify() overloads are intentionally empty.
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::ROCmHostPinnedSpace >
+{
+ enum { value = true };
+ KOKKOS_INLINE_FUNCTION static void verify( void ) {}
+ KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/// Allocation record for tracked allocations in ROCmSpace.
+/// Not copyable; instances are obtained through the static allocate()
+/// factory, and the constructor/destructor are protected. All member
+/// functions are only declared here.
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::ROCmSpace , void >
+ : public SharedAllocationRecord< void , void >
+{
+private:
+
+
+ typedef SharedAllocationRecord< void , void > RecordBase ;
+
+ // Copying a record is disallowed.
+ SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+ SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+ // Default deallocation callback handed to the base record (see constructor).
+ static void deallocate( RecordBase * );
+
+ static RecordBase s_root_record ;
+
+ // Copy of the memory space instance passed at construction.
+ const Kokkos::Experimental::ROCmSpace m_space ;
+
+protected:
+
+ ~SharedAllocationRecord();
+
+ /// \param arg_space      memory space the allocation belongs to
+ /// \param arg_label      user label for the allocation
+ /// \param arg_alloc_size requested allocation size
+ /// \param arg_dealloc    deallocation callback; defaults to deallocate()
+ SharedAllocationRecord( const Kokkos::Experimental::ROCmSpace & arg_space
+ , const std::string & arg_label
+ , const size_t arg_alloc_size
+ , const RecordBase::function_type arg_dealloc = & deallocate
+ );
+
+public:
+
+ /// Returns the label associated with this record.
+ std::string get_label() const ;
+
+ /// Factory for a new record in `arg_space` with the given label and size.
+ static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmSpace & arg_space
+ , const std::string & arg_label
+ , const size_t arg_alloc_size );
+
+ /**\brief Allocate tracked memory in the space */
+ static
+ void * allocate_tracked( const Kokkos::Experimental::ROCmSpace & arg_space
+ , const std::string & arg_label
+ , const size_t arg_alloc_size );
+
+ /**\brief Reallocate tracked memory in the space */
+ static
+ void * reallocate_tracked( void * const arg_alloc_ptr
+ , const size_t arg_alloc_size );
+
+ /**\brief Deallocate tracked memory in the space */
+ static
+ void deallocate_tracked( void * const arg_alloc_ptr );
+
+ /// Looks up the record that belongs to a tracked allocation pointer.
+ static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+ /// Writes allocation records to the stream; `detail` selects verbosity.
+ static void print_records( std::ostream & , const Kokkos::Experimental::ROCmSpace & , bool detail = false );
+};
+
+/// Allocation record for tracked allocations in ROCmHostPinnedSpace.
+/// Not copyable; instances are obtained through the static allocate()
+/// factory, and the constructors/destructor are protected. Apart from the
+/// default constructor, member functions are only declared here.
+template<>
+class SharedAllocationRecord< Kokkos::Experimental::ROCmHostPinnedSpace , void >
+ : public SharedAllocationRecord< void , void >
+{
+private:
+
+ typedef SharedAllocationRecord< void , void > RecordBase ;
+
+ // Copying a record is disallowed.
+ SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+ SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+ // Default deallocation callback handed to the base record (see constructor).
+ static void deallocate( RecordBase * );
+
+ static RecordBase s_root_record ;
+
+ // Copy of the memory space instance passed at construction.
+ const Kokkos::Experimental::ROCmHostPinnedSpace m_space ;
+
+protected:
+
+ ~SharedAllocationRecord();
+ // Default-constructs the base record and the memory space member.
+ SharedAllocationRecord() : RecordBase(), m_space() {}
+
+ /// \param arg_space      memory space the allocation belongs to
+ /// \param arg_label      user label for the allocation
+ /// \param arg_alloc_size requested allocation size
+ /// \param arg_dealloc    deallocation callback; defaults to deallocate()
+ SharedAllocationRecord( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
+ , const std::string & arg_label
+ , const size_t arg_alloc_size
+ , const RecordBase::function_type arg_dealloc = & deallocate
+ );
+
+public:
+
+ /// Returns the label associated with this record.
+ std::string get_label() const ;
+
+ /// Factory for a new record in `arg_space` with the given label and size.
+ static SharedAllocationRecord * allocate( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
+ , const std::string & arg_label
+ , const size_t arg_alloc_size
+ );
+ /**\brief Allocate tracked memory in the space */
+ static
+ void * allocate_tracked( const Kokkos::Experimental::ROCmHostPinnedSpace & arg_space
+ , const std::string & arg_label
+ , const size_t arg_alloc_size );
+
+ /**\brief Reallocate tracked memory in the space */
+ static
+ void * reallocate_tracked( void * const arg_alloc_ptr
+ , const size_t arg_alloc_size );
+
+ /**\brief Deallocate tracked memory in the space */
+ static
+ void deallocate_tracked( void * const arg_alloc_ptr );
+
+
+ /// Looks up the record that belongs to a tracked allocation pointer.
+ static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+ /// Writes allocation records to the stream; `detail` selects verbosity.
+ static void print_records( std::ostream & , const Kokkos::Experimental::ROCmHostPinnedSpace & , bool detail = false );
+};
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */
+#endif /* #define KOKKOS_ROCMSPACE_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
index fcfc91a4ee..079f80f556 100644
--- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
+++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp
@@ -681,6 +681,67 @@ public:
return f ;
}
+ /**\brief Create an aggregate task that depends on the futures returned by
+  *        func(0) ... func(narg-1).
+  *
+  * \param narg number of futures to aggregate
+  * \param func functor; func(i) must return a Kokkos::Future
+  * \return a future for the aggregate task; a default-constructed future if
+  *         narg == 0, and a future with a null task if the queue could not
+  *         provide memory.
+  *
+  * Aborts if any non-null input future belongs to a different scheduler.
+  */
+ template < class F >
+ KOKKOS_FUNCTION
+ Future< execution_space >
+ when_all( int narg , F const func )
+ {
+   using input_type  = decltype( func(0) );
+   using future_type = Future< execution_space > ;
+   using task_base   = Kokkos::Impl::TaskBase< void , void , void > ;
+
+   static_assert( is_future< input_type >::value
+                , "Functor must return a Kokkos::Future" );
+
+   future_type f ;
+
+   if ( 0 == narg ) return f ;
+
+   size_t const alloc_size = m_queue->when_all_allocation_size( narg );
+
+   f.m_task =
+     reinterpret_cast< task_base * >( m_queue->allocate( alloc_size ) );
+
+   if ( f.m_task ) {
+
+     // Reference count starts at two:
+     // +1 to match decrement when task completes
+     // +1 for the future
+
+     new( f.m_task ) task_base();
+
+     f.m_task->m_queue      = m_queue ;
+     f.m_task->m_ref_count  = 2 ;
+     f.m_task->m_alloc_size = alloc_size ;
+     f.m_task->m_dep_count  = narg ;
+     f.m_task->m_task_type  = task_base::Aggregate ;
+
+     // Assign dependences, incrementing each dependence's reference count.
+
+     task_base * volatile * const dep =
+       f.m_task->aggregate_dependences();
+
+     for ( int i = 0 ; i < narg ; ++i ) {
+       const input_type arg_f = func(i);
+       if ( 0 != arg_f.m_task ) {
+
+         if ( m_queue != static_cast< queue_type * >( arg_f.m_task->m_queue ) ) {
+           Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
+         }
+         // Increment reference count to track subsequent assignment.
+         Kokkos::atomic_increment( &(arg_f.m_task->m_ref_count) );
+       }
+       // Always write the slot, including a null task: m_dep_count
+       // advertises narg slots and the memory behind them comes straight
+       // from the queue allocation, so a skipped slot would hold an
+       // indeterminate pointer.
+       dep[i] = arg_f.m_task ;
+     }
+
+     Kokkos::memory_fence();
+
+     m_queue->schedule_aggregate( f.m_task );
+     // this when_all may be processed at any moment
+   }
+   return f ;
+ }
+
//----------------------------------------
KOKKOS_INLINE_FUNCTION
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp
index 1754e4a8fb..47b105cfdc 100644
--- a/lib/kokkos/core/src/Kokkos_View.hpp
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@@ -2429,6 +2429,7 @@ template < class ValueType >
struct CommonViewAllocProp< void, ValueType >
{
using value_type = ValueType;
+ using scalar_array_type = ValueType;
template < class ... Views >
CommonViewAllocProp( const Views & ... ) {}
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp
new file mode 100644
index 0000000000..a93f488203
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp
@@ -0,0 +1,439 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include
+//#include
+
+#ifdef KOKKOS_ENABLE_ROCM_ATOMICS
+namespace Kokkos {
+ //ROCm can do:
+ //Types int/unsigned int
+ //variants: atomic_exchange/compare_exchange/fetch_add/fetch_sub/fetch_max/fetch_min/fetch_and/fetch_or/fetch_xor/fetch_inc/fetch_dec
+
+
+ // Integer atomic_exchange overloads. Each forwards to an hc:: exchange
+ // intrinsic and returns that intrinsic's result. The 64-bit signed,
+ // long long and unsigned long long overloads reinterpret their operands
+ // as uint64_t.
+
+ /// int overload: forwards to hc::atomic_exchange_int.
+ KOKKOS_INLINE_FUNCTION
+ int atomic_exchange(int* dest, const int& val) {
+ return hc::atomic_exchange_int(dest, val);
+ }
+
+ /// unsigned int overload: forwards to hc::atomic_exchange_unsigned.
+ KOKKOS_INLINE_FUNCTION
+ unsigned int atomic_exchange(unsigned int* dest, const unsigned int& val) {
+ return hc::atomic_exchange_unsigned(dest, val);
+ }
+
+ /// int64_t overload: forwards to hc::atomic_exchange_uint64 via casts.
+ KOKKOS_INLINE_FUNCTION
+ int64_t atomic_exchange(int64_t* dest, const int64_t& val) {
+ return (int64_t)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
+ }
+
+ /// uint64_t overload: forwards to hc::atomic_exchange_uint64.
+ KOKKOS_INLINE_FUNCTION
+ uint64_t atomic_exchange(uint64_t* dest, const uint64_t& val) {
+ return hc::atomic_exchange_uint64(dest, val);
+ }
+
+ /// long long overload: forwards to hc::atomic_exchange_uint64 via casts.
+ KOKKOS_INLINE_FUNCTION
+ long long atomic_exchange(long long* dest, const long long& val) {
+ return (long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
+ }
+
+ /// unsigned long long overload: forwards to hc::atomic_exchange_uint64 via casts.
+ KOKKOS_INLINE_FUNCTION
+ unsigned long long atomic_exchange(unsigned long long* dest, const unsigned long long& val) {
+ return (unsigned long long)hc::atomic_exchange_uint64((uint64_t*)dest, (const uint64_t&)val);
+ }
+
+ /// float overload: exchanges the bit pattern of `val` into `*dest` through
+ /// hc::atomic_exchange_int and returns the previous contents as a float.
+ KOKKOS_INLINE_FUNCTION
+ float atomic_exchange(float* dest, const float& val) {
+ // Union used to move between the float value and its int bit pattern.
+ union U {
+ int i ;
+ float f ;
+ KOKKOS_INLINE_FUNCTION U() {};
+ } idest,ival;
+ idest.f = *dest; // plain read; overwritten by the intrinsic's result below
+ ival.f = val;
+ idest.i = hc::atomic_exchange_int((int*)dest, ival.i);
+ return idest.f;
+ }
+
+ /// double overload: exchanges the bit pattern of `val` into `*dest` through
+ /// hc::atomic_exchange_uint64 and returns the previous contents as a double.
+ KOKKOS_INLINE_FUNCTION
+ double atomic_exchange(double* dest, const double& val) {
+ // Union used to move between the double value and its 64-bit pattern.
+ union U {
+ uint64_t i ;
+ double d ;
+ KOKKOS_INLINE_FUNCTION U() {};
+ } idest,ival;
+ idest.d = *dest; // plain read; overwritten by the intrinsic's result below
+ ival.d = val;
+ idest.i = hc::atomic_exchange_uint64((uint64_t*)dest, ival.i);
+ return idest.d;
+ }
+
+ // Forward declarations: the generic atomic_exchange templates that follow
+ // call these overloads before their definitions appear.
+ KOKKOS_INLINE_FUNCTION
+ int atomic_compare_exchange(int* dest, int compare, const int& val);
+
+ KOKKOS_INLINE_FUNCTION
+ int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val);
+
+ /// Generic atomic_exchange for types with the size of an int.
+ /// Stores the bit pattern of `val` into `*dest` and returns the previous
+ /// contents.
+ template<typename T>
+ KOKKOS_INLINE_FUNCTION
+ T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+   // Union used to view a T as its int-sized bit pattern.
+   union U {
+     int i ;
+     T t ;
+     KOKKOS_INLINE_FUNCTION U() {};
+   } oldval , newval ;
+
+   newval.t = val ;
+   // Use the real exchange intrinsic. A single compare-exchange against a
+   // previously read value does nothing when another thread modifies *dest
+   // in between, so the store could be lost silently.
+   oldval.i = hc::atomic_exchange_int( reinterpret_cast<int*>(dest) , newval.i );
+
+   return oldval.t ;
+ }
+
+ /// Generic atomic_exchange for types with the size of a 64-bit integer.
+ /// Stores the bit pattern of `val` into `*dest` and returns the previous
+ /// contents.
+ template<typename T>
+ KOKKOS_INLINE_FUNCTION
+ T atomic_exchange(T* dest, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
+   // Union used to view a T as its 64-bit pattern.
+   union U {
+     uint64_t i ;
+     T t ;
+     KOKKOS_INLINE_FUNCTION U() {};
+   } oldval , newval ;
+
+   newval.t = val ;
+   // Use the real exchange intrinsic. A single compare-exchange against a
+   // previously read value does nothing when another thread modifies *dest
+   // in between, so the store could be lost silently.
+   oldval.i = hc::atomic_exchange_uint64( reinterpret_cast<uint64_t*>(dest) , newval.i );
+
+   return oldval.t ;
+ }
+
+ /// Fallback for types that are neither int-sized nor 64-bit sized.
+ /// NOTE(review): this does not touch `*dest` at all - it only returns
+ /// `val`. No exchange is performed for such types; confirm whether a
+ /// lock-based implementation is intended here.
+ template<typename T>
+ KOKKOS_INLINE_FUNCTION
+ T atomic_exchange(T* dest, typename std::enable_if<(sizeof(T) != sizeof(int)) && (sizeof(T) != sizeof(int64_t)), const T&>::type val) {
+   return val;
+ }
+
+ // Integer atomic_compare_exchange overloads. Each forwards
+ // (dest, compare, val) to an hc:: compare-exchange intrinsic and returns
+ // that intrinsic's result. The signed 64-bit and long long overloads
+ // reinterpret their operands as uint64_t.
+
+ /// int overload: forwards to hc::atomic_compare_exchange_int.
+ KOKKOS_INLINE_FUNCTION
+ int atomic_compare_exchange(int* dest, int compare, const int& val) {
+ return hc::atomic_compare_exchange_int(dest, compare, val);
+ }
+
+ /// unsigned int overload: forwards to hc::atomic_compare_exchange_unsigned.
+ KOKKOS_INLINE_FUNCTION
+ unsigned int atomic_compare_exchange(unsigned int* dest, unsigned int compare, const unsigned int& val) {
+ return hc::atomic_compare_exchange_unsigned(dest, compare, val);
+ }
+
+ /// int64_t overload: forwards to hc::atomic_compare_exchange_uint64 via casts.
+ KOKKOS_INLINE_FUNCTION
+ int64_t atomic_compare_exchange(int64_t* dest, int64_t compare, const int64_t& val) {
+ return (int64_t) hc::atomic_compare_exchange_uint64((uint64_t*)dest, (uint64_t)compare, (const uint64_t&)val);
+ }
+
+ /// uint64_t overload: forwards to hc::atomic_compare_exchange_uint64.
+ KOKKOS_INLINE_FUNCTION
+ uint64_t atomic_compare_exchange(uint64_t* dest, uint64_t compare, const uint64_t& val) {
+ return hc::atomic_compare_exchange_uint64(dest, compare, val);
+ }
+
+ /// long long overload: forwards to hc::atomic_compare_exchange_uint64 via casts.
+ KOKKOS_INLINE_FUNCTION
+ long long atomic_compare_exchange(long long* dest, long long compare, const long long& val) {
+ return (long long)hc::atomic_compare_exchange_uint64((uint64_t*)(dest), (uint64_t)(compare), (const uint64_t&)(val));
+ }
+
+ /// float overload: compare-exchange on the int bit pattern of the operands
+ /// via hc::atomic_compare_exchange_int; returns the intrinsic's result
+ /// reinterpreted as a float.
+ KOKKOS_INLINE_FUNCTION
+ float atomic_compare_exchange(float* dest, float compare, const float& val) {
+   // Union used to move between the float value and its int bit pattern.
+   union U {
+     int i ;
+     float f ;
+     KOKKOS_INLINE_FUNCTION U() {};
+   } idest,icompare,ival;
+   idest.f = *dest; // plain read; overwritten by the intrinsic's result below
+   icompare.f = compare;
+   ival.f = val;
+   idest.i = hc::atomic_compare_exchange_int(reinterpret_cast<int*>(dest), icompare.i, ival.i);
+   return idest.f;
+ }
+
+ /// double overload: compare-exchange on the 64-bit pattern of the operands
+ /// via hc::atomic_compare_exchange_uint64; returns the intrinsic's result
+ /// reinterpreted as a double.
+ KOKKOS_INLINE_FUNCTION
+ double atomic_compare_exchange(double* dest, double compare, const double& val) {
+   // Union used to move between the double value and its 64-bit pattern.
+   union U {
+     uint64_t i ;
+     double d ;
+     KOKKOS_INLINE_FUNCTION U() {};
+   } idest,icompare,ival;
+   idest.d = *dest; // plain read; overwritten by the intrinsic's result below
+   icompare.d = compare;
+   ival.d = val;
+   idest.i = hc::atomic_compare_exchange_uint64(reinterpret_cast<uint64_t*>(dest), icompare.i, ival.i);
+   return idest.d;
+ }
+
+ /// Generic compare-exchange for types with the size of an int: operates on
+ /// the int bit pattern via hc::atomic_compare_exchange_int and returns the
+ /// intrinsic's result reinterpreted as T.
+ template<typename T>
+ KOKKOS_INLINE_FUNCTION
+ T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) == sizeof(int), const T&>::type val) {
+   // Union used to view a T as its int-sized bit pattern.
+   union U {
+     int i ;
+     T f ;
+     KOKKOS_INLINE_FUNCTION U() {};
+   } idest,icompare,ival;
+   idest.f = *dest; // plain read; overwritten by the intrinsic's result below
+   icompare.f = compare;
+   ival.f = val;
+   idest.i = hc::atomic_compare_exchange_int((int*)(dest), icompare.i, ival.i);
+   return idest.f;
+ }
+
+ /// Generic compare-exchange for types with the size of a 64-bit integer:
+ /// operates on the 64-bit pattern via hc::atomic_compare_exchange_uint64
+ /// and returns the intrinsic's result reinterpreted as T.
+ template<typename T>
+ KOKKOS_INLINE_FUNCTION
+ T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<sizeof(T) != sizeof(int) && sizeof(T) == sizeof(int64_t), const T&>::type val) {
+   // Union used to view a T as its 64-bit pattern.
+   union U {
+     uint64_t i ;
+     T f ;
+     KOKKOS_INLINE_FUNCTION U() {};
+   } idest,icompare,ival;
+   idest.f = *dest; // plain read; overwritten by the intrinsic's result below
+   icompare.f = compare;
+   ival.f = val;
+   idest.i = hc::atomic_compare_exchange_uint64((uint64_t*)(dest), icompare.i, ival.i);
+   return idest.f;
+ }
+
+ /// Fallback for types that are neither 32-bit nor 64-bit sized.
+ /// NOTE(review): this does not touch `*dest` and ignores `compare` - it
+ /// only returns `val`. No compare-exchange is performed for such types;
+ /// confirm whether a lock-based implementation is intended here.
+ template<typename T>
+ KOKKOS_INLINE_FUNCTION
+ T atomic_compare_exchange(volatile T* dest, T compare, typename std::enable_if<(sizeof(T) != sizeof(int32_t)) && (sizeof(T) != sizeof(int64_t)), const T&>::type val) {
+   return val;
+ }
+
+ // atomic_fetch_add overloads for int-sized and 64-bit integers. Each
+ // forwards to hc::atomic_fetch_add and returns its result; volatile is
+ // cast away from `dest`, and the 64-bit overloads reinterpret their
+ // operands as uint64_t.
+
+ /// int overload.
+ KOKKOS_INLINE_FUNCTION
+ int atomic_fetch_add (volatile int * dest, const int& val) {
+ return hc::atomic_fetch_add((int *)dest, val);
+ }
+
+ /// unsigned int overload (note: `dest` is not volatile-qualified here).
+ KOKKOS_INLINE_FUNCTION
+ unsigned int atomic_fetch_add(unsigned int* dest, const unsigned int& val) {
+ return hc::atomic_fetch_add(dest, val);
+ }
+
+ /// unsigned long overload; treats the operand as a uint64_t.
+ KOKKOS_INLINE_FUNCTION
+ unsigned long atomic_fetch_add(volatile unsigned long* dest, const unsigned long& val) {
+ return (unsigned long)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t)val);
+ }
+
+ /// int64_t overload; treats the operand as a uint64_t.
+ KOKKOS_INLINE_FUNCTION
+ int64_t atomic_fetch_add(volatile int64_t* dest, const int64_t& val) {
+ return (int64_t)hc::atomic_fetch_add((uint64_t *)dest, (const uint64_t&)val);
+ }
+
+ KOKKOS_INLINE_FUNCTION
+ char atomic_fetch_add(volatile char * dest, const char& val) {
+   unsigned int oldval,newval,assume;
+   oldval = *(int *)dest ; // whole 32-bit word read through dest. NOTE(review): assumes dest is 4-byte aligned and the char is the word's low byte - confirm
+   // CAS loop: rebuild the word with only its low byte changed; retry when the word moved underneath us.
+   do {
+     assume = oldval ;
+     newval = (assume & 0xffffff00u) | (((assume & 0xffu) + val) & 0xffu) ; // parenthesised: '+' binds tighter than '&'
+     oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+   } while ( assume != oldval );
+   // Return only the byte that was updated, not the neighbouring bytes of the word.
+   return static_cast<char>(oldval & 0xffu) ;
+ }
+
+
+ KOKKOS_INLINE_FUNCTION
+ short atomic_fetch_add(volatile short * dest, const short& val) {
+   unsigned int oldval,newval,assume;
+   oldval = *(int *)dest ; // whole 32-bit word read through dest. NOTE(review): assumes dest is 4-byte aligned and the short is the word's low half - confirm
+   // CAS loop: rebuild the word with only its low 16 bits changed; retry when the word moved underneath us.
+   do {
+     assume = oldval ;
+     newval = (assume & 0xffff0000u) | (((assume & 0xffffu) + val) & 0xffffu) ; // parenthesised: '+' binds tighter than '&'
+     oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+   } while ( assume != oldval );
+   // Return only the half-word that was updated, not the upper half of the word.
+   return static_cast<short>(oldval & 0xffffu) ;
+ }
+
+ KOKKOS_INLINE_FUNCTION // long long overload: performed through a uint64_t view of dest and val
+ long long atomic_fetch_add(volatile long long * dest, const long long& val) {
+   return static_cast<long long>(hc::atomic_fetch_add(reinterpret_cast<uint64_t*>(const_cast<long long *>(dest)), reinterpret_cast<const uint64_t&>(val)));
+ }
+
+
+
+ KOKKOS_INLINE_FUNCTION
+ int atomic_fetch_sub (volatile int * dest, const int& val) {
+   return hc::atomic_fetch_sub(const_cast<int *>(dest), val); // volatile is cast away before the hc call
+ }
+ // unsigned int: same forwarding, again with volatile cast away.
+ KOKKOS_INLINE_FUNCTION
+ unsigned int atomic_fetch_sub(volatile unsigned int* dest, const unsigned int& val) {
+   return hc::atomic_fetch_sub(const_cast<unsigned int *>(dest), val);
+ }
+ // int64_t: expressed as a fetch_add of the negated (modulo 2^64) uint64_t image of val.
+ KOKKOS_INLINE_FUNCTION
+ int64_t atomic_fetch_sub(int64_t* dest, const int64_t& val) {
+   return static_cast<int64_t>(hc::atomic_fetch_add(reinterpret_cast<uint64_t *>(dest), -reinterpret_cast<const uint64_t&>(val)));
+// return (int64_t)hc::atomic_fetch_sub_uint64((uint64_t*)dest, (const uint64_t&)val);
+ }
+
+ KOKKOS_INLINE_FUNCTION
+ char atomic_fetch_sub(volatile char * dest, const char& val) {
+   unsigned int oldval,newval,assume;
+   oldval = *(int *)dest ; // whole 32-bit word read through dest. NOTE(review): assumes dest is 4-byte aligned and the char is the word's low byte - confirm
+   // CAS loop: rebuild the word with only its low byte changed; retry when the word moved underneath us.
+   do {
+     assume = oldval ;
+     newval = (assume & 0xffffff00u) | (((assume & 0xffu) - val) & 0xffu) ; // parenthesised: '-' binds tighter than '&'
+     oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+   } while ( assume != oldval );
+   // Return only the byte that was updated, not the neighbouring bytes of the word.
+   return static_cast<char>(oldval & 0xffu) ;
+ }
+
+ KOKKOS_INLINE_FUNCTION
+ short atomic_fetch_sub(volatile short * dest, const short& val) {
+   unsigned int oldval,newval,assume;
+   oldval = *(int *)dest ; // whole 32-bit word read through dest. NOTE(review): assumes dest is 4-byte aligned and the short is the word's low half - confirm
+   // CAS loop: rebuild the word with only its low 16 bits changed; retry when the word moved underneath us.
+   do {
+     assume = oldval ;
+     newval = (assume & 0xffff0000u) | (((assume & 0xffffu) - val) & 0xffffu) ; // parenthesised: '-' binds tighter than '&'
+     oldval = hc::atomic_compare_exchange_unsigned((unsigned int*)dest, assume,newval);
+   } while ( assume != oldval );
+   // Return only the half-word that was updated, not the upper half of the word.
+   return static_cast<short>(oldval & 0xffffu) ;
+ }
+
+ KOKKOS_INLINE_FUNCTION // long long overload: a fetch_add of the negated (modulo 2^64) uint64_t image of val
+ long long atomic_fetch_sub(volatile long long * dest, const long long& val) {
+   return static_cast<long long>(hc::atomic_fetch_add(reinterpret_cast<uint64_t*>(const_cast<long long *>(dest)), -reinterpret_cast<const uint64_t&>(val)));
+ }
+
+ template // NOTE(review): the template parameter list is absent in this text - presumably <class T>; confirm
+ KOKKOS_INLINE_FUNCTION
+ T atomic_fetch_add(volatile T* dest, typename std::enable_if::type val) { // NOTE(review): enable_if has no arguments here; the unsigned-int-sized union suggests a 4-byte constraint - confirm
+ union U { // overlays a T with an unsigned int so the compare-exchange can work on the integer image
+ unsigned int i ;
+ T t ;
+ KOKKOS_INLINE_FUNCTION U() {};
+ } assume , oldval , newval ;
+ // Start from a plain read of the current value.
+ oldval.t = *dest ;
+ // Retry until the compare-exchange hands back the same image we based the sum on.
+ do {
+ assume.i = oldval.i ;
+ newval.t = assume.t + val ; // the addition itself is done in T
+ oldval.i = atomic_compare_exchange( (unsigned int*)(dest) , assume.i , newval.i );
+ } while ( assume.i != oldval.i );
+ // oldval now holds the image the successful exchange was based on.
+ return oldval.t ;
+ }
+
+ template // NOTE(review): the template parameter list is absent in this text - presumably <class T>; confirm
+ KOKKOS_INLINE_FUNCTION
+ T atomic_fetch_add(volatile T* dest, typename std::enable_if::type val) { // NOTE(review): enable_if has no arguments here; the uint64_t-sized union suggests an 8-byte constraint - confirm
+ union U { // overlays a T with a uint64_t so the compare-exchange can work on the integer image
+ uint64_t i ;
+ T t ;
+ KOKKOS_INLINE_FUNCTION U() {};
+ } assume , oldval , newval ;
+ // Start from a plain read of the current value.
+ oldval.t = *dest ;
+ // Retry until the compare-exchange hands back the same image we based the sum on.
+ do {
+ assume.i = oldval.i ;
+ newval.t = assume.t + val ; // the addition itself is done in T
+ oldval.i = atomic_compare_exchange( (uint64_t*)dest , assume.i , newval.i );
+ } while ( assume.i != oldval.i );
+ // oldval now holds the image the successful exchange was based on.
+ return oldval.t ;
+ }
+
+
+ //WORKAROUND: placeholder overload that performs no addition. NOTE(review): presumably for sizes the two overloads above do not cover - confirm
+ template // NOTE(review): the template parameter list is absent in this text - presumably <class T>; confirm
+ KOKKOS_INLINE_FUNCTION
+ T atomic_fetch_add(volatile T* dest, typename std::enable_if::type val) { // NOTE(review): enable_if has no arguments here, so the selecting condition cannot be read off this text
+ return val ; // *dest is neither read nor modified; the argument is handed straight back
+ }
+
+ template // NOTE(review): the template parameter list is absent in this text - presumably <class T>; confirm
+ KOKKOS_INLINE_FUNCTION
+ T atomic_fetch_sub(volatile T* dest, typename std::enable_if::type & val) { // NOTE(review): enable_if has no arguments here; the int-sized union suggests a 4-byte constraint - confirm
+ union U { // overlays a T with an int so the compare-exchange can work on the integer image
+ int i ;
+ T t ;
+ KOKKOS_INLINE_FUNCTION U() {};
+ } assume , oldval , newval ;
+ // Start from a plain read of the current value.
+ oldval.t = *dest ;
+ // Retry until the compare-exchange hands back the same image we based the difference on.
+ do {
+ assume.i = oldval.i ;
+ newval.t = assume.t - val ; // the subtraction itself is done in T
+ oldval.i = Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
+ } while ( assume.i != oldval.i );
+ // oldval now holds the image the successful exchange was based on.
+ return oldval.t ;
+ }
+
+ template // NOTE(review): the template parameter list is absent in this text - presumably <class T>; confirm
+ KOKKOS_INLINE_FUNCTION
+ T atomic_fetch_sub(volatile T* dest, typename std::enable_if::type val) { // NOTE(review): enable_if has no arguments here; the int64_t-sized union suggests an 8-byte constraint - confirm
+ union U { // overlays a T with an int64_t so the compare-exchange can work on the integer image
+ int64_t i ;
+ T t ;
+ KOKKOS_INLINE_FUNCTION U() {};
+ } assume , oldval , newval ;
+ // Start from a plain read of the current value.
+ oldval.t = *dest ;
+ // Retry until the compare-exchange hands back the same image we based the difference on.
+ do {
+ assume.i = oldval.i ;
+ newval.t = assume.t - val ; // the subtraction itself is done in T
+ oldval.i = atomic_compare_exchange( (int64_t*)dest , assume.i , newval.i );
+ } while ( assume.i != oldval.i );
+ // oldval now holds the image the successful exchange was based on.
+ return oldval.t ;
+ }
+}
+#endif
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp
new file mode 100644
index 0000000000..83b5792a64
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Config.hpp
@@ -0,0 +1,51 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
+#define GUARD_CORE_KOKKOS_ROCM_CONFIG_HPP
+// Defaults KOKKOS_ROCM_HAS_WORKAROUNDS to 1 unless something defined it before this header was read.
+#ifndef KOKKOS_ROCM_HAS_WORKAROUNDS
+#define KOKKOS_ROCM_HAS_WORKAROUNDS 1
+#endif
+
+#endif
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp
new file mode 100644
index 0000000000..e919d35903
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.cpp
@@ -0,0 +1,133 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ROCMEXEC_HPP
+#define KOKKOS_ROCMEXEC_HPP
+
+#include
+#include
+#include
+//#include
+#include
+
+#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF
+#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39
+#define ROCM_CONCURRENCY 20480
+//#define ROCM_CONCURRENCY 81920 # for fiji
+
+namespace Kokkos {
+ static int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1]; // zeroed by init_rocm_atomic_lock_array(); 'static' gives it internal linkage, so only this file sees it
+ static int rocm_space_scratch_locks[ROCM_CONCURRENCY]; // zeroed by init_rocm_scratch_lock_array(), whose call is commented out in init_lock_arrays_rocm_space()
+ static int rocm_space_threadid_locks[ROCM_CONCURRENCY]; // zeroed by init_rocm_threadid_lock_array(), whose call is commented out in init_lock_arrays_rocm_space()
+namespace Impl {
+// TODO: mimic cuda implemtation, add dgpu capability
+
+ void init_rocm_atomic_lock_array() {
+   // Zero every atomic lock slot on the first call only; later calls return at once.
+   static int already_cleared = 0;
+   if(already_cleared) return;
+   int slot = 0;
+   while(slot <= ROCM_SPACE_ATOMIC_MASK)
+     rocm_space_atomic_locks[slot++] = 0;
+   already_cleared = 1;
+ }
+
+ void init_rocm_scratch_lock_array() {
+   // Zero every scratch lock slot on the first call only; later calls return at once.
+   static int already_cleared = 0;
+   if(already_cleared) return;
+   int slot = 0;
+   while(slot < ROCM_CONCURRENCY)
+     rocm_space_scratch_locks[slot++] = 0;
+   already_cleared = 1;
+ }
+
+ void init_rocm_threadid_lock_array() {
+   // Zero every thread-id lock slot on the first call only; later calls return at once.
+   static int already_cleared = 0;
+   if(already_cleared) return;
+   int slot = 0;
+   while(slot < ROCM_CONCURRENCY)
+     rocm_space_threadid_locks[slot++] = 0;
+   already_cleared = 1;
+ }
+
+ void init_lock_arrays_rocm_space() { // entry point for lock-array setup; currently only the atomic lock array is prepared
+ init_rocm_atomic_lock_array();
+// init_rocm_scratch_lock_array();   (disabled: the scratch lock array is not initialised from here)
+// init_rocm_threadid_lock_array();  (disabled: the thread-id lock array is not initialised from here)
+ }
+}
+
+} // namespace Kokkos
+#if 0
+namespace Kokkos {
+namespace Impl {
+KOKKOS_INLINE_FUNCTION
+bool lock_address_rocm_space(void* ptr) {
+#if 0
+return(Kokkos::Impl::lock_address_host_space(ptr));
+#else
+ size_t offset = size_t(ptr);
+ offset = offset >> 2;
+ offset = offset & ROCM_SPACE_ATOMIC_MASK;
+ return (0 == hc::atomic_compare_exchange(&rocm_space_atomic_locks[offset],0,1));
+#endif
+}
+
+KOKKOS_INLINE_FUNCTION
+void unlock_address_rocm_space(void* ptr) {
+#if 0
+Kokkos::Impl::unlock_address_host_space(ptr) ;
+#else
+ size_t offset = size_t(ptr);
+ offset = offset >> 2;
+ offset = offset & ROCM_SPACE_ATOMIC_MASK;
+ hc::atomic_exchange( &rocm_space_atomic_locks[ offset ], 0);
+#endif
+}
+
+}
+} // namespace Kokkos
+#endif
+
+#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp
new file mode 100644
index 0000000000..48a27eb11d
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Exec.hpp
@@ -0,0 +1,137 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ROCMEXEC_HPP
+#define KOKKOS_ROCMEXEC_HPP
+
+#include
+#include
+
+#if defined(__HCC_ACCELERATOR__)
+#define printf(...)
+#endif
+
+namespace Kokkos {
+namespace Impl {
+struct ROCmTraits {
+// TODO: determine if needed. Compile-time constants describing the device layout assumed by the ROCm backend.
+ enum { WavefrontSize = 64 /* 64 */ };
+ enum { WorkgroupSize = 64 /* 64 */ };
+ enum { WavefrontIndexMask = 0x003f /* WavefrontSize - 1: mask for the index within a wavefront */ };
+ enum { WavefrontIndexShift = 6 /* WavefrontSize == 1 << WavefrontIndexShift */ };
+ // NOTE(review): the three shared-memory values below carry CUDA-era comments - confirm they apply to ROCm devices
+ enum { SharedMemoryBanks = 32 /* Compute device 2.0 */ };
+ enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
+ enum { SharedMemoryUsage = 0x04000 /* 16k shared / 48k L1 Cache */ };
+
+ enum { UpperBoundExtentCount = 65535 /* Hard upper bound */ };
+#if 0
+ KOKKOS_INLINE_FUNCTION static
+ ROCmSpace::size_type wavefront_count( ROCmSpace::size_type i )
+ { return ( i + WavefrontIndexMask ) >> WavefrontIndexShift ; }
+
+ KOKKOS_INLINE_FUNCTION static
+ ROCmSpace::size_type wavefront_align( ROCmSpace::size_type i )
+ {
+ enum { Mask = ~ROCmSpace::size_type( WavefrontIndexMask ) };
+ return ( i + WavefrontIndexMask ) & Mask ;
+ }
+#endif
+};
+size_t rocm_internal_cu_count();
+size_t rocm_internal_maximum_workgroup_count();
+
+size_t * rocm_internal_scratch_flags( const size_t size );
+size_t * rocm_internal_scratch_space( const size_t size );
+
+}
+} // namespace Kokkos
+#define ROCM_SPACE_ATOMIC_MASK 0x1FFFF // mask applied to (address >> 2) to pick a lock slot
+#define ROCM_SPACE_ATOMIC_XOR_MASK 0x15A39 // not referenced anywhere in this header
+//int rocm_space_atomic_locks[ROCM_SPACE_ATOMIC_MASK+1];
+extern int // declared at global scope as a pointer; the lock/unlock helpers below index it
+ *rocm_space_atomic_locks; // NOTE(review): Kokkos_ROCm_Exec.cpp has a static array of this name inside namespace Kokkos, which does not define this symbol - confirm where it is defined and allocated
+
+namespace Kokkos {
+namespace Impl {
+ void init_lock_arrays_rocm_space();
+
+ void* rocm_resize_scratch_space(size_t bytes, bool force_shrink = false);
+
+// TODO: determine if needed. Tries to take the address-hashed lock guarding ptr; true means the slot was 0 and now holds 1.
+KOKKOS_INLINE_FUNCTION
+bool lock_address_rocm_space(void* ptr) {
+#if 0
+return(Kokkos::Impl::lock_address_host_space(ptr));
+#else
+  // Slot index: the address with its two low bits dropped, masked by ROCM_SPACE_ATOMIC_MASK.
+  const size_t offset = ( size_t(ptr) >> 2 ) & ROCM_SPACE_ATOMIC_MASK;
+  // Value-returning compare-exchange (the primitive the ROCm atomic wrappers call): 0 -> 1, compared against the word it returns.
+  return (0 == hc::atomic_compare_exchange_int(&rocm_space_atomic_locks[offset],0,1));
+#endif
+}
+KOKKOS_INLINE_FUNCTION
+void unlock_address_rocm_space(void* ptr) {
+#if 0
+Kokkos::Impl::unlock_address_host_space(ptr) ;
+#else
+  // Same slot selection as lock_address_rocm_space: drop the two low address
+  // bits, then keep the bits covered by ROCM_SPACE_ATOMIC_MASK.
+  const size_t slot = ( size_t(ptr) >> 2 ) & ROCM_SPACE_ATOMIC_MASK;
+  hc::atomic_exchange( &rocm_space_atomic_locks[ slot ], 0); // store 0 into the slot
+#endif
+}
+
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Impl {
+//extern
+//KOKKOS_INLINE_FUNCTION
+//void init_lock_arrays_rocm_space();
+
+
+}
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ROCMEXEC_HPP */
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp
new file mode 100644
index 0000000000..1322391d92
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp
@@ -0,0 +1,753 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+/* Kokkos interfaces */
+
+#include
+
+/* only compile this file if ROCM is enabled for Kokkos */
+#ifdef KOKKOS_ENABLE_ROCM
+
+//#include
+#include
+#include
+#include
+
+/*--------------------------------------------------------------------------*/
+/* Standard 'C' libraries */
+#include
+
+/* Standard 'C++' libraries */
+#include
+#include
+#include
+#include
+
+
+
+//KOKKOS_INLINE_FUNCTION
+// Kokkos::Impl::ROCmLockArraysStruct kokkos_impl_rocm_lock_arrays ;
+
+
+/*--------------------------------------------------------------------------*/
+namespace Kokkos {
+namespace Impl {
+
+#if 0
+namespace {
+__global__
+void query_rocm_kernel_arch( int * d_arch )
+{
+#if defined( __HCC_ACCELERATOR__ )
+ *d_arch = OCM_ARCH__ ;
+#else
+ *d_arch = 0 ;
+#endif
+}
+
+/** Query what compute capability is actually launched to the device: */
+int rocm_kernel_arch()
+{
+ int * d_arch = 0 ;
+ rocmMalloc( (void **) & d_arch , sizeof(int) );
+ query_rocm_kernel_arch<<<1,1>>>( d_arch );
+ int arch = 0 ;
+ rocmMemcpy( & arch , d_arch , sizeof(int) , rocmMemcpyDefault );
+ rocmFree( d_arch );
+ return arch ;
+}
+bool rocm_launch_blocking()
+{
+ const char * env = getenv("ROCM_LAUNCH_BLOCKING");
+
+ if (env == 0) return false;
+
+ return atoi(env);
+}
+
+}
+#endif
+
+// True device memory allocation (am_alloc flag 0), not visible from host.
+void * rocm_device_allocate(int size)
+{
+  hc::accelerator default_acc;   // default-constructed accelerator object
+  // The am_alloc result is handed back as-is; it is not checked here.
+  void * const device_mem = hc::am_alloc(size, default_acc, 0);
+  return device_mem;
+}
+
+// Host pinned allocation through hc::am_alloc, requested with flag 2. Flag values:
+//   flag = 1, non-coherent, host resident, but with gpu address space pointer
+//   flag = 2, coherent, host resident, but with host address space pointer
+void * rocm_hostpinned_allocate(int size)
+{
+  hc::accelerator default_acc;   // default-constructed accelerator object
+  // The am_alloc result is handed back as-is; it is not checked here.
+  void * const pinned_mem = hc::am_alloc(size, default_acc, 2);
+  return pinned_mem;
+}
+// Same free used by all rocm memory allocations: releases a pointer obtained from the am_alloc-based allocators above.
+void rocm_device_free(void * ptr)
+{
+ hc::am_free(ptr); // any status returned by am_free is ignored
+}
+
+
+KOKKOS_INLINE_FUNCTION
+void rocm_device_synchronize()
+{
+  // Put a marker on the default accelerator's default view and block until it completes.
+  hc::accelerator_view default_view = hc::accelerator().get_default_view();
+  default_view.create_marker().wait();
+}
+
+void rocm_internal_error_throw( const char * name, const char * file, const int line ) // currently a no-op: the whole body is compiled out, so nothing is thrown
+{
+#if 0
+ std::ostringstream out ;
+ out << name << " error( " << rocmGetErrorName(e) << "): " << rocmGetErrorString(e); // 'e' is not declared in this function; the disabled text would not compile as written
+ if (file) {
+ out << " " << file << ":" << line;
+ }
+ throw_runtime_exception( out.str() );
+#endif
+}
+
+//----------------------------------------------------------------------------
+// Some significant rocm device properties:
+//
+// rocmDeviceProp::name : Text label for device
+// rocmDeviceProp::major : Device major number
+// rocmDeviceProp::minor : Device minor number
+// rocmDeviceProp::workgroupSize : number of threads per workgroup
+// rocmDeviceProp::multiProcessorCount : number of multiprocessors
+// rocmDeviceProp::sharedMemPerBlock : capacity of shared memory per wavefront
+// rocmDeviceProp::totalConstMem : capacity of constant memory
+// rocmDeviceProp::totalGlobalMem : capacity of global memory
+// rocmDeviceProp::maxGridSize[3] : maximum grid size
+
+//
+//
+// the data we have available from a ROCm accelerator
+// std::wstring get_device_path()
+// std::wstring get_description()
+// unsigned int get_version()
+// bool get_has_display()
+// size_t get_dedicated_memory()
+// bool get_supports_double_precision()
+// bool get_supports_limited_double_precision()
+// bool get_is_debug()
+// bool get_supports_cpu_shared_memory()
+// size_t get_max_tile_static_size()
+// unsigned int get_cu_count()
+// bool has_cpu_accessible_am()
+struct rocmDeviceProp { // per-accelerator record filled in by rocmGetDeviceProperties()
+ char name[256]; // multibyte form of the accelerator's get_device_path()
+ char description[256]; // multibyte form of the accelerator's get_description()
+ unsigned int version; // get_version()
+ int device_type; // packed from name[3], name[4], name[5] when the name starts with "gfx"
+ int device_ordinal; // 0, or the digit at name[6] when the name starts with "gfx"
+ int major; // get_version()>>16, or the digit at name[3] for "gfx" names
+ int minor; // get_version()&0xff, or the digit at name[5] for "gfx" names
+ size_t totalGlobalMem; // get_dedicated_memory()
+ size_t sharedMemPerWavefront; // get_max_tile_static_size()
+ int WavefrontSize; // set to the constant 64
+ int WorkgroupSize; // set to the constant 256
+ int MaxTileCount; // set to the constant 409600
+ int maxThreadsPerWorkgroup; // set to the constant 1024
+ int multiProcessorCount; // get_cu_count()
+ int canMapHostMemory; // get_supports_cpu_shared_memory()
+ bool APU; // true when major.minor is 7.0 or 8.1
+};
+
+
+
+void rocmGetDeviceProperties(struct rocmDeviceProp* devProp, int device)
+{
+ // Fills *devProp from the hc::accelerator at index `device` of get_all(); the index is not range-checked here.
+ // Helper: copy a wide string into a fixed char buffer, never writing past it and always NUL-terminating.
+ auto narrow = [](const std::wstring & src, char * dst, size_t cap) {
+   size_t used = 0;
+   for(wchar_t c: src) {
+     if(used + MB_CUR_MAX >= cap) break; // keep room for one more character plus the terminator
+     const int n = std::wctomb(dst + used, c);
+     if(n > 0) used += n; // characters that cannot be converted are skipped
+   }
+   dst[used] = '\0';
+ };
+ hc::accelerator acc;
+ const auto accv = acc.get_all() ;
+ hc::accelerator a = accv[device];
+ narrow(a.get_device_path(), devProp->name, sizeof(devProp->name));
+
+ /* assume a CPU */
+ devProp->version = a.get_version();
+ devProp->major = a.get_version()>>16; // for CPU, these are meaningless
+ devProp->minor = a.get_version()&0xff;
+ devProp->device_ordinal = 0;
+ devProp->device_type = 0; // stays 0 unless the name below identifies a gfx device
+
+ /* is this an AMD graphics card */
+ if((devProp->name[0]=='g') && (devProp->name[1]=='f')
+    && (devProp->name[2]=='x')) {
+   /* for AMD cards, the name has the format gfxMmmO */
+   // each digit character becomes its numeric value by subtracting '0' (0x30)
+   devProp->device_type = ((devProp->name[3]-0x30)<<16)
+                        + ((devProp->name[4]-0x30)<<8)
+                        + (devProp->name[5]-0x30);
+   devProp->device_ordinal = devProp->name[6]-0x30;
+   devProp->major = devProp->name[3]-0x30;
+   devProp->minor = devProp->name[5]-0x30;
+ }
+
+ narrow(a.get_description(), devProp->description, sizeof(devProp->description));
+ devProp->totalGlobalMem = a.get_dedicated_memory();
+ devProp->sharedMemPerWavefront = a.get_max_tile_static_size();
+ devProp->WavefrontSize = 64;
+ devProp->WorkgroupSize = 256; // preferred
+ devProp->MaxTileCount = 409600; // as defined in /opt/rocm/hcc-lc/include/hsa_new.h
+ devProp->maxThreadsPerWorkgroup = 1024;
+ devProp->multiProcessorCount = a.get_cu_count();
+ devProp->canMapHostMemory = a.get_supports_cpu_shared_memory();
+// Kaveri has 64KB L2 per CU, 16KB L1, 64KB Vector Regs/SIMD, or 128 regs/thread
+// GCN has 64KB LDS per CU
+
+//Kaveri APU is 7:0:0
+//Carrizo APU is 8:0:1
+ devProp->APU = ((devProp->major==7)&&(devProp->minor==0)) ||
+                ((devProp->major==8)&&(devProp->minor==1));
+}
+
+namespace {
+
+
+
+class ROCmInternalDevices { // table of property records for the accelerators found at construction
+public:
+ enum { MAXIMUM_DEVICE_COUNT = 64 }; // capacity of m_rocmProp; the constructor aborts when more accelerators are reported
+ struct rocmDeviceProp m_rocmProp[ MAXIMUM_DEVICE_COUNT ] ; // entries [0, m_rocmDevCount) are filled by the constructor
+ int m_rocmDevCount ; // number of accelerators returned by hc::accelerator::get_all()
+ // Enumerates the accelerators and fills m_rocmProp.
+ ROCmInternalDevices();
+ // Returns the single shared instance, created on first use.
+ static const ROCmInternalDevices & singleton();
+};
+
+ROCmInternalDevices::ROCmInternalDevices()
+{
+  // Count every accelerator hc reports, refuse more than the table can hold,
+  // then fill one property record per accelerator.
+  m_rocmDevCount = hc::accelerator().get_all().size();
+
+  if(MAXIMUM_DEVICE_COUNT < m_rocmDevCount) {
+    Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
+  }
+  // Record index and accelerator index are the same number.
+  for ( int dev = 0 ; dev < m_rocmDevCount ; ++dev )
+    rocmGetDeviceProperties( & m_rocmProp[dev] , dev );
+}
+
+const ROCmInternalDevices & ROCmInternalDevices::singleton()
+{
+  // One heap instance for the whole process: built on the first call,
+  // returned by every call, never deleted.
+  static ROCmInternalDevices * const instance = new ROCmInternalDevices();
+
+  return *instance;
+
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+class ROCmInternal {
+private:
+ // Not copyable: there is one instance, reached through singleton().
+ ROCmInternal( const ROCmInternal & ) = delete;
+ ROCmInternal & operator = ( const ROCmInternal & ) = delete;
+
+
+public:
+
+ typedef Kokkos::Experimental::ROCm::size_type size_type ;
+
+ int m_rocmDev ; // selected device index, -1 while none is selected
+ int m_rocmArch ; // -1 by default; the code that would set it in initialize() is commented out
+ unsigned m_multiProcCount ; // multiProcessorCount of the selected device
+ unsigned m_maxWorkgroup ; // set to 5 * multiProcessorCount by initialize()
+ unsigned m_maxSharedWords ; // sharedMemPerWavefront divided by sizeof(size_type)
+ size_type m_scratchSpaceCount ; // scratch space size, counted in ScratchGrain units
+ size_type m_scratchFlagsCount ; // scratch flags size, counted in ScratchGrain units
+ size_type * m_scratchSpace ; // 0 until scratch_space() allocates
+ size_type * m_scratchFlags ; // 0 until scratch_flags() allocates
+
+ static int was_finalized; // set to 1 by finalize(); initialize() aborts once it is set
+
+ static ROCmInternal & singleton();
+ // Returns nonzero when a device is selected; otherwise writes an error naming `label` to std::cerr.
+ int verify_is_initialized( const char * const label ) const ;
+ // Nonzero only when both scratch buffers exist.
+ int is_initialized() const
+ { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+
+ void initialize( int rocm_device_id );
+ void finalize();
+
+ void print_configuration( std::ostream & ) const ;
+
+ // Writes an error to std::cerr if a scratch buffer is still set, then resets all members.
+ ~ROCmInternal();
+ // Starts with no device selected and no scratch buffers.
+ ROCmInternal()
+ : m_rocmDev( -1 )
+ , m_rocmArch( -1 )
+ , m_multiProcCount( 0 )
+ , m_maxWorkgroup( 0 )
+ , m_maxSharedWords( 0 )
+ , m_scratchSpaceCount( 0 )
+ , m_scratchFlagsCount( 0 )
+ , m_scratchSpace( 0 )
+ , m_scratchFlags( 0 )
+ {}
+ // Both return the current buffer, allocating a new one first when `size` bytes exceed the current capacity.
+ size_type * scratch_space( const size_type size );
+ size_type * scratch_flags( const size_type size );
+};
+
+int ROCmInternal::was_finalized = 0;
+//----------------------------------------------------------------------------
+
+
+void ROCmInternal::print_configuration( std::ostream & s ) const
+{
+  // Writes the build macros, then one line per known device, tagging the device this instance selected.
+  const ROCmInternalDevices & devices = ROCmInternalDevices::singleton();
+
+#if defined( KOKKOS_ENABLE_ROCM )
+  s << "macro KOKKOS_ENABLE_ROCM : defined" << std::endl ;
+#endif
+#if defined( __hcc_version__ )
+  s << "macro __hcc_version__ = " << __hcc_version__ << std::endl ;
+#endif
+
+  for ( int dev = 0 ; dev < devices.m_rocmDevCount ; ++dev ) {
+    const rocmDeviceProp & prop = devices.m_rocmProp[dev];
+    s << "Kokkos::Experimental::ROCm[ " << dev << " ] " << prop.name ;
+    s << " version " << (prop.major) << "." << prop.minor ;
+    s << ", Total Global Memory: " << human_memory_size(prop.totalGlobalMem) ;
+    s << ", Shared Memory per Wavefront: " << human_memory_size(prop.sharedMemPerWavefront) ;
+    if ( m_rocmDev == dev ) s << " : Selected" ;
+    s << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+ROCmInternal::~ROCmInternal()
+{
+  // A scratch buffer that is still set at destruction is reported on stderr.
+  const bool still_allocated = ( 0 != m_scratchSpace ) || ( 0 != m_scratchFlags ) ;
+  if ( still_allocated ) {
+    std::cerr << "Kokkos::Experimental::ROCm ERROR: Failed to call Kokkos::Experimental::ROCm::finalize()" << std::endl ;
+    std::cerr.flush();
+  }
+  // Put every member back to the value the constructor gives it.
+  m_scratchFlags = 0 ;
+  m_scratchSpace = 0 ;
+  m_scratchFlagsCount = 0 ;
+  m_scratchSpaceCount = 0 ;
+  m_maxSharedWords = 0 ;
+  m_maxWorkgroup = 0 ;
+  m_multiProcCount = 0 ;
+  m_rocmArch = -1 ;
+  m_rocmDev = -1 ;
+}
+
+int ROCmInternal::verify_is_initialized( const char * const label ) const
+{
+  // 1 when a device has been selected (m_rocmDev >= 0); otherwise complains on stderr and yields 0.
+  const bool have_device = ( m_rocmDev >= 0 ) ;
+  if ( ! have_device ) std::cerr << "Kokkos::Experimental::ROCm::" << label << " : ERROR device not initialized" << std::endl ;
+  return have_device ;
+}
+
+ROCmInternal & ROCmInternal::singleton()
+{
+  // One heap instance for the whole process: built on the first call,
+  // returned by every call, never deleted, so ~ROCmInternal() does not run for it.
+  static ROCmInternal * const instance = new ROCmInternal() ;
+
+  return *instance ;
+
+}
+
+void ROCmInternal::initialize( int rocm_device_id )
+{
+ if ( was_finalized ) Kokkos::abort("Calling ROCm::initialize after ROCm::finalize is illegal\n"); // once finalized, the backend may not be started again
+
+ if ( is_initialized() ) return; // both scratch buffers already exist: nothing to do
+
+ enum { WordSize = sizeof(size_type) };
+
+ if ( ! HostSpace::execution_space::is_initialized() ) {
+ const std::string msg("ROCm::initialize ERROR : HostSpace::execution_space is not initialized");
+ throw_runtime_exception( msg );
+ }
+
+ const ROCmInternalDevices & dev_info = ROCmInternalDevices::singleton();
+
+ const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ; // always true at this point: the early return above already handled the case where both are set
+
+ const bool ok_id = 1 <= rocm_device_id && // index 0 is rejected. NOTE(review): the failure message below prints the range as "[0.." - confirm which is intended
+ rocm_device_id < dev_info.m_rocmDevCount ;
+
+ // Need at least a GPU device
+
+ const bool ok_dev = ok_id &&
+ ( 1 <= dev_info.m_rocmProp[ rocm_device_id ].major &&
+ 0 <= dev_info.m_rocmProp[ rocm_device_id ].minor );
+ if ( ok_init && ok_dev ) {
+
+ const struct rocmDeviceProp & rocmProp =
+ dev_info.m_rocmProp[ rocm_device_id ];
+
+ m_rocmDev = rocm_device_id ;
+
+// rocmSetDevice( m_rocmDev ) );
+ Kokkos::Impl::rocm_device_synchronize();
+
+/*
+ // Query what compute capability architecture a kernel executes:
+ m_rocmArch = rocm_kernel_arch();
+ if ( m_rocmArch != rocmProp.major * 100 + rocmProp.minor * 10 ) {
+ std::cerr << "Kokkos::Experimental::ROCm::initialize WARNING: running kernels compiled for compute capability "
+ << ( m_rocmArch / 100 ) << "." << ( ( m_rocmArch % 100 ) / 10 )
+ << " on device with compute capability "
+ << rocmProp.major << "." << rocmProp.minor
+ << " , this will likely reduce potential performance."
+ << std::endl ;
+ }
+*/
+ // number of multiprocessors
+
+ m_multiProcCount = rocmProp.multiProcessorCount ;
+
+ //----------------------------------
+ // Maximum number of wavefronts,
+ // at most one workgroup per thread in a workgroup for reduction.
+
+
+ m_maxSharedWords = rocmProp.sharedMemPerWavefront/ WordSize ; // bytes converted to size_type words
+
+ //----------------------------------
+ // Maximum number of Workgroups:
+
+ m_maxWorkgroup = 5*rocmProp.multiProcessorCount; //TODO: confirm usage and value
+
+ //----------------------------------
+ // Multiblock reduction uses scratch flags for counters
+ // and scratch space for partial reduction values.
+ // Allocate some initial space. This will grow as needed.
+
+ {
+ const unsigned reduce_block_count = m_maxWorkgroup * Impl::ROCmTraits::WorkgroupSize ;
+
+ (void) scratch_flags( reduce_block_count * 2 * sizeof(size_type) ); // called for the allocation side effect; the returned pointer is not needed here
+ (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) ); // called for the allocation side effect; the returned pointer is not needed here
+ }
+ //----------------------------------
+
+ }
+ else {
+ // Build one message describing every check that failed, then throw it.
+ std::ostringstream msg ;
+ msg << "Kokkos::Experimental::ROCm::initialize(" << rocm_device_id << ") FAILED" ;
+
+ if ( ! ok_init ) {
+ msg << " : Already initialized" ;
+ }
+ if ( ! ok_id ) {
+ msg << " : Device identifier out of range "
+ << "[0.." << (dev_info.m_rocmDevCount-1) << "]" ;
+ }
+ else if ( ! ok_dev ) {
+ msg << " : Device " ;
+ msg << dev_info.m_rocmProp[ rocm_device_id ].major ;
+ msg << "." ;
+ msg << dev_info.m_rocmProp[ rocm_device_id ].minor ;
+ msg << " Need at least a GPU" ;
+ msg << std::endl;
+ }
+ Kokkos::Impl::throw_runtime_exception( msg.str() );
+ }
+
+
+ // Init the array for used for arbitrarily sized atomics
+ Kokkos::Impl::init_lock_arrays_rocm_space();
+
+// Kokkos::Impl::ROCmLockArraysStruct locks;
+// locks.atomic = atomic_lock_array_rocm_space_ptr(false);
+// locks.scratch = scratch_lock_array_rocm_space_ptr(false);
+// locks.threadid = threadid_lock_array_rocm_space_ptr(false);
+// rocmMemcpyToSymbol( kokkos_impl_rocm_lock_arrays , & locks , sizeof(ROCmLockArraysStruct) );
+}
+
+//----------------------------------------------------------------------------
+
+typedef Kokkos::Experimental::ROCm::size_type ScratchGrain[ Impl::ROCmTraits::WorkgroupSize ] ; // one scratch allocation unit: WorkgroupSize elements of size_type
+enum { sizeScratchGrain = sizeof(ScratchGrain) }; // byte size of one ScratchGrain; scratch_space() rounds requested sizes up to a multiple of this
+
+// Host-side byte fill: writes the low byte of `value` into each of the `size`
+// bytes starting at `ptr`.
+//
+// NOTE(review): the loop header/body, the closing of this function and the
+// opening of scratch_flags() were lost from this text (everything between
+// `i` and `Record ;`). They are reconstructed here by analogy with
+// scratch_space() and with the `mptr` local declared below — confirm against
+// the upstream file.
+void rocmMemset( Kokkos::Experimental::ROCm::size_type * ptr , Kokkos::Experimental::ROCm::size_type value , Kokkos::Experimental::ROCm::size_type size)
+{
+  char * mptr = (char * ) ptr;
+#if 0
+  parallel_for_each(hc::extent<1>(size),
+                    [=, &ptr]
+                    (hc::index<1> idx) __HC__
+  {
+    int i = idx[0];
+    ptr[i] = value;
+  }).wait();
+#else
+  // `size` is a byte count (the caller passes count * sizeScratchGrain), so
+  // the fill walks the buffer one char at a time.
+  for (Kokkos::Experimental::ROCm::size_type i = 0; i < size ; i++)
+  {
+    mptr[i] = (char) value;
+  }
+#endif
+}
+
+// Return the internal scratch-flags buffer, growing it so that it holds at
+// least `size` bytes. A freshly allocated buffer is zero-filled; capacity is
+// tracked in units of sizeScratchGrain (m_scratchFlagsCount grains).
+Kokkos::Experimental::ROCm::size_type *
+ROCmInternal::scratch_flags( const Kokkos::Experimental::ROCm::size_type size )
+{
+  if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
+
+    // Round the request up to a whole number of grains.
+    m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+    // Drop the reference on the buffer being replaced so that it is not
+    // leaked every time the flags buffer grows.
+    if ( 0 != m_scratchFlags ) Record::decrement( Record::get_record( m_scratchFlags ) );
+
+    Record * const r = Record::allocate( Kokkos::HostSpace()
+                                       , "InternalScratchFlags"
+                                       , ( sizeScratchGrain * m_scratchFlagsCount ) );
+
+    Record::increment( r );
+
+    m_scratchFlags = reinterpret_cast<Kokkos::Experimental::ROCm::size_type *>( r->data() );
+
+    // Flags are used as counters, so a new buffer starts out zeroed.
+    rocmMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain );
+  }
+
+  return m_scratchFlags ;
+}
+
+// Return the internal scratch buffer, growing it so that it holds at least
+// `size` bytes.
+//
+// Capacity is tracked in units of sizeScratchGrain (m_scratchSpaceCount
+// grains). When the current capacity is too small, a new HostSpace record is
+// allocated with the size rounded up to a whole number of grains; previous
+// contents are not copied. Returns the (possibly unchanged) buffer pointer.
+Kokkos::Experimental::ROCm::size_type *
+ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size )
+{
+  if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
+
+    // Round the request up to a whole number of grains.
+    m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
+
+    // Drop the reference on the buffer being replaced so that it is not
+    // leaked every time the scratch space grows.
+    if ( 0 != m_scratchSpace ) Record::decrement( Record::get_record( m_scratchSpace ) );
+
+    Record * const r = Record::allocate( Kokkos::HostSpace()
+                                       , "InternalScratchSpace"
+                                       , ( sizeScratchGrain * m_scratchSpaceCount ) );
+
+    Record::increment( r );
+
+    // The cast target was missing from the original text; the member is
+    // returned as size_type*, so that is the type restored here.
+    m_scratchSpace = reinterpret_cast<Kokkos::Experimental::ROCm::size_type *>( r->data() );
+  }
+
+  return m_scratchSpace ;
+}
+
+//----------------------------------------------------------------------------
+
+// Release the internal scratch buffers (if any were allocated) and reset the
+// cached device state to its "not initialized" values. Always records that
+// finalize was called via was_finalized.
+void ROCmInternal::finalize()
+{
+  was_finalized = 1;
+  if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
+
+//    atomic_lock_array_rocm_space_ptr(false);
+//    scratch_lock_array_rocm_space_ptr(false);
+//    threadid_lock_array_rocm_space_ptr(false);
+
+    // Both scratch buffers are allocated from HostSpace records in
+    // scratch_flags() / scratch_space().
+    typedef Kokkos::Experimental::Impl::SharedAllocationRecord< HostSpace > RecordROCm ;
+
+    // The enclosing test is an OR, so guard each buffer separately instead
+    // of looking up a record for a null pointer.
+    if ( 0 != m_scratchFlags ) RecordROCm::decrement( RecordROCm::get_record( m_scratchFlags ) );
+    if ( 0 != m_scratchSpace ) RecordROCm::decrement( RecordROCm::get_record( m_scratchSpace ) );
+
+    m_rocmDev = -1 ;
+    m_multiProcCount = 0 ;
+    m_maxWorkgroup = 0 ;
+    m_maxSharedWords = 0 ;
+    m_scratchSpaceCount = 0 ;
+    m_scratchFlagsCount = 0 ;
+    m_scratchSpace = 0 ;
+    m_scratchFlags = 0 ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+Kokkos::Experimental::ROCm::size_type rocm_internal_cu_count() // free-function accessor: multiprocessor count cached by the ROCmInternal singleton
+{ return ROCmInternal::singleton().m_multiProcCount ; }
+
+Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_extent_size() // returns the singleton's m_maxWorkgroup (set to 5 * multiProcessorCount during initialize)
+{ return ROCmInternal::singleton().m_maxWorkgroup ; }
+
+Kokkos::Experimental::ROCm::size_type rocm_internal_maximum_shared_words() // returns the singleton's m_maxSharedWords (sharedMemPerWavefront / WordSize)
+{ return ROCmInternal::singleton().m_maxSharedWords ; }
+
+Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_space( const Kokkos::Experimental::ROCm::size_type size ) // forwards to ROCmInternal::scratch_space(size); `size` is in bytes
+{ return ROCmInternal::singleton().scratch_space( size ); }
+
+Kokkos::Experimental::ROCm::size_type * rocm_internal_scratch_flags( const Kokkos::Experimental::ROCm::size_type size ) // forwards to ROCmInternal::scratch_flags(size); `size` is in bytes
+{ return ROCmInternal::singleton().scratch_flags( size ); }
+
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+//ROCm::size_type ROCm::detect_device_count()
+//{ return Impl::ROCmInternalDevices::singleton().m_rocmDevCount ; }
+
+int ROCm::concurrency() { // hard-coded concurrency figure, chosen at compile time by KOKKOS_ARCH_KAVERI
+#if defined(KOKKOS_ARCH_KAVERI)
+ return 8*64*40; // 20480 kaveri
+#else
+ return 32*8*40; // NOTE(review): evaluates to 10240, but this comment used to read "81920 fiji and hawaii" (81920 == 32*64*40) — confirm which value is intended
+#endif
+}
+int ROCm::is_initialized() // forwards to the ROCmInternal singleton's is_initialized()
+{ return Kokkos::Impl::ROCmInternal::singleton().is_initialized(); }
+
+void ROCm::initialize( const ROCm::SelectDevice config ) // initialize the backend singleton on the device named by config.rocm_device_id
+{
+ Kokkos::Impl::ROCmInternal::singleton().initialize( config.rocm_device_id );
+
+ #if defined(KOKKOS_ENABLE_PROFILING)
+ Kokkos::Profiling::initialize(); // called after the singleton initialize above, only in profiling builds
+ #endif
+}
+
+#if 0
+std::vector
+ROCm::detect_device_arch()
+{
+ const Impl::ROCmInternalDevices & s = Impl::ROCmInternalDevices::singleton();
+
+ std::vector output( s.m_rocmDevCount );
+
+ for ( int i = 0 ; i < s.m_rocmDevCount ; ++i ) {
+ output[i] = s.m_rocmProp[i].major * 100 + s.m_rocmProp[i].minor ;
+ }
+
+ return output ;
+}
+
+ROCm::size_type ROCm::device_arch()
+{
+ return 1 ;
+}
+#endif
+
+void ROCm::finalize() // finalize the backend singleton, then the profiling layer when it is enabled
+{
+ Kokkos::Impl::ROCmInternal::singleton().finalize();
+
+ #if defined(KOKKOS_ENABLE_PROFILING)
+ Kokkos::Profiling::finalize(); // called after the singleton finalize above, only in profiling builds
+ #endif
+}
+
+ROCm::ROCm()
+ : m_device( Kokkos::Impl::ROCmInternal::singleton().m_rocmDev ) // copies the device id currently held by the singleton
+{
+ Kokkos::Impl::ROCmInternal::singleton().verify_is_initialized( "ROCm instance constructor" ); // return value is not used here
+}
+
+bool ROCm::isAPU(int device) { // reads the APU flag from the cached properties of `device`; the index is not range-checked here
+ const Kokkos::Impl::ROCmInternalDevices & dev_info =
+ Kokkos::Impl::ROCmInternalDevices::singleton();
+ return (dev_info.m_rocmProp[device].APU);
+}
+
+bool ROCm::isAPU() { // same query for the device returned by rocm_device()
+ return ROCm::isAPU(rocm_device());
+}
+
+//ROCm::ROCm( const int instance_id )
+// : m_device( Impl::ROCmInternal::singleton().m_rocmDev )
+//{}
+
+void ROCm::print_configuration( std::ostream & s , const bool ) // the unnamed bool argument is ignored; output is produced by the singleton
+{ Kokkos::Impl::ROCmInternal::singleton().print_configuration( s ); }
+
+bool ROCm::sleep() { return false ; } // always returns false
+
+bool ROCm::wake() { return true ; } // always returns true
+
+void ROCm::fence() // delegates to Kokkos::Impl::rocm_device_synchronize()
+{
+ Kokkos::Impl::rocm_device_synchronize();
+}
+
+const char* ROCm::name() { return "ROCm"; } // label of this execution space
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif // KOKKOS_ENABLE_ROCM
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp
new file mode 100644
index 0000000000..481e7df3a4
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Invoke.hpp
@@ -0,0 +1,138 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include
+#include
+
+#if !defined( KOKKOS_ROCM_INVOKE_H )
+#define KOKKOS_ROCM_INVOKE_H
+
+namespace Kokkos {
+namespace Impl {
+
+// Tagged invocation: calls f(Tag(), xs...). Selected when Tag is not void.
+// Tag cannot be deduced and must be given explicitly: rocm_invoke<Tag>(f, ...).
+// The template parameter list and the forwarding cast target were stripped
+// from the original text and are restored here.
+template<class Tag, class F, class... Ts, typename std::enable_if<(!std::is_void<Tag>()), int>::type = 0>
+KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
+{
+  f(Tag(), static_cast<Ts&&>(xs)...);
+}
+
+// Untagged invocation: calls f(xs...). Selected when Tag is void.
+template<class Tag, class F, class... Ts, typename std::enable_if<(std::is_void<Tag>()), int>::type = 0>
+KOKKOS_INLINE_FUNCTION void rocm_invoke(F&& f, Ts&&... xs)
+{
+  f(static_cast<Ts&&>(xs)...);
+}
+
+
+// Callable wrapper that stores a pointer to a functor and forwards every call
+// through rocm_invoke<Tag>, so a dispatch tag is prepended when Tag is not
+// void. The wrapper does not own the functor.
+// NOTE(review): the template parameter lists were stripped from the original
+// text; the order <F, Tag> with Tag defaulting to void is a reconstruction —
+// confirm against the upstream header.
+template<class F, class Tag=void>
+struct rocm_invoke_fn
+{
+    F* f;
+    rocm_invoke_fn(F& f_) : f(&f_)
+    {}
+
+    template<class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
+    {
+        rocm_invoke<Tag>(*f, static_cast<Ts&&>(xs)...);
+    }
+};
+
+// Build a rocm_invoke_fn for `f`; Tag is given explicitly, F is deduced.
+template<class Tag, class F>
+KOKKOS_INLINE_FUNCTION rocm_invoke_fn<F, Tag> make_rocm_invoke_fn(F& f)
+{
+    return {f};
+}
+
+// Identity overload: a plain lvalue is returned unchanged.
+template<class T>
+KOKKOS_INLINE_FUNCTION T& rocm_unwrap(T& x)
+{
+    return x;
+}
+
+// std::reference_wrapper overload: returns the wrapped reference (via the
+// wrapper's implicit conversion to T&).
+template<class T>
+KOKKOS_INLINE_FUNCTION T& rocm_unwrap(std::reference_wrapper<T> x)
+{
+    return x;
+}
+
+// Callable that binds one value in front of a callable's argument list:
+// calling it with (xs...) invokes f(rocm_unwrap(data), xs...).
+// Both `f` and `data` are stored by value.
+template<class F, class T>
+struct rocm_capture_fn
+{
+    F f;
+    T data;
+
+    KOKKOS_INLINE_FUNCTION rocm_capture_fn(F f_, T x)
+    : f(f_), data(x)
+    {}
+
+    template<class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(Ts&&... xs) const
+    {
+        f(rocm_unwrap(data), static_cast<Ts&&>(xs)...);
+    }
+};
+
+// Bind a single leading argument `x` to `f`.
+template<class F, class T>
+KOKKOS_INLINE_FUNCTION rocm_capture_fn<F, T> rocm_capture(F f, T x)
+{
+    return {f, x};
+}
+
+// Bind several leading arguments: captures are nested left to right, so the
+// resulting callable invokes f(x, y, xs..., <call arguments>...).
+template<class F, class T, class U, class... Ts>
+KOKKOS_INLINE_FUNCTION auto rocm_capture(F f, T x, U y, Ts... xs) -> decltype(rocm_capture(rocm_capture(f, x), y, xs...))
+{
+    return rocm_capture(rocm_capture(f, x), y, xs...);
+}
+
+// Function object that applies its first argument to the remaining ones:
+// rocm_apply_op{}(f, xs...) calls f(xs...), forwarding each argument.
+struct rocm_apply_op
+{
+    template<class F, class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(F&& f, Ts&&... xs) const
+    {
+        f(static_cast<Ts&&>(xs)...);
+    }
+};
+
+}}
+
+#endif
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp
new file mode 100644
index 0000000000..d3d150703e
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Join.hpp
@@ -0,0 +1,72 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#if !defined( KOKKOS_ROCM_JOIN_H )
+#define KOKKOS_ROCM_JOIN_H
+
+namespace Kokkos {
+namespace Impl {
+
+
+// Adaptor to use ValueJoin with standard algorithms.
+// Wraps Joiner::join(functor, &accumulator, &value) as a binary operator that
+// takes the accumulator by value and returns the joined result. Holds a
+// non-owning pointer to the functor.
+// NOTE(review): the template parameter lists were stripped from the original
+// text; <Joiner, F> is a reconstruction — confirm against the upstream header.
+template<class Joiner, class F>
+struct join_operator
+{
+    const F* fp;
+    template<class T, class U>
+    T operator()(T x, const U& y) const
+    {
+        Joiner::join(*fp, &x, &y);
+        return x;
+    }
+};
+
+// Build a join_operator for functor `f`; Joiner is given explicitly, F is deduced.
+template<class Joiner, class F>
+join_operator<Joiner, F> make_join_operator(const F& f)
+{
+    return join_operator<Joiner, F>{&f};
+}
+
+}}
+
+#endif
diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp
new file mode 100644
index 0000000000..14ab52a1c2
--- /dev/null
+++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp
@@ -0,0 +1,1265 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+// Kokkos v. 2.0
+// Copyright (2014) Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include
+#include
+#include
+#include
+#include
+
+
+namespace Kokkos {
+namespace Impl {
+
+struct ROCmTeamMember ;
+
+template< class ... Properties >
+class TeamPolicyInternal< Kokkos::Experimental::ROCm, Properties ... >: public PolicyTraits {
+private:
+ int m_league_size ;
+ int m_team_size ;
+ int m_vector_length ;
+ int m_team_scratch_size[2] ;
+ int m_thread_scratch_size[2] ;
+ int m_chunk_size ;
+
+
+public:
+
+ using execution_policy = TeamPolicyInternal ;
+ using execution_space = Kokkos::Experimental::ROCm ;
+ typedef PolicyTraits traits;
+
+ TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
+ m_league_size = p.m_league_size;
+ m_team_size = p.m_team_size;
+ m_vector_length = p.m_vector_length;
+ m_team_scratch_size[0] = p.m_team_scratch_size[0];
+ m_team_scratch_size[1] = p.m_team_scratch_size[1];
+ m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+ m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
+ m_chunk_size = p.m_chunk_size;
+ return *this;
+ }
+
+ TeamPolicyInternal()
+ : m_league_size( 0 )
+ , m_team_size( 0 )
+ , m_vector_length( 0 )
+ , m_team_scratch_size {0,0}
+ , m_thread_scratch_size {0,0}
+ , m_chunk_size ( 64 )
+ {}
+
+  /** \brief Construct with explicit league and team sizes; vector length is 1. */
+  // NOTE(review): a two-int call is also viable for the three-argument
+  // constructor below (its last parameter defaults to 1), so such a call
+  // looks ambiguous — confirm whether this overload is reachable.
+  TeamPolicyInternal( const int arg_league_size
+                    , const int arg_team_size )
+    : m_league_size( arg_league_size )
+    , m_team_size( arg_team_size )
+    , m_vector_length( 1 )  // was left uninitialized; 1 matches the default of vector_length_request
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+    {}
+
+ TeamPolicyInternal( const int arg_league_size
+ , const int arg_team_size
+ , const int vector_length_request=1)
+ : m_league_size( arg_league_size ),
+ m_team_size( arg_team_size ),
+ m_vector_length (vector_length_request)
+ , m_team_scratch_size {0,0}
+ , m_thread_scratch_size {0,0}
+ , m_chunk_size ( 64 )
+ {}
+
+  /** \brief Construct with an explicit league size and automatic team size
+   *  (stored as -1 and resolved in team_size()); vector length is 1. */
+  TeamPolicyInternal( const int arg_league_size
+                    , const Kokkos::AUTO_t )
+    : m_league_size( arg_league_size )
+    , m_team_size( -1 )
+    , m_vector_length( 1 )  // was left uninitialized; 1 matches the default of vector_length_request
+    , m_team_scratch_size {0,0}
+    , m_thread_scratch_size {0,0}
+    , m_chunk_size ( 64 )
+    {}
+
+ TeamPolicyInternal( const int arg_league_size
+ , const Kokkos::AUTO_t
+ , const int vector_length_request)
+ : m_league_size( arg_league_size ),
+ m_team_size( -1 ),
+ m_vector_length (vector_length_request)
+ , m_team_scratch_size {0,0}
+ , m_thread_scratch_size {0,0}
+ , m_chunk_size ( 64 )
+ {}
+
+ inline int chunk_size() const { return m_chunk_size ; }
+
+ /** \brief set chunk_size to a discrete value*/
+ KOKKOS_INLINE_FUNCTION TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
+ TeamPolicyInternal p = *this;
+ p.m_chunk_size = chunk_size_;
+ return p;
+ }
+
+ /** \brief set per team scratch size for a specific level of the scratch hierarchy */
+ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
+ TeamPolicyInternal p = *this;
+ p.m_team_scratch_size[level] = per_team.value;
+ return p;
+ };
+
+ /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
+ inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
+ TeamPolicyInternal p = *this;
+ p.m_thread_scratch_size[level] = per_thread.value;
+ return p;
+ };
+
+ /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
+ inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
+ TeamPolicyInternal p = *this;
+ p.m_team_scratch_size[level] = per_team.value;
+ p.m_thread_scratch_size[level] = per_thread.value;
+ return p;
+ };
+
+// TODO: evaluate proper team_size_max requirements
+ template< class Functor_Type>
+ KOKKOS_INLINE_FUNCTION static
+ int team_size_max( const Functor_Type & functor)
+ {
+ typedef typename Kokkos::Impl::FunctorValueTraits::value_type value_type;
+ return team_size_recommended(functor);
+ // return std::min(Kokkos::Impl::get_max_tile_size() / sizeof(value_type), Kokkos::Impl::get_max_tile_thread());
+ }
+
+ template< class Functor_Type>
+ KOKKOS_INLINE_FUNCTION static int team_size_recommended(const Functor_Type & functor)
+ { return Kokkos::Impl::get_tile_size::value_type>(); }
+
+ template< class Functor_Type >
+ KOKKOS_INLINE_FUNCTION static int team_size_recommended(const Functor_Type &functor, const int vector_length)
+ {
+ int max = team_size_recommended( functor )/vector_length;
+ if(max < 1) max = 1;
+ return(max);
+ }
+
+  /** \brief Team size for functor f: the requested size when positive, otherwise team_size_recommended(f). */
+  template<class F>
+  KOKKOS_INLINE_FUNCTION int team_size(const F& f) const { return (m_team_size > 0) ? m_team_size : team_size_recommended(f); }
+  /** \brief Team size without a functor: the requested size when positive, otherwise Impl::get_max_tile_thread(). */
+  KOKKOS_INLINE_FUNCTION int team_size() const { return (m_team_size > 0) ? m_team_size : Impl::get_max_tile_thread(); }
+ KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+
+
+ inline int vector_length() const { return m_vector_length ; }
+ inline int scratch_size(int level, int team_size_ = -1) const {
+ if(team_size_<0) team_size_ = m_team_size;
+ return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level];
+ }
+ inline size_t team_scratch_size(int level) const {
+ return m_team_scratch_size[level];
+ }
+ inline size_t thread_scratch_size(int level) const {
+ return m_thread_scratch_size[level];
+ }
+
+ typedef Impl::ROCmTeamMember member_type;
+};
+
+ struct ROCmTeamMember {
+ typedef Kokkos::Experimental::ROCm execution_space ;
+ typedef Kokkos::ScratchMemorySpace scratch_memory_space ;
+
+ KOKKOS_INLINE_FUNCTION
+ const scratch_memory_space & team_shmem() const
+ { return m_team_shared.set_team_thread_mode(0,1,0); }
+ KOKKOS_INLINE_FUNCTION
+ const execution_space::scratch_memory_space & team_scratch(const int& level) const
+ { return m_team_shared.set_team_thread_mode(level,1,0) ; }
+ KOKKOS_INLINE_FUNCTION
+ const execution_space::scratch_memory_space & thread_scratch(const int& level) const
+ { return m_team_shared.set_team_thread_mode(level,
+ team_size(),
+ team_rank()) ; }
+
+
+ /* Rank of this team within the league of teams */
+ KOKKOS_INLINE_FUNCTION int league_rank() const { return m_idx.tile[0]; }
+ /* Number of teams in the league */
+ KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
+ /* Rank of this thread within this team */
+ KOKKOS_INLINE_FUNCTION int team_rank() const { return m_idx.local[0] / m_vector_length; }
+ /* Rank of this vector lane within this thread */
+ KOKKOS_INLINE_FUNCTION int vector_rank() const { return m_idx.local[0] % m_vector_length; }
+ KOKKOS_INLINE_FUNCTION int lindex() const { return m_idx.local[0]; }
+ KOKKOS_INLINE_FUNCTION int gindex() const { return m_idx.global[0]; }
+ KOKKOS_INLINE_FUNCTION int tindex() const { return m_idx.tile[0]; }
+ KOKKOS_INLINE_FUNCTION int tile_dim() const { return m_idx.tile_dim[0]; }
+ KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
+ KOKKOS_INLINE_FUNCTION int vector_length() const { return m_vector_length; }
+
+
+ KOKKOS_INLINE_FUNCTION
+ ROCmTeamMember( const hc::tiled_index< 1 > & arg_idx, int league_size_,int team_size_ )
+ : m_league_size( league_size_ )
+ , m_team_size( team_size_ )
+ , m_team_shared( nullptr, 0 )
+ , m_vector_length( 1 )
+ , m_idx( arg_idx )
+ {}
+
+ KOKKOS_INLINE_FUNCTION
+ ROCmTeamMember( const hc::tiled_index< 1 > & arg_idx, int league_size_,int team_size_, char * shared, std::size_t shsize, std::size_t scratch_size0, char * scratch_ptr, std::size_t scratch_size1, std::size_t vector_length)
+ : m_league_size( league_size_ )
+ , m_team_size( team_size_ )
+ , m_team_shared( shared +
+ arg_idx.tile[0]*(shsize+scratch_size0),
+ (shsize+scratch_size0)*league_size_,
+ scratch_ptr + arg_idx.tile[0]*scratch_size1,
+ scratch_size1*league_size_)
+ , m_vector_length( vector_length )
+ , m_idx( arg_idx )
+ {}
+
+ KOKKOS_INLINE_FUNCTION
+ void team_barrier() const {
+ m_idx.barrier.wait();
+ }
+
+  /* Broadcast `value` from the thread whose team_rank() equals `thread_id`
+   * to the whole team: on return every calling thread's `value` holds the
+   * source thread's value. Must be called by all threads of the team, since
+   * it contains a team barrier.
+   *
+   * `value` is taken by non-const reference because the result is written
+   * back into it; the previous `const ValueType&` signature could not be
+   * assigned through. */
+  // NOTE(review): every thread runs zero_init(local_value) with no barrier
+  // before the source thread stores into it — confirm a late zero_init
+  // cannot overwrite the broadcast value.
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id ) const
+  {
+    static_assert(std::is_trivially_default_constructible<ValueType>(), "Only trivial constructible types can be broadcasted");
+    tile_static ValueType local_value;
+    zero_init(local_value);
+    if (this->team_rank() == thread_id) {
+      local_value = value;
+    }
+    this->team_barrier();
+    value = local_value;
+  }
+// Reduce across a team of threads.
+//
+// Each thread has vector_length elements.
+// This reduction is for TeamThreadRange operations, where the range
+// is spread across threads. Effectively, there are vector_length
+// independent reduction operations.
+// This is different from a reduction across the elements of a thread,
+// which reduces every vector element.
+
+ template< class ValueType, class JoinOp >
+ KOKKOS_INLINE_FUNCTION
+ ValueType team_reduce( const ValueType & value , const JoinOp & op_in) const
+ {
+ typedef JoinLambdaAdapter JoinOpFunctor ;
+ const JoinOpFunctor op(op_in);
+
+ tile_static ValueType buffer[512];
+ const auto local = lindex();
+ const auto team = team_rank();
+ auto vector_rank = local%m_vector_length;
+ auto thread_base = team*m_vector_length;
+
+ const std::size_t size = next_pow_2(m_team_size+1)/2;
+#if defined(ROCM15)
+ buffer[local] = value;
+#else
+ // ROCM 1.5 handles address spaces better, previous version didn't
+ lds_for(buffer[local], [&](ValueType& x)
+ {
+ x = value;
+ });
+#endif
+ m_idx.barrier.wait();
+
+ for(std::size_t s = 1; s < size; s *= 2)
+ {
+ const std::size_t index = 2 * s * team;
+ if (index < size)
+ {
+#if defined(ROCM15)
+ op.join(buffer[vector_rank+index*m_vector_length],
+ buffer[vector_rank+(index+s)*m_vector_length]);
+#else
+ lds_for(buffer[vector_rank+index*m_vector_length], [&](ValueType& x)
+ {
+ lds_for(buffer[vector_rank+(index+s)*m_vector_length],
+ [&](ValueType& y)
+ {
+ op.join(x, y);
+ });
+ });
+#endif
+ }
+ m_idx.barrier.wait();
+ }
+
+ if (local == 0)
+ {
+ for(int i=size*m_vector_length; i
+ KOKKOS_INLINE_FUNCTION
+ ValueType thread_reduce( const ValueType & value , const JoinOp & op_in) const
+ {
+ typedef JoinLambdaAdapter JoinOpFunctor ;
+ const JoinOpFunctor op(op_in);
+
+ const auto local = m_idx.local[0];
+ tile_static ValueType buffer[512];
+ const std::size_t size = m_vector_length; //vector length must be power of 2
+ auto vector_rank = local%m_vector_length;
+ auto thread_base = team_rank()*m_vector_length;
+ lds_for(buffer[local], [&](ValueType& x)
+ {
+ x = value;
+ });
+ m_idx.barrier.wait();
+ for(std::size_t s = 1; s < size; s *= 2)
+ {
+ const std::size_t index = 2 * s * vector_rank;
+ if (index < size)
+ {
+#if defined(ROCM15)
+ op.join(buffer[thread_base+index], buffer[thread_base+index+s]);
+#else
+
+ lds_for(buffer[thread_base+index], [&](ValueType& x)
+ {
+ lds_for(buffer[thread_base+index+s], [&](ValueType& y)
+ {
+ op.join(x, y);
+ });
+ });
+#endif
+ }
+ m_idx.barrier.wait();
+ }
+
+ m_idx.barrier.wait();
+ return buffer[thread_base];
+ }
+
+ /** \brief Intra-team exclusive prefix sum with team_rank() ordering
+ * with intra-team non-deterministic ordering accumulation.
+ *
+ * The global inter-team accumulation value will, at the end of the
+ * league's parallel execution, be the scan's total.
+ * Parallel execution ordering of the league's teams is non-deterministic.
+ * As such the base value for each team's scan operation is similarly
+ * non-deterministic.
+ */
+ template< typename Type >
+ KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum = nullptr ) const
+ {
+ #if 0 // disabled earlier implementation, kept for reference
+ const auto local = m_idx.local[0];
+ const auto last = m_team_size - 1;
+ const auto init = 0;
+ tile_static Type buffer[256];
+
+ if (local == last) buffer[0] = init;
+ else buffer[local] = value;
+
+ m_idx.barrier.wait();
+
+ for(std::size_t s = 1; s < m_team_size; s *= 2)
+ {
+ if (local >= s) buffer[local] += buffer[local - s];
+ m_idx.barrier.wait();
+ }
+
+ if ( global_accum )
+ {
+ if(local == last)
+ {
+ atomic_fetch_add(global_accum, buffer[local] + value);
+ }
+ m_idx.barrier.wait();
+ buffer[local] += *global_accum;
+ }
+ m_idx.barrier.wait();
+ return buffer[local];
+#else // active implementation: scan over two alternating tile_static rows
+ tile_static Type sarray[2][256+1]; // 257 slots per row: slot 0 is the zero seed, slot lid+1 holds thread lid's value; presumably requires m_team_size <= 256 — TODO confirm
+ int lid = m_idx.local[0];
+ int lp1 = lid+1;
+
+ int toggle = 1; // row read in the current pass
+ int _toggle = 0; // row written in the current pass
+ m_idx.barrier.wait();
+
+ if(lid == 0)
+ {
+ sarray[1][0] = 0;
+ sarray[0][0] = 0;
+ }
+ sarray[1][lp1] = value;
+
+ m_idx.barrier.wait();
+ for(int stride = 1; stride < m_team_size; stride*=2) // strides double each pass
+ {
+ if(lid >= stride)
+ {
+ sarray[_toggle][lp1] =
+ sarray[toggle][lp1]+sarray[toggle][lp1-stride];
+ }
+ else
+ {
+ sarray[_toggle][lp1] = sarray[toggle][lp1]; // nothing to add at this stride: carry the value over to the other row
+ }
+ toggle = _toggle; // swap the read and write rows
+ _toggle = 1-toggle;
+ m_idx.barrier.wait();
+ }
+
+ if ( global_accum )
+ {
+ if(m_team_size == lp1) // only the last thread of the team updates the global accumulator
+ {
+ sarray[toggle][m_team_size] = atomic_fetch_add(global_accum,sarray[toggle][m_team_size]); // adds the team total and stores the value returned by atomic_fetch_add
+ }
+ m_idx.barrier.wait();
+ sarray[toggle][lid] += sarray[toggle][m_team_size]; // offset this thread's result by the stored base
+ }
+ m_idx.barrier.wait();
+ return sarray[toggle][lid]; // slot lid (not lid+1): the sum of the values of lower-ranked threads
+#endif
+ }
+
+ private:
+ int m_league_size ;
+ int m_team_size ;
+ const scratch_memory_space m_team_shared;
+
+ public:
+ int m_vector_length;
+ hc::tiled_index<1> m_idx;
+ };
+}
+} // namespace Kokkos
+#include
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class... Traits >
+class ParallelFor< FunctorType
+ , Kokkos::RangePolicy< Traits... >, Kokkos::Experimental::ROCm >
+{
+private:
+
+ typedef Kokkos::RangePolicy< Traits... > Policy ;
+
+public:
+
+ inline
+ ParallelFor( const FunctorType & f
+ , const Policy & policy )
+ {
+
+
+ const auto len = policy.end()-policy.begin();
+ const auto offset = policy.begin();
+ if(len == 0) return;
+// define a lambda to work around a compiler issue. The compiler does not
+// properly dereference f inside the pfe.
+auto foo = [=](size_t i){rocm_invoke(f, i);};
+
+#if __hcc_workweek__ > 16600
+ hc::parallel_for_each(hc::extent<1>(len) , [=](const hc::index<1> & idx) [[hc]] [[hc_max_workgroup_dim(1024,1,1)]]
+#else
+ hc::parallel_for_each(hc::extent<1>(len).tile(256) , [=](const hc::index<1> & idx) [[hc]]
+#endif
+ {
+ if(idx[0]
+class ParallelFor< F
+ , Kokkos::TeamPolicy< Traits... >
+ , Kokkos::Experimental::ROCm >
+{
+ using Policy = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::ROCm, Traits... >;
+ typedef Kokkos::Impl::FunctorValueTraits ValueTraits;
+
+public:
+ inline
+ ParallelFor( const F & f
+ , const Policy & policy )
+ {
+ const auto league_size = policy.league_size();
+ const auto team_size = policy.team_size();
+ const int vector_length = policy.vector_length();
+ const auto total_size = league_size * team_size * vector_length;
+ const int scratch_size0 = policy.scratch_size(0,team_size);
+ const int scratch_size1 = policy.scratch_size(1,team_size);
+
+ if(total_size == 0) return;
+
+ const auto shared_size = FunctorTeamShmemSize< F >::value( f , team_size );
+ char * scratch = NULL;
+ char * shared = (char *)rocm_device_allocate(shared_size * league_size +
+ scratch_size0*league_size);
+ if(0 flat_extent( total_size );
+
+ hc::tiled_extent< 1 > team_extent = flat_extent.tile(team_size*vector_length);
+ hc::parallel_for_each( team_extent , [=](hc::tiled_index<1> idx) [[hc]]
+ {
+ rocm_invoke(f, typename Policy::member_type(idx, league_size, team_size, shared, shared_size, scratch_size0, scratch, scratch_size1,vector_length));
+ }).wait();
+
+ if(0
+class ParallelReduce<
+ FunctorType , Kokkos::RangePolicy< Traits... >, ReducerType, Kokkos::Experimental::ROCm >
+{
+public:
+
+ typedef Kokkos::RangePolicy< Traits... > Policy ;
+
+ // TODO: Use generic lambdas instead
+  // Per-work-item adaptor for the range reduction: calls f(global index, xs...)
+  // for work items whose global index is below `size`; items at or beyond
+  // `size` do nothing. `td` is accepted but not used.
+  struct invoke_fn
+  {
+    template<class F, class... Ts>
+    KOKKOS_INLINE_FUNCTION void operator()(std::size_t size, F&& f, hc::tiled_index<1> idx, tile_desc td, Ts&&... xs) const
+    {
+      auto global = idx.global[0];
+      if (global < size) f(idx.global[0], static_cast<Ts&&>(xs)...);
+    }
+  };
+
+  // Range reduction into a result view (no reducer object).
+  // Enabled only when ViewType is a Kokkos view and ReducerType is not a
+  // reducer type. An empty range only initializes the result (when the view
+  // has data); otherwise the work is handed to reduce_enqueue with the range
+  // length bound as invoke_fn's first argument.
+  // NOTE(review): only the range length is passed on — policy.begin() is not
+  // added to the index in invoke_fn; confirm non-zero range starts are
+  // handled elsewhere.
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType & f,
+                  const Policy & policy,
+                  const ViewType & result_view,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
+  {
+    typedef typename Policy::work_tag Tag;
+    typedef Kokkos::Impl::FunctorValueTraits< FunctorType , Tag > ValueTraits;
+    typedef Kokkos::Impl::FunctorValueInit< FunctorType , Tag > ValueInit;
+    typedef typename ValueTraits::reference_type reference_type;
+
+    const auto total_size = policy.end() - policy.begin();
+
+    if(total_size==0) {
+      if (result_view.data()) {
+        ValueInit::init( f , result_view.data() );
+      }
+      return;
+    }
+
+    Kokkos::Impl::reduce_enqueue< Tag >
+      ( total_size
+      , f
+      , InvalidType{}
+      , rocm_capture(invoke_fn{}, total_size)
+      , result_view.data()
+      , result_view.extent(0)
+      );
+  }
+
+  // Range reduction through a reducer object; the result is written to
+  // reducer.view(). An empty range only initializes the result (when the
+  // view has data); otherwise the work is handed to reduce_enqueue with the
+  // range length bound as invoke_fn's first argument.
+  inline
+  ParallelReduce( const FunctorType & f,
+                  Policy policy,
+                  const ReducerType& reducer )
+  {
+    typedef typename Policy::work_tag Tag;
+
+    // Selects the functor when ReducerType is InvalidType, else the reducer.
+    typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+    typedef typename ReducerConditional::type ReducerTypeFwd;
+    typedef Kokkos::Impl::FunctorValueTraits< FunctorType , Tag > ValueTraits;
+    // Initialize with the type that ReducerConditional::select() returns, so
+    // the init call below always matches its argument.
+    typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, Tag > ValueInit ;
+
+    typedef typename ValueTraits::reference_type reference_type;
+
+    const auto total_size = policy.end() - policy.begin();
+
+    if(total_size==0) {
+      if (reducer.view().data()) {
+        ValueInit::init( ReducerConditional::select(f,reducer),
+                         reducer.view().data() );
+      }
+      return;
+    }
+
+    Kokkos::Impl::reduce_enqueue< Tag >
+      ( total_size
+      , f
+      , reducer
+      , rocm_capture(invoke_fn{}, total_size)
+      , reducer.view().data()
+      , reducer.view().extent(0)
+      );
+  }
+
+ KOKKOS_INLINE_FUNCTION
+ void execute() const {}
+
+};
+
+template< class FunctorType, class ReducerType, class... Traits >
+class ParallelReduce<
+ FunctorType , Kokkos::TeamPolicy< Traits... >, ReducerType, Kokkos::Experimental::ROCm >
+{
+ using Policy = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::ROCm, Traits... >;
+ typedef Kokkos::Impl::FunctorValueTraits ValueTraits;
+
+public:
+
+ struct invoke_fn // call wrapper that this class passes to rocm_capture together with a team-member factory
+ {
+ template
+ KOKKOS_INLINE_FUNCTION void operator()(Create&& create, F&& f, hc::tiled_index<1> idx, tile_desc td, Ts&&... xs) const
+ {
+ f(create(idx, td), static_cast(xs)...); // first argument of f is whatever create(idx, td) returns; xs follow
+ }
+ };
+
+ template< class ViewType >
+ inline
+ ParallelReduce( const FunctorType & f,
+ const Policy & policy,
+ const ViewType & result_view,
+ typename std::enable_if<
+ Kokkos::is_view< ViewType >::value &&
+ !Kokkos::is_reducer_type::value
+ ,void*>::type = NULL)
+ {
+ const int league_size = policy.league_size();
+ const int team_size = policy.team_size(f);
+ const int vector_length = policy.vector_length();
+ const int scratch_size0 = policy.scratch_size(0,team_size);
+ const int scratch_size1 = policy.scratch_size(1,team_size);
+ const int total_size = league_size * team_size ;
+
+ if(total_size == 0) return;
+
+ const int reduce_size = ValueTraits::value_size( f );
+ const int shared_size = FunctorTeamShmemSize< FunctorType >::value( f , team_size );
+
+ char * shared;
+ char * scratch = NULL;
+
+ shared = (char *)rocm_device_allocate(league_size *
+ (shared_size + scratch_size0));
+ if(0 idx, tile_desc td)
+ {
+
+ return typename Policy::member_type(idx, league_size, td.team_size,
+ shared, shared_size, scratch_size0,
+ scratch, scratch_size1,
+ vector_length);
+ };
+
+ Kokkos::Impl::reduce_enqueue< typename Policy::work_tag >
+ ( total_size*vector_length
+ , f
+ , InvalidType{}
+ , rocm_capture(invoke_fn{}, create_team_member)
+ , result_view.ptr_on_device()
+ , result_view.dimension_0()
+ , team_size
+ , vector_length
+ , shared_size
+ );
+
+ if(0::value( f , team_size );
+ const int scratch_size0 = policy.scratch_size(0,team_size);
+ const int scratch_size1 = policy.scratch_size(1,team_size);
+
+ char * shared;
+ char * scratch = NULL;
+ shared = (char *)rocm_device_allocate((shared_size + scratch_size0) *
+ league_size);
+ if(0 idx, tile_desc td)
+ {
+ return typename Policy::member_type(idx, league_size, td.tile_size, shared, shared_size, scratch_size0, scratch, scratch_size1, vector_length);
+ };
+
+ Kokkos::Impl::reduce_enqueue< typename Policy::work_tag >
+ ( league_size
+ , f
+ , reducer
+ , rocm_capture(invoke_fn{}, create_team_member)
+ , reducer.view().data()
+ , reducer.view().extent(0),team_size,vector_length
+ , shared_size
+ );
+
+ if(0
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Traits... >, Kokkos::Experimental::ROCm > // ParallelScan specialization for RangePolicy on the ROCm execution space
+{
+private:
+
+ typedef Kokkos::RangePolicy< Traits... > Policy;
+ typedef typename Policy::work_tag Tag;
+ typedef Kokkos::Impl::FunctorValueTraits< FunctorType, Tag> ValueTraits;
+
+public:
+
+ //----------------------------------------
+
+ inline
+ ParallelScan( const FunctorType & f
+ , const Policy & policy ) // the constructor itself calls scan_enqueue
+ {
+ const auto len = policy.end()-policy.begin(); // extent of the policy range
+
+
+ if(len==0) return; // empty range: nothing is enqueued
+
+ scan_enqueue(len, f, [](hc::tiled_index<1> idx, int, int) { return idx.global[0]; }); // the lambda yields the global index and ignores its two int arguments
+ }
+
+ KOKKOS_INLINE_FUNCTION
+ void execute() const {} // empty body: scan_enqueue is already called from the constructor
+
+ //----------------------------------------
+};
+
+template< class FunctorType , class... Traits>
+class ParallelScan< FunctorType , Kokkos::TeamPolicy< Traits... >, Kokkos::Experimental::ROCm >
+{
+private:
+
+  // Internal ROCm team policy and the tag / value traits derived from it.
+  using Policy      = Kokkos::Impl::TeamPolicyInternal< Kokkos::Experimental::ROCm, Traits... >;
+  using Tag         = typename Policy::work_tag;
+  using ValueTraits = Kokkos::Impl::FunctorValueTraits< FunctorType, Tag>;
+
+public:
+
+  //----------------------------------------
+
+  /** \brief Team-policy scan: the constructor calls scan_enqueue with
+   *  league_size * team_size elements and a factory that builds a
+   *  Policy::member_type for each tiled index. Nothing is enqueued when
+   *  that product is zero. */
+  inline
+  ParallelScan( const FunctorType & f
+              , const Policy & policy )
+  {
+    const auto n_league = policy.league_size();
+    const auto n_team   = policy.team_size(f);
+    const auto total    = n_league * n_team;
+
+    if (total != 0) {
+      // The factory receives (index, team count, league count) and hands the
+      // two counts to member_type in the opposite order.
+      scan_enqueue(total, f,
+                   [&](hc::tiled_index<1> tile_idx, int team_count, int league_count) {
+                     return typename Policy::member_type(tile_idx, league_count, team_count);
+                   });
+    }
+  }
+
+  /// No-op: the constructor has already called scan_enqueue.
+  KOKKOS_INLINE_FUNCTION
+  void execute() const {}
+
+  //----------------------------------------
+};
+
+}
+}
+
+namespace Kokkos {
+namespace Impl {
+ template
+ struct TeamThreadRangeBoundariesStruct { // loop bounds (start / end / increment) together with the team member they were built for
+ typedef iType index_type;
+ const iType start;
+ const iType end;
+ const iType increment;
+ const ROCmTeamMember& thread; // stored by reference, not copied
+
+#if defined( __HCC_ACCELERATOR__ ) // accelerator compile: start at the caller's team rank and step by the team size
+ KOKKOS_INLINE_FUNCTION
+ TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+ start( thread_.team_rank() ),
+ end( count ),
+ increment( thread_.team_size() ),
+ thread(thread_)
+ {}
+ KOKKOS_INLINE_FUNCTION
+ TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& begin_, const iType& end_):
+ start( begin_ + thread_.team_rank() ), // same as above, offset by begin_
+ end( end_ ),
+ increment( thread_.team_size() ),
+ thread(thread_)
+ {}
+#else // any other compile: cover the whole range with increment 1
+ KOKKOS_INLINE_FUNCTION
+ TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+ start( 0 ),
+ end( count ),
+ increment( 1 ),
+ thread(thread_)
+ {}
+ KOKKOS_INLINE_FUNCTION
+ TeamThreadRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& begin_, const iType& end_):
+ start( begin_ ),
+ end( end_ ),
+ increment( 1 ),
+ thread(thread_)
+ {}
+#endif
+ };
+ template
+ struct ThreadVectorRangeBoundariesStruct { // loop bounds (start / end / increment) together with the team member they were built for
+ typedef iType index_type;
+ const iType start;
+ const iType end;
+ const iType increment;
+ const ROCmTeamMember& thread; // stored by reference, not copied
+
+#if defined( __HCC_ACCELERATOR__ ) // accelerator compile: start at lindex() modulo vector_length() and step by vector_length()
+ KOKKOS_INLINE_FUNCTION
+ ThreadVectorRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+ start( thread_.lindex()%thread_.vector_length() ),
+ end( count ),
+ increment( thread_.vector_length() ),
+ thread(thread_)
+ {}
+
+// KOKKOS_INLINE_FUNCTION
+// ThreadVectorRangeBoundariesStruct (const iType& count):
+// start( 0 ),
+// end( count ),
+// increment( 1 )
+// {}
+#else // any other compile: cover the whole range with increment 1
+ KOKKOS_INLINE_FUNCTION
+ ThreadVectorRangeBoundariesStruct (const ROCmTeamMember& thread_, const iType& count):
+ start( 0 ),
+ end( count ),
+ increment( 1 ),
+ thread(thread_)
+ {}
+ KOKKOS_INLINE_FUNCTION
+ ThreadVectorRangeBoundariesStruct (const iType& count): // NOTE(review): no initializer for the reference member `thread` here; the accelerator branch has this overload commented out — confirm it can be instantiated or remove it
+ start( 0 ),
+ end( count ),
+ increment( 1 )
+ {}
+#endif
+ };
+
+}
+}
+
+namespace Kokkos {
+
+template
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct
+ TeamThreadRange(const Impl::ROCmTeamMember& thread, const iType& count) { // builds the team-thread bounds object from a team member and an element count
+ return Impl::TeamThreadRangeBoundariesStruct(thread,count);
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct::type,Impl::ROCmTeamMember>
+ TeamThreadRange(const Impl::ROCmTeamMember& thread, const iType1& begin, const iType2& end) { // begin/end form; the two index types may differ
+ typedef typename std::common_type< iType1, iType2 >::type iType; // common type of the two index types
+ return Impl::TeamThreadRangeBoundariesStruct(thread,begin,end);
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct
+ ThreadVectorRange(const Impl::ROCmTeamMember& thread, const iType& count) { // builds the thread-vector bounds object from a team member and an element count
+ return Impl::ThreadVectorRangeBoundariesStruct(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct PerTeam(const Impl::ROCmTeamMember& thread) { // wraps the team member in a ThreadSingleStruct, the argument type of the ThreadSingleStruct single() overloads
+ return Impl::ThreadSingleStruct(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct PerThread(const Impl::ROCmTeamMember& thread) { // wraps the team member in a VectorSingleStruct, the argument type of the VectorSingleStruct single() overloads
+ return Impl::VectorSingleStruct(thread);
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct& single_struct, const FunctorType& lambda) { // runs lambda() only for the caller whose vector_rank() is 0
+ if(single_struct.team_member.vector_rank()==0) lambda();
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct& single_struct, const FunctorType& lambda) { // runs lambda() only for the caller whose lindex() is 0
+ if((single_struct.team_member.lindex()==0)) lambda();
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct& single_struct, const FunctorType& lambda, ValueType& val) { // value-returning variant: lambda writes into val
+#if defined(ROCM15)
+ // ROCm 1.5 needs this stricter restriction on which work units run:
+ // only vector rank 0 calls lambda, then val is overwritten with shfl(val,0,vector_length()).
+ if( single_struct.team_member.vector_rank()==0) lambda(val);
+ val = shfl(val,0,single_struct.team_member.vector_length());
+#else
+ // Older compilers are fine with every caller running lambda and no shuffle afterwards
+ // (TestTeamVector::Test< Kokkos::Experimental::ROCm >(4)).
+ lambda(val);
+#endif
+}
+
+template
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct& single_struct, const FunctorType& lambda, ValueType& val) { // value-returning variant: lambda writes into val
+ if(single_struct.team_member.lindex()==0) lambda(val); // only the caller with lindex() 0 runs lambda
+ single_struct.team_member.team_broadcast(val,0); // every caller then takes part in team_broadcast(val,0)
+}
+
+}
+
+namespace Kokkos {
+
+ /** \brief Inter-thread parallel_for. Executes lambda(iType i) for each index i assigned to the caller.
+ *
+ * The range described by loop_boundaries is mapped to all threads of the calling thread team:
+ * each caller visits i = start, start+increment, ... while i < end.
+ * This functionality requires C++11 support.*/
+template
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, const Lambda& lambda) {
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+ lambda(i);
+}
+
+/** \brief Inter-thread thread range parallel_reduce (sum). Executes lambda(iType i, ValueType & val) for each
+ * index i assigned to the caller.
+ *
+ * The range described by loop_boundaries is mapped to all threads of the calling thread team. result is
+ * first reset to ValueType(); every iteration hands lambda a value-initialized temporary that is added
+ * to result with +=; finally result is replaced by the value returned from thread.team_reduce(result, JoinAdd).
+ * Any value result held on entry is discarded. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries,
+ const Lambda & lambda, ValueType& result) {
+
+ result = ValueType(); // discard the incoming value
+
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+ ValueType tmp = ValueType();
+ lambda(i,tmp);
+ result+=tmp;
+ }
+ result = loop_boundaries.thread.team_reduce(result,
+ Impl::JoinAdd());
+// Impl::rocm_intra_workgroup_reduction( loop_boundaries.thread, result,
+// Impl::JoinAdd());
+// Impl::rocm_inter_workgroup_reduction( loop_boundaries.thread, result,
+// Impl::JoinAdd());
+}
+
+/** \brief Inter-thread thread range parallel_reduce with a caller-supplied join. Executes
+ * lambda(iType i, ValueType & val) for each index i assigned to the caller.
+ *
+ * The range described by loop_boundaries is mapped to all threads of the calling thread team: each caller
+ * visits i = start, start+increment, ... while i < end. The per-caller value is then passed to
+ * thread.team_reduce(..., join) and the value it returns is stored in result.
+ * With ROCM15 defined, the incoming value of result seeds the accumulator handed to lambda; otherwise each
+ * iteration gets a value-initialized temporary that is merged into result via join(result,tmp).
+ * In both cases the input value of result takes part in the reduction, so it should be the neutral
+ * element with respect to the join operation (e.g. '0 for +-' or '1 for *').
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries,
+ const Lambda & lambda, const JoinType& join, ValueType& result) {
+
+#if defined(ROCM15)
+ ValueType tmp = result; // accumulator starts from the caller-provided value
+ // Simpler code works with ROCM1.5
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+ lambda(i,tmp);
+ }
+ result = loop_boundaries.thread.team_reduce(tmp,join);
+#else
+ // this workaround freezes up with ROCM1.5, but needed for earlier compilers
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+ ValueType tmp = ValueType();
+ lambda(i,tmp);
+ join(result,tmp);
+ }
+ result = loop_boundaries.thread.team_reduce(result,join);
+#endif
+// Impl::rocm_intra_workgroup_reduction( loop_boundaries.thread, result,join);
+// Impl::rocm_inter_workgroup_reduction( loop_boundaries.thread, result,join);
+}
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each index i assigned to the caller.
+ *
+ * The range described by loop_boundaries is mapped to all vector lanes of the calling thread:
+ * each caller visits i = start, start+increment, ... while i < end.
+ * This functionality requires C++11 support.*/
+template
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct&
+ loop_boundaries, const Lambda& lambda) {
+
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+ lambda(i);
+}
+
+/** \brief Intra-thread vector parallel_reduce (sum). Executes lambda(iType i, ValueType & val) for each
+ * index i assigned to the caller.
+ *
+ * The range described by loop_boundaries is mapped to all vector lanes of the calling thread. result is
+ * first reset to ValueType(); every iteration hands lambda a value-initialized temporary that is added
+ * to result with +=; finally result is replaced by the value returned from thread.thread_reduce(result, JoinAdd).
+ * Any value result held on entry is discarded. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct&
+ loop_boundaries, const Lambda & lambda, ValueType& result) {
+ result = ValueType(); // discard the incoming value
+
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+ ValueType tmp = ValueType();
+ lambda(i,tmp);
+ result+=tmp;
+ }
+ result = loop_boundaries.thread.thread_reduce(result,Impl::JoinAdd());
+}
+
+/** \brief Intra-thread vector parallel_reduce with a caller-supplied join. Executes
+ * lambda(iType i, ValueType & val) for each index i assigned to the caller.
+ *
+ * The range described by loop_boundaries is mapped to all vector lanes of the calling thread. lambda
+ * receives result itself (no temporary is created), thread.team_barrier() is called after every
+ * iteration, and finally result is replaced by the value returned from thread.thread_reduce(result, join).
+ * Because lambda accumulates directly into result, the input value of result should be the neutral
+ * element with respect to the join operation (e.g. '0 for +-' or '1 for *').
+ * NOTE(review): callers whose start differs can run a different number of iterations and therefore a
+ * different number of team_barrier() calls — confirm this cannot stall.
+ * This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct&
+ loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result) {
+
+ for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+ lambda(i,result);
+ loop_boundaries.thread.team_barrier();
+ }
+ result = loop_boundaries.thread.thread_reduce(result,join);
+}
+
+/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ * for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct&
+ loop_boundaries, const FunctorType & lambda) {
+
+ typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+ typedef typename ValueTraits::value_type value_type ;
+
+ value_type scan_val = value_type();
+#if (__ROCM_ARCH__ >= 800)
+// adopt the cuda vector shuffle method
+ const int VectorLength = loop_boundaries.increment;
+ int lid = loop_boundaries.thread.lindex();
+ int vector_rank = lid%VectorLength;
+
+ iType loop_bound = ((loop_boundaries.end+VectorLength-1)/VectorLength) * VectorLength;
+ value_type val ;
+ for(int _i = vector_rank; _i < loop_bound; _i += VectorLength) {
+ val = value_type();
+ if(_i 1) {
+ const value_type tmp2 = shfl_up(tmp, 1,VectorLength);
+ if(vector_rank > 0)
+ tmp+=tmp2;
+ }
+ if(vector_rank == 1)
+ result_i = tmp;
+ if (VectorLength > 3) {
+ const value_type tmp2 = shfl_up(tmp, 2,VectorLength);
+ if(vector_rank > 1)
+ tmp+=tmp2;
+ }
+ if ((vector_rank >= 2) &&
+ (vector_rank < 4))
+ result_i = tmp;
+ if (VectorLength > 7) {
+ const value_type tmp2 = shfl_up(tmp, 4,VectorLength);
+ if(vector_rank > 3)
+ tmp+=tmp2;
+ }
+ if ((vector_rank >= 4) &&
+ (vector_rank < 8))
+ result_i = tmp;
+ if (VectorLength > 15) {
+ const value_type tmp2 = shfl_up(tmp, 8,VectorLength);
+ if(vector_rank > 7)
+ tmp+=tmp2;
+ }
+ if ((vector_rank >= 8) &&
+ (vector_rank < 16))
+ result_i = tmp;
+ if (VectorLength > 31) {
+ const value_type tmp2 = shfl_up(tmp, 16,VectorLength);
+ if(vector_rank > 15)
+ tmp+=tmp2;
+ }
+ if ((vector_rank >=16) &&
+ (vector_rank < 32))
+ result_i = tmp;
+ if (VectorLength > 63) {
+ const value_type tmp2 = shfl_up(tmp, 32,VectorLength);
+ if(vector_rank > 31)
+ tmp+=tmp2;
+ }
+
+ if (vector_rank >= 32)
+ result_i = tmp;
+
+ val = scan_val + result_i - val;
+ scan_val += shfl(tmp,VectorLength-1,VectorLength);
+ if(_i
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+
+namespace Kokkos {
+namespace Impl {
+
+template
+T* reduce_value(T* x, std::true_type) [[hc]] // std::true_type overload: hands the pointer back unchanged
+{
+ return x;
+}
+
+template
+T& reduce_value(T* x, std::false_type) [[hc]] // std::false_type overload: dereferences the pointer and returns a reference
+{
+ return *x;
+}
+
+#if KOKKOS_ROCM_HAS_WORKAROUNDS // the struct below is only compiled when this macro is non-zero
+struct always_true // callable that accepts any arguments and always returns true
+{
+ template
+ bool operator()(Ts&&...) const // arguments are unnamed and ignored
+ {
+ return true;
+ }
+};
+#endif
+
+template< class Tag, class F, class ReducerType, class Invoker, class T >
+void reduce_enqueue(
+ const int szElements, // size of the extent
+ const F & f,
+ const ReducerType& reducer,
+ Invoker invoke,
+ T * const output_result,
+ int const output_length,
+ const int team_size=64,
+ const int vector_size=1,
+ int const shared_size=0)
+{
+ using namespace hc ;
+
+ typedef Kokkos::Impl::if_c< std::is_same::value, F, ReducerType> ReducerConditional;
+ typedef typename ReducerConditional::type ReducerTypeFwd;
+
+ typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , Tag > ValueTraits ;
+ typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , Tag > ValueInit ;
+ typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , Tag > ValueJoin ;
+ typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , Tag > ValueFinal ;
+
+ typedef typename ValueTraits::pointer_type pointer_type ;
+ typedef typename ValueTraits::reference_type reference_type ;
+
+ if (output_length < 1) return;
+
+ assert(output_result != nullptr);
+ const auto td = get_tile_desc(szElements,output_length,team_size,vector_size, shared_size);
+
+ // allocate host and device memory for the results from each team
+ std::vector result_cpu(td.num_tiles*output_length);
+ hc::array result(td.num_tiles*output_length);
+
+ auto fut = tile_for(td, [=,&result](hc::tiled_index<1> t_idx, tile_buffer buffer) [[hc]]
+ {
+ const auto local = t_idx.local[0];
+ const auto global = t_idx.global[0];
+ const auto tile = t_idx.tile[0];
+
+ buffer.action_at(local, [&](T* state)
+ {
+ ValueInit::init(ReducerConditional::select(f, reducer), state);
+ invoke(make_rocm_invoke_fn(f), t_idx, td, reduce_value(state, std::is_pointer()));
+ });
+ t_idx.barrier.wait();
+
+ // Reduce within a tile using multiple threads.
+// even though buffer.size is always 64, the value 64 must be hard coded below
+// due to a compiler bug
+// for(std::size_t s = 1; s < buffer.size(); s *= 2)
+ for(std::size_t s = 1; s < 64; s *= 2)
+ {
+ const std::size_t index = 2 * s * local;
+// if (index < buffer.size())
+ if (index < 64)
+ {
+ buffer.action_at(index, index + s, [&](T* x, T* y)
+ {
+ ValueJoin::join(ReducerConditional::select(f, reducer), x, y);
+ });
+ }
+ t_idx.barrier.wait();
+ }
+
+ // Store the tile result in the global memory.
+ if (local == 0)
+ {
+#if KOKKOS_ROCM_HAS_WORKAROUNDS
+ // Workaround for assigning from LDS memory: std::copy should work
+ // directly
+ buffer.action_at(0, [&](T* x)
+ {
+#if ROCM15
+// new ROCM 15 address space changes aren't implemented in std algorithms yet
+ auto * src = reinterpret_cast(x);
+ auto * dest = reinterpret_cast(result.data()+tile*output_length);
+ for(int i=0; i
+
+/* only compile this file if ROCM is enabled for Kokkos */
+#if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
+
+//#include
+
+#include
+#include
+#include
+#include
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl( T & out , T const & in , int lane ,
+ typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width ) // overload enabled only when T has the same size as int
+{
+ *reinterpret_cast(&out) = // one __shfl call; its result is stored through the cast of &out
+ __shfl( *reinterpret_cast(&in) , lane , width );
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl( T & out , T const & in , int lane ,
+ typename std::enable_if
+ < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+ , int >::type width ) // overload enabled only when T is larger than int and its size is a multiple of sizeof(int)
+{
+ enum : int { N = sizeof(T) / sizeof(int) }; // number of int-sized pieces in T
+
+ for ( int i = 0 ; i < N ; ++i ) { // one __shfl call per piece, same lane and width each time
+ reinterpret_cast(&out)[i] =
+ __shfl( reinterpret_cast(&in)[i] , lane , width );
+ }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_down( T & out , T const & in , int delta ,
+ typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width ) // overload enabled only when T has the same size as int
+{
+ *reinterpret_cast(&out) = // one __shfl_down call; its result is stored through the cast of &out
+ __shfl_down( *reinterpret_cast(&in) , delta , width );
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_down( T & out , T const & in , int delta ,
+ typename std::enable_if
+ < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+ , int >::type width ) // overload enabled only when T is larger than int and its size is a multiple of sizeof(int)
+{
+ enum : int { N = sizeof(T) / sizeof(int) }; // number of int-sized pieces in T
+
+ for ( int i = 0 ; i < N ; ++i ) { // one __shfl_down call per piece, same delta and width each time
+ reinterpret_cast(&out)[i] =
+ __shfl_down( reinterpret_cast(&in)[i] , delta , width );
+ }
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_up( T & out , T const & in , int delta ,
+ typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width ) // overload enabled only when T has the same size as int
+{
+ *reinterpret_cast(&out) = // one __shfl_up call; its result is stored through the cast of &out
+ __shfl_up( *reinterpret_cast(&in) , delta , width );
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void rocm_shfl_up( T & out , T const & in , int delta ,
+ typename std::enable_if
+ < ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
+ , int >::type width )
+{
+ enum : int { N = sizeof(T) / sizeof(int) };
+
+ for ( int i = 0 ; i < N ; ++i ) {
+ reinterpret_cast